youtube-dl/youtube_dl/extractor/bloomberg.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor


class BloombergIE(InfoExtractor):
    """Extractor for videos embedded in bloomberg.com pages.

    The last path component of the URL is used as a display name while the
    page is downloaded; the real video id is then read from the page markup
    and used to query the embed API for the available streams.
    """

    _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'

    _TESTS = [{
        'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
        # The md5 checksum changes
        'info_dict': {
            'id': 'qurhIVlJSB6hzkVi229d8g',
            'ext': 'flv',
            'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
            'description': 'md5:a8ba0302912d03d246979735c17d2761',
        },
        'params': {
            'format': 'best[format_id^=hds]',
        },
    }, {
        # video ID in BPlayer(...)
        'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/',
        'info_dict': {
            'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74',
            'ext': 'flv',
            'title': 'Meet the Real-Life Tech Wizards of Middle Earth',
            'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.',
        },
        'params': {
            'format': 'best[format_id^=hds]',
        },
    }, {
        # data-bmmrid=
        'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money',
        'only_matching': True,
    }, {
        'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
        'only_matching': True,
    }, {
        'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict (id, title, formats, description, thumbnail) for url."""
        name = self._match_id(url)
        webpage = self._download_webpage(url, name)

        # The video id may appear in the page in several forms; the patterns
        # are passed together and default=None keeps a miss non-fatal so the
        # BPlayer(...) fallback below can run.
        video_id = self._search_regex(
            (r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
             r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
             r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1'),
            webpage, 'id', group='id', default=None)
        if not video_id:
            # Fallback: the id is a member of the JSON object passed as the
            # second argument of BPlayer(null, {...});
            bplayer_data = self._parse_json(self._search_regex(
                r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name)
            video_id = bplayer_data['id']

        # Drop the trailing ': Video' from the Open Graph title, if present.
        title = re.sub(r': Video$', '', self._og_search_title(webpage))

        embed_info = self._download_json(
            'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)
        formats = []
        for stream in embed_info['streams']:
            stream_url = stream.get('url')
            if not stream_url:
                continue
            # Read 'muxing_format' tolerantly: an entry without the key must
            # not abort the whole extraction. It is handled like any other
            # non-TS stream (HDS manifest, non-fatal on failure).
            if stream.get('muxing_format') == 'TS':
                formats.extend(self._extract_m3u8_formats(
                    stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
            else:
                formats.extend(self._extract_f4m_formats(
                    stream_url, video_id, f4m_id='hds', fatal=False))
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': self._og_search_description(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
        }
-												[bloomberg] Support BPlayer() players (closes #10187)

											
										
										
											8 years ago
+								# coding: utf-8
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											10 years ago
+								from __future__ import unicode_literals
-												Add an extractor for Bloomberg (closes #1436)

											
										
										
											11 years ago
+								import re
 								from .common import InfoExtractor
 								class BloombergIE(InfoExtractor):
-												[bloomberg] Relax _VALID_URL even more (Closes #7685)

											
										
										
											9 years ago
+								    _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
-												Add an extractor for Bloomberg (closes #1436)

											
										
										
											11 years ago
-												[bloomberg] Relax _VALID_URL (Closes #7546)

											
										
										
											9 years ago
+								    _TESTS = [{
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											9 years ago
+								        'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
-												[bloomberg] Extract the available formats (closes #2776)

It uses a helper method in the InfoExtractor class.
The downloader will pick the requested formats using the bitrate in the info dict.

											
										
										
											10 years ago
+								        # The md5 checksum changes
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											10 years ago
+								        'info_dict': {
 								            'id': 'qurhIVlJSB6hzkVi229d8g',
 								            'ext': 'flv',
 								            'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											9 years ago
+								            'description': 'md5:a8ba0302912d03d246979735c17d2761',
-												Add an extractor for Bloomberg (closes #1436)

											
										
										
											11 years ago
+								        },
-												[bloomberg] Fix test_Bloomberg

In this test case, HLS is sometimes the best format and HDS is at other
times. To prevent occasional test failures, force HDS to be the best
format. In the past, testing against HDS formats caused the same error
as #9214, which was fixed when #9377 landed.

											
										
										
											8 years ago
+								        'params': {
 								            'format': 'best[format_id^=hds]',
 								        },
-												[bloomberg] Support BPlayer() players (closes #10187)

											
										
										
											8 years ago
+								    }, {
 								        # video ID in BPlayer(...)
 								        'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/',
 								        'info_dict': {
 								            'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74',
 								            'ext': 'flv',
 								            'title': 'Meet the Real-Life Tech Wizards of Middle Earth',
 								            'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.',
 								        },
 								        'params': {
 								            'format': 'best[format_id^=hds]',
 								        },
-												[bloomberg] Add another video id regex (closes #12062)

											
										
										
											7 years ago
+								    }, {
 								        # data-bmmrid=
 								        'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money',
 								        'only_matching': True,
-												[bloomberg] Relax _VALID_URL (Closes #7546)

											
										
										
											9 years ago
+								    }, {
 								        'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
 								        'only_matching': True,
-												[bloomberg] Relax _VALID_URL even more (Closes #7685)

											
										
										
											9 years ago
+								    }, {
 								        'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump',
 								        'only_matching': True,
-												[bloomberg] Relax _VALID_URL (Closes #7546)

											
										
										
											9 years ago
+								    }]
-												Add an extractor for Bloomberg (closes #1436)

											
										
										
											11 years ago
 								    def _real_extract(self, url):
-												[bloomberg] Modernize

											
										
										
											9 years ago
+								        name = self._match_id(url)
-												Add an extractor for Bloomberg (closes #1436)

											
										
										
											11 years ago
+								        webpage = self._download_webpage(url, name)
-												[bloomberg] Improve video id regex

											
										
										
											9 years ago
+								        video_id = self._search_regex(
-												[bloomberg] Add another video id regex (closes #12062)

											
										
										
											7 years ago
+								            (r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
 								             r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
 								             r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1'),
 								            webpage, 'id', group='id', default=None)
-												[bloomberg] Support BPlayer() players (closes #10187)

											
										
										
											8 years ago
+								        if not video_id:
 								            bplayer_data = self._parse_json(self._search_regex(
 								                r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name)
 								            video_id = bplayer_data['id']
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											10 years ago
+								        title = re.sub(': Video$', '', self._og_search_title(webpage))
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											9 years ago
+								        embed_info = self._download_json(
 								            'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)
 								        formats = []
 								        for stream in embed_info['streams']:
-												[bloomberg] Improve formats extraction

											
										
										
											9 years ago
+								            stream_url = stream.get('url')
 								            if not stream_url:
 								                continue
-												[bloomberg] Modernize

											
										
										
											9 years ago
+								            if stream['muxing_format'] == 'TS':
-												Simplify formats accumulation for f4m/m3u8/smil formats

Now all _extract_*_formats routines return a list

											
										
										
											8 years ago
+								                formats.extend(self._extract_m3u8_formats(
 								                    stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											9 years ago
+								            else:
-												Simplify formats accumulation for f4m/m3u8/smil formats

Now all _extract_*_formats routines return a list

											
										
										
											8 years ago
+								                formats.extend(self._extract_f4m_formats(
 								                    stream_url, video_id, f4m_id='hds', fatal=False))
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											9 years ago
+								        self._sort_formats(formats)
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											10 years ago
+								        return {
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											9 years ago
+								            'id': video_id,
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											10 years ago
+								            'title': title,
-												[bloomberg] Adapt to website changes (fixes #5347)

											
										
										
											9 years ago
+								            'formats': formats,
-												[bloomberg] Fix extraction (fixes #2154)

Stop using the OoyalaIE, extract the f4m url instead.

											
										
										
											10 years ago
+								            'description': self._og_search_description(webpage),
 								            'thumbnail': self._og_search_thumbnail(webpage),
 								        }