From ab25f3f43196ca56964ba34ba4674fcb2d08f69a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 3 Feb 2021 17:15:31 +0100 Subject: [PATCH 001/242] [youtube] pass embed URL to get_video_info request --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 75a007353..42b0f452c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1397,6 +1397,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Refetching age-gated info webpage', 'unable to download video info webpage', query={ 'video_id': video_id, + 'eurl': 'https://www.youtube.com/embed/' + video_id, }, fatal=False)), lambda x: x['player_response'][0], compat_str) or '{}', video_id) From 1b731ebcaa3ef2a1e52cf6968cf93e08d50fe0d4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 3 Feb 2021 18:13:17 +0100 Subject: [PATCH 002/242] [bravotv] add support for oxygen.com(closes #13357)(closes #22500) --- youtube_dl/extractor/bravotv.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index b9715df00..bae2aedce 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -12,7 +12,7 @@ from ..utils import ( class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', @@ -28,10 +28,13 @@ class BravoTVIE(AdobePassIE): }, { 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) settings = self._parse_json(self._search_regex( r']+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})', webpage, 'drupal settings'), @@ -53,11 +56,14 @@ class BravoTVIE(AdobePassIE): tp_path = release_pid = tve['release_pid'] if tve.get('entitlement') == 'auth': adobe_pass = settings.get('tve_adobe_auth', {}) + if site == 'bravotv': + site = 'bravo' resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'bravo'), + adobe_pass.get('adobePassResourceId') or site, tve['title'], release_pid, tve.get('rating')) query['auth'] = self._extract_mvpd_auth( - url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + url, release_pid, + adobe_pass.get('adobePassRequestorId') or site, resource) else: shared_playlist = settings['ls_playlist'] account_pid = shared_playlist['account_pid'] From 83031d749b11f062b9ba97023c228329e771cbd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 4 Feb 2021 00:25:26 +0700 Subject: [PATCH 003/242] [pornhub:user] Add support for URLs unavailable via /videos page and improve paging (closes #27853) --- youtube_dl/extractor/pornhub.py | 56 +++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 2fcbd186f..67e3731c8 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -22,6 +22,7 @@ from ..utils import ( orderedSet, remove_quotes, str_to_int, + update_url_query, url_or_none, ) @@ -405,6 +406,10 @@ class PornHubIE(PornHubBaseIE): class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_page(self, url): + return int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see @@ -463,14 +468,27 @@ class PornHubUserIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', 'only_matching': True, + }, { + # Unavailable via /videos page, but available with direct pagination + # on pornstar page (see [1]), requires premium + # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 + 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', + 'only_matching': True, + }, { + # Same as before, multi page + 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('id') + videos_url = '%s/videos' % mobj.group('url') + page = self._extract_page(url) + if page: + videos_url = update_url_query(videos_url, {'page': page}) return self.url_result( - '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(), - video_id=user_id) + videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): @@ -488,17 +506,37 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): host = mobj.group('host') item_id = mobj.group('id') - page = int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) + page = self._extract_page(url) + + VIDEOS = '/videos' + + def download_page(base_url, num): + note = 'Downloading %spage %d' % ('' if VIDEOS in base_url else 'fallback ', num) + return self._download_webpage( + base_url, item_id, note, query={'page': num}) + + def is_404(e): + return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 entries = [] - for page_num in (page, ) if page is not None else itertools.count(1): + base_url = url + has_page = page is not None + first_page = page if has_page else 1 + for page_num in (first_page, ) if has_page else itertools.count(first_page): try: - webpage = self._download_webpage( - url, item_id, 'Downloading page %d' % page_num, - query={'page': page_num}) + try: + webpage = download_page(base_url, page_num) + except ExtractorError as e: + # Some sources may not be available via /videos page, + # trying to fallback to main page pagination (see [1]) + # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 + if is_404(e) and page_num == first_page and VIDEOS in base_url: + base_url = base_url.replace(VIDEOS, '') + webpage = download_page(base_url, page_num) + else: + raise except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if is_404(e) and page_num != first_page: break raise page_entries = self._extract_entries(webpage, host) From e22ff4e35681a600ed61918beab8ed316728ec39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 4 Feb 2021 04:09:11 +0700 Subject: [PATCH 004/242] [pornhub] Add support for authentication (closes #18797, closes #21416, closes #24294) --- youtube_dl/extractor/pornhub.py | 106 +++++++++++++++++++++++--------- 1 file changed, 76 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 67e3731c8..83307a233 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -23,6 +23,7 @@ from ..utils import ( remove_quotes, str_to_int, update_url_query, + urlencode_postdata, url_or_none, ) @@ -53,6 +54,66 @@ class PornHubBaseIE(InfoExtractor): return webpage, urlh + def _real_initialize(self): + self._logged_in = False + + def _login(self, host): + if self._logged_in: + return + + site = host.split('.')[0] + + # Both sites pornhub and pornhubpremium have separate accounts + # so there should be an option to provide credentials for both. + # At the same time some videos are available under the same video id + # on both sites so that we have to identify them as the same video. + # For that purpose we have to keep both in the same extractor + # but under different netrc machines. + username, password = self._get_login_info(netrc_machine=site) + if username is None: + return + + login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') + login_page = self._download_webpage( + login_url, None, 'Downloading %s login page' % site) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']signOut', + r'>Sign\s+[Oo]ut\s*<')) + + if is_logged(login_page): + self._logged_in = True + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + response = self._download_json( + 'https://www.%s/front/authenticate' % host, None, + 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': login_url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + if response.get('success') == '1': + self._logged_in = True + return + + message = response.get('message') + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % message, expected=True) + + raise ExtractorError('Unable to log in') + class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' @@ -164,12 +225,20 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, + }, { + # Some videos are available with the same id on both premium + # and non-premium sites (e.g. this and the following test) + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)', + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -181,12 +250,7 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') - if 'premium' in host: - if not self._downloader.params.get('cookiefile'): - raise ExtractorError( - 'PornHub Premium requires authentication.' - ' You may want to use --cookies.', - expected=True) + self._login(host) self._set_cookie(host, 'age_verified', '1') @@ -427,26 +491,6 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): container)) ] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - playlist_id = mobj.group('id') - - webpage = self._download_webpage(url, playlist_id) - - entries = self._extract_entries(webpage, host) - - playlist = self._parse_json( - self._search_regex( - r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, - 'playlist', default='{}'), - playlist_id, fatal=False) - title = playlist.get('title') or self._search_regex( - r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) - - return self.playlist_result( - entries, playlist_id, title, playlist.get('description')) - class PornHubUserIE(PornHubPlaylistBaseIE): _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' @@ -506,12 +550,14 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): host = mobj.group('host') item_id = mobj.group('id') + self._login(host) + page = self._extract_page(url) VIDEOS = '/videos' - def download_page(base_url, num): - note = 'Downloading %spage %d' % ('' if VIDEOS in base_url else 'fallback ', num) + def download_page(base_url, num, fallback=False): + note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') return self._download_webpage( base_url, item_id, note, query={'page': num}) @@ -532,7 +578,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): # 1. https://github.com/ytdl-org/youtube-dl/issues/27853 if is_404(e) and page_num == first_page and VIDEOS in base_url: base_url = base_url.replace(VIDEOS, '') - webpage = download_page(base_url, page_num) + webpage = download_page(base_url, page_num, fallback=True) else: raise except ExtractorError as e: From 1f0910bc2742b16be8425841d5ed6a0fd96f82a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 4 Feb 2021 04:17:45 +0700 Subject: [PATCH 005/242] [svtplay] Fix video id extraction (closes #28058) --- youtube_dl/extractor/svt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index a0b6ef4db..4acc29fce 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -255,8 +255,10 @@ class SVTPlayIE(SVTPlayBaseIE): svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', - r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), + r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), webpage, 'video id') info_dict = self._extract_by_video_id(svt_id, webpage) From 2adc0c51cdf38e039fba0ede11f65bbd9c71bde8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 4 Feb 2021 04:20:09 +0700 Subject: [PATCH 006/242] [pornhub] Add placeholder netrc machine --- youtube_dl/extractor/pornhub.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 83307a233..83773aebb 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -29,6 +29,8 @@ from ..utils import ( class PornHubBaseIE(InfoExtractor): + _NETRC_MACHINE = 'pornhub' + def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) From 89c5a7d5aabd138a14c76453d79d5d66ef573bde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 4 Feb 2021 04:36:57 +0700 Subject: [PATCH 007/242] [pornhub] Implement lazy playlist extraction --- youtube_dl/extractor/pornhub.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 83773aebb..b7631e4e1 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -547,13 +547,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): ]+\bid=["\']moreDataBtn ''', webpage) is not None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - item_id = mobj.group('id') - - self._login(host) - + def _entries(self, url, host, item_id): page = self._extract_page(url) VIDEOS = '/videos' @@ -566,7 +560,6 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): def is_404(e): return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 - entries = [] base_url = url has_page = page is not None first_page = page if has_page else 1 @@ -590,11 +583,19 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): page_entries = self._extract_entries(webpage, host) if not page_entries: break - entries.extend(page_entries) + for e in page_entries: + yield e if not self._has_more(webpage): break - return self.playlist_result(orderedSet(entries), item_id) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + + return self.playlist_result(self._entries(url, host, item_id), item_id) class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): From 3c07d007ca5376719a0cfe6b9c6627b38cbd3e1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 4 Feb 2021 04:47:30 +0700 Subject: [PATCH 008/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/ChangeLog b/ChangeLog index 7f2e0aad1..bd753d524 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,45 @@ +version + +Extractors +* [pornhub] Implement lazy playlist extraction +* [svtplay] Fix video id extraction (#28058) ++ [pornhub] Add support for authentication (#18797, #21416, #24294) +* [pornhub:user] Improve paging ++ [pornhub:user] Add support for URLs unavailable via /videos page (#27853) ++ [bravotv] Add support for oxygen.com (#13357, #22500) ++ [youtube] Pass embed URL to get_video_info request +* [ccma] Improve metadata extraction (#27994) + + Extract age limit, alt title, categories, series and episode number + * Fix timestamp multiple subtitles extraction +* [egghead] Update API domain (#28038) +- [vidzi] Remove extractor (#12629) +* [vidio] Improve metadata extraction +* [youtube] Improve subtitles extraction +* [youtube] Fix chapter extraction fallback +* [youtube] Rewrite extractor + * Improve format sorting + * Remove unused code + * Fix series metadata extraction + * Fix trailer video extraction + * Improve error reporting + + Extract video location ++ [vvvvid] Add support for youtube embeds (#27825) +* [googledrive] Report download page errors (#28005) +* [vlive] Fix error message decoding for python 2 (#28004) +* [youtube] Improve DASH formats file size extraction +* [cda] Improve birth validation detection (#14022, #27929) ++ [awaan] Extract uploader id (#27963) ++ [medialaan] Add support DPG Media MyChannels based websites (#14871, #15597, + #16106, #16489) +* [abcnews] Fix extraction (#12394, #27920) +* [AMP] Fix upload date and timestamp extraction (#27970) +* [tv4] Relax URL regular expression (#27964) ++ [tv2] Add support for mtvuutiset.fi (#27744) +* [adn] Improve login warning reporting +* [zype] Fix uplynk id extraction (#27956) ++ [adn] Add support for authentication (#17091, #27841, #27937) + + version 2021.01.24.1 Core From cfefb7d854f87e02c971170fcfa08f3ff2cb1bfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 4 Feb 2021 04:49:25 +0700 Subject: [PATCH 009/242] release 2021.02.04 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 2dde97a2c..86e48bc4e 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24.1 + [debug] youtube-dl version 2021.02.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index c520d1ee0..fa369b744 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 4aacd3bdc..806c7c58d 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 91bbed506..1d1a36dda 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.01.24.1 + [debug] youtube-dl version 2021.02.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index a0a2c989a..c19052a7a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.01.24.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index bd753d524..d5d9c00a2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.02.04 Extractors * [pornhub] Implement lazy playlist extraction diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 13bac6e27..e1b85b1d1 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -537,6 +537,7 @@ - **mtv:video** - **mtvjapan** - **mtvservices:embedded** + - **MTVUutisetArticle** - **MuenchenTV**: münchen.tv - **mva**: Microsoft Virtual Academy videos - **mva:course**: Microsoft Virtual Academy courses @@ -1058,7 +1059,6 @@ - **vidme** - **vidme:user** - **vidme:user:likes** - - **Vidzi** - **vier**: vier.be and vijf.be - **vier:videos** - **viewlift** @@ -1103,6 +1103,7 @@ - **vrv** - **vrv:series** - **VShare** + - **VTM** - **VTXTV** - **vube**: Vube.com - **VuClip** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c52f1d9ca..d898525c9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.01.24.1' +__version__ = '2021.02.04' From fc88e8f0e3e66f17f787cbc1ea45c87fdc70781e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= Date: Thu, 4 Feb 2021 00:57:56 +0100 Subject: [PATCH 010/242] [azmedien] Fix extraction (#28064) --- youtube_dl/extractor/azmedien.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index b1e20def5..930266990 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -47,7 +47,7 @@ class AZMedienIE(InfoExtractor): 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be' _PARTNER_ID = '1719221' def _real_extract(self, url): From 7215691ab7cabc858b17c16928c372da3e35ec59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 4 Feb 2021 13:07:43 +0700 Subject: [PATCH 011/242] [youtube] Prefer DASH formats (closes #28070) --- youtube_dl/extractor/youtube.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 42b0f452c..a3b10c094 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1549,16 +1549,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('youtube_include_dash_manifest'): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: + dash_formats = [] for f in self._extract_mpd_formats( dash_manifest_url, video_id, fatal=False): - if f['format_id'] in itags: - continue filesize = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if filesize: f['filesize'] = filesize - formats.append(f) + dash_formats.append(f) + # Until further investigation prefer DASH formats as non-DASH + # may not be available (see [1]) + # 1. https://github.com/ytdl-org/youtube-dl/issues/28070 + if dash_formats: + dash_formats_keys = [f['format_id'] for f in dash_formats] + formats = [f for f in formats if f['format_id'] not in dash_formats_keys] + formats.extend(dash_formats) if not formats: if streaming_data.get('licenseInfos'): From c7d407bca205d8eb248b94b611435187265b79da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 4 Feb 2021 13:09:28 +0700 Subject: [PATCH 012/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index d5d9c00a2..4392a4e6f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +version + +Extractors +* [youtube] Prefer DASH formats (#28070) +* [azmedien] Fix extraction (#28064) + + version 2021.02.04 Extractors From a4bdc3112bf0e925afc2e512d5f23f9097f6bc7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 4 Feb 2021 13:11:33 +0700 Subject: [PATCH 013/242] release 2021.02.04.1 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 86e48bc4e..19b750f86 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04 + [debug] youtube-dl version 2021.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index fa369b744..8acb80b60 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 806c7c58d..66edcf752 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 1d1a36dda..18203fb34 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04 + [debug] youtube-dl version 2021.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index c19052a7a..20df40cc5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04** +- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 4392a4e6f..784b73d8d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.02.04.1 Extractors * [youtube] Prefer DASH formats (#28070) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d898525c9..425f15589 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.04' +__version__ = '2021.02.04.1' From 1641b132323b544b9ae0dad06707425eba1f926b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Feb 2021 13:05:35 +0100 Subject: [PATCH 014/242] [youtube] skip OTF formats(#28070) --- youtube_dl/extractor/youtube.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a3b10c094..eb5f70763 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1477,6 +1477,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): formats = [] itags = [] + itag_qualities = {} player_url = None q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) streaming_data = player_response.get('streamingData') or {} @@ -1486,6 +1487,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if fmt.get('targetDurationSec') or fmt.get('drmFamilies'): continue + itag = str_or_none(fmt.get('itag')) + quality = fmt.get('quality') + if itag and quality: + itag_qualities[itag] = quality + # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment + # (adding `&sq=0` to the URL) and parsing emsg box to determine the + # number of fragment that would subsequently requested with (`&sq=N`) + if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF': + continue + fmt_url = fmt.get('url') if not fmt_url: sc = compat_parse_qs(fmt.get('signatureCipher')) @@ -1505,10 +1516,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' fmt_url += '&' + sp + '=' + signature - itag = str_or_none(fmt.get('itag')) if itag: itags.append(itag) - quality = fmt.get('quality') dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), @@ -1549,22 +1558,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('youtube_include_dash_manifest'): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: - dash_formats = [] for f in self._extract_mpd_formats( dash_manifest_url, video_id, fatal=False): + itag = f['format_id'] + if itag in itags: + continue + if itag in itag_qualities: + f['quality'] = q(itag_qualities[itag]) filesize = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if filesize: f['filesize'] = filesize - dash_formats.append(f) - # Until further investigation prefer DASH formats as non-DASH - # may not be available (see [1]) - # 1. https://github.com/ytdl-org/youtube-dl/issues/28070 - if dash_formats: - dash_formats_keys = [f['format_id'] for f in dash_formats] - formats = [f for f in formats if f['format_id'] not in dash_formats_keys] - formats.extend(dash_formats) + formats.append(f) if not formats: if streaming_data.get('licenseInfos'): From 0156ce95c5ba83de6c68a149d352ccecd983a294 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Feb 2021 01:54:46 +0700 Subject: [PATCH 015/242] [youtube] Extract abr and vbr (closes #28100) --- youtube_dl/extractor/youtube.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eb5f70763..b5e0f4eaa 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -500,6 +500,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', + 'abr': 129.495, }, 'params': { 'youtube_include_dash_manifest': True, @@ -1518,6 +1519,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if itag: itags.append(itag) + tbr = float_or_none( + fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), @@ -1526,8 +1529,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'fps': int_or_none(fmt.get('fps')), 'height': int_or_none(fmt.get('height')), 'quality': q(quality), - 'tbr': float_or_none(fmt.get( - 'averageBitrate') or fmt.get('bitrate'), 1000), + 'tbr': tbr, 'url': fmt_url, 'width': fmt.get('width'), } @@ -1538,7 +1540,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if mobj: dct['ext'] = mimetype2ext(mobj.group(1)) dct.update(parse_codecs(mobj.group(2))) - if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': + no_audio = dct.get('acodec') == 'none' + no_video = dct.get('vcodec') == 'none' + if no_audio: + dct['vbr'] = tbr + if no_video: + dct['abr'] = tbr + if no_audio or no_video: dct['downloader_options'] = { # Youtube throttles chunks >~10M 'http_chunk_size': 10485760, From 0cf09c2b4168cb99800836d8c1ff0d6d8b16fb6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Feb 2021 02:17:03 +0700 Subject: [PATCH 016/242] [youtube] Fix release date extraction (closes #28094) --- youtube_dl/extractor/youtube.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b5e0f4eaa..c87e54e6b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1019,6 +1019,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ', 'only_matching': True, }, + { + # https://github.com/ytdl-org/youtube-dl/pull/28094 + 'url': 'OtqTfy26tG0', + 'info_dict': { + 'id': 'OtqTfy26tG0', + 'ext': 'mp4', + 'title': 'Burn Out', + 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131', + 'upload_date': '20141120', + 'uploader': 'The Cinematic Orchestra - Topic', + 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw', + 'artist': 'The Cinematic Orchestra', + 'track': 'Burn Out', + 'album': 'Every Day', + 'release_data': None, + 'release_year': None, + }, + 'params': { + 'skip_download': True, + }, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1743,7 +1765,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')), 'track': mobj.group('track').strip(), 'release_date': release_date, - 'release_year': int(release_year), + 'release_year': int_or_none(release_year), }) initial_data = None From 240585470539d31d9c3785a67861491fa3696451 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Heine=20n=C3=A9=20Lang?= Date: Sat, 6 Feb 2021 20:46:05 +0100 Subject: [PATCH 017/242] [urplay] Fix extraction (closes #28073) (#28074) --- youtube_dl/extractor/urplay.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index 10b817760..5452c7ca1 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -42,8 +42,8 @@ class URPlayIE(InfoExtractor): url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) urplayer_data = self._parse_json(self._html_search_regex( - r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['currentProduct'] + r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'][0] episode = urplayer_data['title'] raw_streaming_info = urplayer_data['streamingInfo']['raw'] host = self._download_json( From 7a9161578e42abe681c9d3352ecc9a18a9b8df6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Feb 2021 19:18:06 +0700 Subject: [PATCH 018/242] [cda] Detect geo restricted videos (refs #28106) --- youtube_dl/extractor/cda.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 6429454fb..1b4362144 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -95,6 +95,9 @@ class CDAIE(InfoExtractor): if 'Ten film jest dostępny dla użytkowników premium' in webpage: raise ExtractorError('This video is only available for premium users.', expected=True) + if re.search(r'niedostępn[ey] w(?: |\s+)Twoim kraju\s*<', webpage): + self.raise_geo_restricted() + need_confirm_age = False if self._html_search_regex(r'(]+action="[^"]*/a/validatebirth[^"]*")', webpage, 'birthday validate form', default=None): From 5fc53690cbe6abb11941a3f4846b566a7472753e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 7 Feb 2021 20:34:41 +0700 Subject: [PATCH 019/242] [archiveorg] Fix and improve extraction (closes #21330, closes #23586, closes #25277, closes #26780, closes #27109, closes #27236, closes #28063) --- youtube_dl/extractor/archiveorg.py | 54 +++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index c79c58e82..e42ed5e79 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -2,15 +2,17 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - unified_strdate, clean_html, + extract_attributes, + unified_strdate, + unified_timestamp, ) class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P[^/?#]+)(?:[?].*)?$' + _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'md5': '8af1d4cf447933ed3c7f4871162602db', @@ -19,8 +21,11 @@ class ArchiveOrgIE(InfoExtractor): 'ext': 'ogg', 'title': '1968 Demo - FJCC Conference Presentation Reel #1', 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', - 'upload_date': '19681210', - 'uploader': 'SRI International' + 'creator': 'SRI International', + 'release_date': '19681210', + 'uploader': 'SRI International', + 'timestamp': 1268695290, + 'upload_date': '20100315', } }, { 'url': 'https://archive.org/details/Cops1922', @@ -29,22 +34,43 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6', + 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', + 'timestamp': 1387699629, + 'upload_date': '20131222', } }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'only_matching': True, + }, { + 'url': 'https://archive.org/details/MSNBCW_20131125_040000_To_Catch_a_Predator/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) - jwplayer_playlist = self._parse_json(self._search_regex( - r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", - webpage, 'jwplayer playlist'), video_id) - info = self._parse_jwplayer_data( - {'playlist': jwplayer_playlist}, video_id, base_url=url) + + playlist = None + play8 = self._search_regex( + r'(<[^>]+\bclass=["\']js-play8-playlist[^>]+>)', webpage, + 'playlist', default=None) + if play8: + attrs = extract_attributes(play8) + playlist = attrs.get('value') + if not playlist: + # Old jwplayer fallback + playlist = self._search_regex( + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", + webpage, 'jwplayer playlist', default='[]') + jwplayer_playlist = self._parse_json(playlist, video_id, fatal=False) + if jwplayer_playlist: + info = self._parse_jwplayer_data( + {'playlist': jwplayer_playlist}, video_id, base_url=url) + else: + # HTML5 media fallback + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info['id'] = video_id def get_optional(metadata, field): return metadata.get(field, [None])[0] @@ -58,8 +84,12 @@ class ArchiveOrgIE(InfoExtractor): 'description': clean_html(get_optional(metadata, 'description')), }) if info.get('_type') != 'playlist': + creator = get_optional(metadata, 'creator') info.update({ - 'uploader': get_optional(metadata, 'creator'), - 'upload_date': unified_strdate(get_optional(metadata, 'date')), + 'creator': creator, + 'release_date': unified_strdate(get_optional(metadata, 'date')), + 'uploader': get_optional(metadata, 'publisher') or creator, + 'timestamp': unified_timestamp(get_optional(metadata, 'publicdate')), + 'language': get_optional(metadata, 'language'), }) return info From 99c68db0a8adc634e2e928ea2756a2ceee3ae863 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 8 Feb 2021 09:20:28 +0100 Subject: [PATCH 020/242] [youtube] add support phone/tablet JS player(closes #26424) --- test/test_youtube_signature.py | 31 ++++++++++--------------------- youtube_dl/extractor/youtube.py | 6 +++++- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index b5a4d0d5f..627d4cb92 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -19,55 +19,46 @@ from youtube_dl.compat import compat_str, compat_urlretrieve _TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', - 'js', 86, '>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js', - 'js', 85, '3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-vfle-mVwz.js', - 'js', 90, ']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl0Cbn9e.js', - 'js', 84, 'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVW@YZ!"#$%&\'()*+,-./:;<=', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', - 'js', '2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', 'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js', - 'js', 84, '123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQ0STUVWXYZ!"#$%&\'()*+,@./:;<=>' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl9FYC6l.js', - 'js', 83, '123456789abcdefghijklmnopqr0tuvwxyzABCDETGHIJKLMNOPQRS>UVWXYZ!"#$%&\'()*+,-./:;<=F' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflCGk6yw/html5player.js', - 'js', '4646B5181C6C3020DF1D9C7FCFEA.AD80ABF70C39BD369CCCAE780AFBB98FA6B6CB42766249D9488C288', '82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B' ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', - 'js', '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', ) @@ -78,6 +69,10 @@ class TestPlayerInfo(unittest.TestCase): def test_youtube_extract_player_info(self): PLAYER_URLS = ( ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/fr_FR/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'), + ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'), # obsolete ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 'vfle4-e03'), ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), @@ -100,13 +95,13 @@ class TestSignature(unittest.TestCase): os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, stype, sig_input, expected_sig): +def make_tfunc(url, sig_input, expected_sig): m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url) assert m, '%r should follow URL format' % url test_id = m.group(1) def test_func(self): - basename = 'player-%s.%s' % (test_id, stype) + basename = 'player-%s.js' % test_id fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): @@ -114,22 +109,16 @@ def make_tfunc(url, stype, sig_input, expected_sig): ydl = FakeYDL() ie = YoutubeIE(ydl) - if stype == 'js': - with io.open(fn, encoding='utf-8') as testf: - jscode = testf.read() - func = ie._parse_sig_js(jscode) - else: - assert stype == 'swf' - with open(fn, 'rb') as testf: - swfcode = testf.read() - func = ie._parse_sig_swf(swfcode) + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + func = ie._parse_sig_js(jscode) src_sig = ( compat_str(string.printable[:sig_input]) if isinstance(sig_input, int) else sig_input) got_sig = func(src_sig) self.assertEqual(got_sig, expected_sig) - test_func.__name__ = str('test_signature_' + stype + '_' + test_id) + test_func.__name__ = str('test_signature_js_' + test_id) setattr(TestSignature, test_func.__name__, test_func) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c87e54e6b..346311d9b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -398,7 +398,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?(1).+)? # if we found the ID, everything can follow $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _PLAYER_INFO_RE = ( - r'/(?P[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.js$', + r'/s/player/(?P[a-zA-Z0-9_-]{8,})/player', + r'/(?P[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'\b(?Pvfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') @@ -1237,6 +1238,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\bm=(?P[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)', + r'\bc&&\(c=(?P[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)', + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)', r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns From 311ebdd9a57e72116136a464fbc0fa8cad32db42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 8 Feb 2021 15:46:32 +0700 Subject: [PATCH 021/242] [xhamster] Extract formats from xplayer settings and extract filesizes (closes #28114) --- youtube_dl/extractor/xhamster.py | 80 +++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 76aeaf9a4..f73b9778f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -11,11 +11,14 @@ from ..utils import ( dict_get, extract_attributes, ExtractorError, + float_or_none, int_or_none, parse_duration, + str_or_none, try_get, unified_strdate, url_or_none, + urljoin, ) @@ -146,36 +149,89 @@ class XHamsterIE(InfoExtractor): video = initials['videoModel'] title = video['title'] formats = [] - for format_id, formats_dict in video['sources'].items(): + format_urls = set() + format_sizes = {} + sources = try_get(video, lambda x: x['sources'], dict) or {} + for format_id, formats_dict in sources.items(): if not isinstance(formats_dict, dict): continue + download_sources = try_get(sources, lambda x: x['download'], dict) or {} + for quality, format_dict in download_sources.items(): + if not isinstance(format_dict, dict): + continue + format_sizes[quality] = float_or_none(format_dict.get('size')) for quality, format_item in formats_dict.items(): if format_id == 'download': # Download link takes some time to be generated, # skipping for now continue - if not isinstance(format_item, dict): - continue - format_url = format_item.get('link') - filesize = int_or_none( - format_item.get('size'), invscale=1000000) - else: - format_url = format_item - filesize = None + format_url = format_item format_url = url_or_none(format_url) - if not format_url: + if not format_url or format_url in format_urls: continue + format_urls.add(format_url) formats.append({ 'format_id': '%s-%s' % (format_id, quality), 'url': format_url, 'ext': determine_ext(format_url, 'mp4'), 'height': get_height(quality), - 'filesize': filesize, + 'filesize': format_sizes.get(quality), 'http_headers': { 'Referer': urlh.geturl(), }, }) - self._sort_formats(formats) + xplayer_sources = try_get( + initials, lambda x: x['xplayerSettings']['sources'], dict) + if xplayer_sources: + hls_sources = xplayer_sources.get('hls') + if isinstance(hls_sources, dict): + for hls_format_key in ('url', 'fallback'): + hls_url = hls_sources.get(hls_format_key) + if not hls_url: + continue + hls_url = urljoin(url, hls_url) + if not hls_url or hls_url in format_urls: + continue + format_urls.add(hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + standard_sources = xplayer_sources.get('standard') + if isinstance(standard_sources, dict): + for format_id, formats_list in standard_sources.items(): + if not isinstance(formats_list, list): + continue + for standard_format in formats_list: + if not isinstance(standard_format, dict): + continue + for standard_format_key in ('url', 'fallback'): + standard_url = standard_format.get(standard_format_key) + if not standard_url: + continue + standard_url = urljoin(url, standard_url) + if not standard_url or standard_url in format_urls: + continue + format_urls.add(standard_url) + ext = determine_ext(standard_url, 'mp4') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + standard_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = (str_or_none(standard_format.get('quality')) + or str_or_none(standard_format.get('label')) + or '') + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': standard_url, + 'ext': ext, + 'height': get_height(quality), + 'filesize': format_sizes.get(quality), + 'http_headers': { + 'Referer': standard_url, + }, + }) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) categories_list = video.get('categories') if isinstance(categories_list, list): From 7f8b8bc418b8831ea1c2ae8de64e3bf0e8b707f8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 8 Feb 2021 15:56:42 +0100 Subject: [PATCH 022/242] [ign] fix extraction(closes #24771) --- youtube_dl/extractor/extractors.py | 4 +- youtube_dl/extractor/ign.py | 371 +++++++++++++++-------------- 2 files changed, 200 insertions(+), 175 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 97b0b4034..84998316c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -470,8 +470,8 @@ from .hungama import ( from .hypem import HypemIE from .ign import ( IGNIE, - OneUPIE, - PCMagIE, + IGNVideoIE, + IGNArticleIE, ) from .iheart import ( IHeartRadioIE, diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index a96ea8010..0d9f50ed2 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -3,230 +3,255 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( + HEADRequest, + determine_ext, int_or_none, parse_iso8601, + strip_or_none, + try_get, ) -class IGNIE(InfoExtractor): +class IGNBaseIE(InfoExtractor): + def _call_api(self, slug): + return self._download_json( + 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + + +class IGNIE(IGNBaseIE): """ Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?Pvideos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P.+)' + _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P[^/?&#]+)' IE_NAME = 'ign.com' + _PAGE_TYPE = 'video' - _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' - _EMBED_RE = r']+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']' - - _TESTS = [ - { - 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'febda82c4bafecd2d44b6e1a18a595f8', - 'info_dict': { - 'id': '8f862beef863986b2785559b9e1aa599', - 'ext': 'mp4', - 'title': 'The Last of Us Review', - 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', - 'timestamp': 1370440800, - 'upload_date': '20130605', - 'uploader_id': 'cberidon@ign.com', - } - }, - { - 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', - 'info_dict': { - 'id': '100-little-things-in-gta-5-that-will-blow-your-mind', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '5ebbd138523268b93c9141af17bec937', - 'ext': 'mp4', - 'title': 'GTA 5 Video Review', - 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', - 'timestamp': 1379339880, - 'upload_date': '20130916', - 'uploader_id': 'danieljkrupa@gmail.com', - }, - }, - { - 'info_dict': { - 'id': '638672ee848ae4ff108df2a296418ee2', - 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', - 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', - 'timestamp': 1386878820, - 'upload_date': '20131212', - 'uploader_id': 'togilvie@ign.com', - }, - }, - ], - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', - 'md5': '618fedb9c901fd086f6f093564ef8558', - 'info_dict': { - 'id': '078fdd005f6d3c02f63d795faa1b984f', - 'ext': 'mp4', - 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', - 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', - 'timestamp': 1408047180, - 'upload_date': '20140814', - 'uploader_id': 'jamesduggan1990@gmail.com', - }, - }, - { - 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', - 'only_matching': True, - }, - { - 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', - 'only_matching': True, - }, - { - # videoId pattern - 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', - 'only_matching': True, - }, - ] - - def _find_video_id(self, webpage): - res_id = [ - r'"video_id"\s*:\s*"(.*?)"', - r'class="hero-poster[^"]*?"[^>]*id="(.+?)"', - r'data-video-id="(.+?)"', - r']*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', - webpage) - if multiple_urls: - entries = [self.url_result(u, ie='IGN') for u in multiple_urls] - return { - '_type': 'playlist', - 'id': name_or_id, - 'entries': entries, - } - - video_id = self._find_video_id(webpage) - if not video_id: - return self.url_result(self._search_regex( - self._EMBED_RE, webpage, 'embed url')) - return self._get_video_info(video_id) - - def _get_video_info(self, video_id): - api_data = self._download_json( - self._API_URL_TEMPLATE % video_id, video_id) + display_id = self._match_id(url) + video = self._call_api(display_id) + video_id = video['videoId'] + metadata = video['metadata'] + title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] formats = [] - m3u8_url = api_data['refs'].get('m3uUrl') + refs = video.get('refs') or {} + + m3u8_url = refs.get('m3uUrl') if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - f4m_url = api_data['refs'].get('f4mUrl') + + f4m_url = refs.get('f4mUrl') if f4m_url: formats.extend(self._extract_f4m_formats( f4m_url, video_id, f4m_id='hds', fatal=False)) - for asset in api_data['assets']: + + for asset in (video.get('assets') or []): + asset_url = asset.get('url') + if not asset_url: + continue formats.append({ - 'url': asset['url'], - 'tbr': asset.get('actual_bitrate_kbps'), - 'fps': asset.get('frame_rate'), + 'url': asset_url, + 'tbr': int_or_none(asset.get('bitrate'), 1000), + 'fps': int_or_none(asset.get('frame_rate')), 'height': int_or_none(asset.get('height')), 'width': int_or_none(asset.get('width')), }) + + mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) + if mezzanine_url: + formats.append({ + 'ext': determine_ext(mezzanine_url, 'mp4'), + 'format_id': 'mezzanine', + 'preference': 1, + 'url': mezzanine_url, + }) + self._sort_formats(formats) - thumbnails = [{ - 'url': thumbnail['url'] - } for thumbnail in api_data.get('thumbnails', [])] + thumbnails = [] + for thumbnail in (video.get('thumbnails') or []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + }) - metadata = api_data['metadata'] + tags = [] + for tag in (video.get('tags') or []): + display_name = tag.get('displayName') + if not display_name: + continue + tags.append(display_name) return { - 'id': api_data.get('videoId') or video_id, - 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], - 'description': metadata.get('description'), + 'id': video_id, + 'title': title, + 'description': strip_or_none(metadata.get('description')), 'timestamp': parse_iso8601(metadata.get('publishDate')), 'duration': int_or_none(metadata.get('duration')), - 'display_id': metadata.get('slug') or video_id, - 'uploader_id': metadata.get('creator'), + 'display_id': display_id, 'thumbnails': thumbnails, 'formats': formats, + 'tags': tags, } -class OneUPIE(IGNIE): - _VALID_URL = r'https?://gamevideos\.1up\.com/(?Pvideo)/id/(?P.+)\.html' - IE_NAME = '1up.com' - +class IGNVideoIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P\d+)/(?:video|trailer)/' _TESTS = [{ - 'url': 'http://gamevideos.1up.com/video/id/34976.html', - 'md5': 'c9cc69e07acb675c31a16719f909e347', + 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', + 'md5': 'dd9aca7ed2657c4e118d8b261e5e9de1', 'info_dict': { - 'id': '34976', + 'id': 'e9be7ea899a9bbfc0674accc22a36cc8', 'ext': 'mp4', - 'title': 'Sniper Elite V2 - Trailer', - 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', - 'timestamp': 1313099220, - 'upload_date': '20110811', - 'uploader_id': 'IGN', + 'title': 'How Hitman Aims to Be Different Than Every Other Stealth Game - NYCC 2015', + 'description': 'Taking out assassination targets in Hitman has never been more stylish.', + 'timestamp': 1444665600, + 'upload_date': '20151012', } + }, { + 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://me.ign.com/ar/ratchet-clank-rift-apart/144327/trailer/embed', + 'only_matching': True, + }, { + # Twitter embed + 'url': 'http://adria.ign.com/sherlock-season-4/9687/trailer/embed', + 'only_matching': True, + }, { + # Vimeo embed + 'url': 'https://kr.ign.com/bic-2018/3307/trailer/embed', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - result = super(OneUPIE, self)._real_extract(url) - result['id'] = mobj.group('name_or_id') - return result - - -class PCMagIE(IGNIE): - _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?Pvideos|article2)(/.+)?/(?P.+)' - IE_NAME = 'pcmag' + video_id = self._match_id(url) + req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') + url = self._request_webpage(req, video_id).geturl() + ign_url = compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + if ign_url: + return self.url_result(ign_url, IGNIE.ie_key()) + return self.url_result(url) - _EMBED_RE = r'iframe\.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content\.html?[^"]*url=([^"]+)["&]' +class IGNArticleIE(IGNBaseIE): + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P[^/?&#]+)' + _PAGE_TYPE = 'article' _TESTS = [{ - 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', - 'md5': '212d6154fd0361a2781075f1febbe9ad', + 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': 'ee10d774b508c9b8ec07e763b9125b91', - 'ext': 'mp4', - 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', - 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', - 'timestamp': 1420571160, - 'upload_date': '20150106', - 'uploader_id': 'cozzipix@gmail.com', - } + 'id': '524497489e4e8ff5848ece34', + 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', + }, + 'playlist': [ + { + 'info_dict': { + 'id': '5ebbd138523268b93c9141af17bec937', + 'ext': 'mp4', + 'title': 'GTA 5 Video Review', + 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + 'timestamp': 1379339880, + 'upload_date': '20130916', + }, + }, + { + 'info_dict': { + 'id': '638672ee848ae4ff108df2a296418ee2', + 'ext': 'mp4', + 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', + 'timestamp': 1386878820, + 'upload_date': '20131212', + }, + }, + ], + 'params': { + 'playlist_items': '2-3', + 'skip_download': True, + }, }, { - 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', - 'md5': '94130c1ca07ba0adb6088350681f16c1', + 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { - 'id': '042e560ba94823d43afcb12ddf7142ca', - 'ext': 'mp4', - 'title': 'HTC\'s Weird New Re Camera - What\'s New Now', - 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', - 'timestamp': 1412953920, - 'upload_date': '20141010', - 'uploader_id': 'chris_snyder@pcmag.com', - } + 'id': '53ee806780a81ec46e0790f8', + 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', + }, + 'playlist_count': 2, + }, { + # videoId pattern + 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.ign.com/articles/2021-mvp-named-in-puppy-bowl-xvii', + 'only_matching': True, + }, { + # IMDB embed + 'url': 'https://www.ign.com/articles/2014/08/07/sons-of-anarchy-final-season-trailer', + 'only_matching': True, + }, { + # Facebook embed + 'url': 'https://www.ign.com/articles/2017/09/20/marvels-the-punisher-watch-the-new-trailer-for-the-netflix-series', + 'only_matching': True, + }, { + # Brightcove embed + 'url': 'https://www.ign.com/articles/2016/01/16/supergirl-goes-flying-with-martian-manhunter-in-new-clip', + 'only_matching': True, }] + + def _real_extract(self, url): + display_id = self._match_id(url) + article = self._call_api(display_id) + + def entries(): + media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) From a4c7ed6b1e9100be8ef65c44e7e6e43b9314ff5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Feb 2021 22:28:58 +0700 Subject: [PATCH 023/242] [youtube:tab] Improve grid continuation extraction (closes #28130) --- youtube_dl/extractor/youtube.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 346311d9b..c78996629 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2374,9 +2374,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): next_continuation = cls._extract_next_continuation_data(renderer) if next_continuation: return next_continuation - contents = renderer.get('contents') - if not isinstance(contents, list): - return + contents = [] + for key in ('contents', 'items'): + contents.extend(try_get(renderer, lambda x: x[key], list) or []) for content in contents: if not isinstance(content, dict): continue @@ -2509,6 +2509,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continuation_item = continuation_items[0] if not isinstance(continuation_item, dict): continue + renderer = continuation_item.get('gridVideoRenderer') + if renderer: + grid_renderer = {'items': continuation_items} + for entry in self._grid_entries(grid_renderer): + yield entry + continuation = self._extract_continuation(grid_renderer) + continue renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer') if renderer: video_list_renderer = {'contents': continuation_items} From cd493c5adcb526cdfa2a9d5194269b671a0dc343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Feb 2021 22:32:25 +0700 Subject: [PATCH 024/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ChangeLog b/ChangeLog index 784b73d8d..5951372b3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +version + +Extractors +* [youtube:tab] Improve grid continuation extraction (#28130) +* [ign] Fix extraction (#24771) ++ [xhamster] Extract format filesize ++ [xhamster] Extract formats from xplayer settings (#28114) ++ [youtube] Add support phone/tablet JS player (#26424) +* [archiveorg] Fix and improve extraction (#21330, #23586, #25277, #26780, + #27109, #27236, #28063) ++ [cda] Detect geo restricted videos (#28106) +* [urplay] Fix extraction (#28073, #28074) +* [youtube] Fix release date extraction (#28094) ++ [youtube] Extract abr and vbr (#28100) +* [youtube] Skip OTF formats (#28070) + + version 2021.02.04.1 Extractors From 360d5f0daac879a1371c6a45e0d3310ced60e352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Feb 2021 22:34:47 +0700 Subject: [PATCH 025/242] release 2021.02.10 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 4 ++-- youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 19b750f86..ea0a59dca 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04.1 + [debug] youtube-dl version 2021.02.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 8acb80b60..d24855c72 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 66edcf752..8b96a2883 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 18203fb34..e46971047 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.04.1 + [debug] youtube-dl version 2021.02.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 20df40cc5..a9ca379ca 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.04.1** +- [ ] I've verified that I'm running youtube-dl version **2021.02.10** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 5951372b3..384bd19c2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.02.10 Extractors * [youtube:tab] Improve grid continuation extraction (#28130) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e1b85b1d1..1373cc4f6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1,6 +1,5 @@ # Supported sites - **1tv**: Первый канал - - **1up.com** - **20min** - **220.ro** - **23video** @@ -376,6 +375,8 @@ - **HungamaSong** - **Hypem** - **ign.com** + - **IGNArticle** + - **IGNVideo** - **IHeartRadio** - **iheartradio:podcast** - **imdb**: Internet Movie Database trailers @@ -676,7 +677,6 @@ - **parliamentlive.tv**: UK parliament videos - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - - **pcmag** - **PearVideo** - **PeerTube** - **People** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 425f15589..79d2be625 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.04.1' +__version__ = '2021.02.10' From f28f1b4d6ed053f1dbfbc7fc992162c4ea4f2ce7 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Thu, 11 Feb 2021 09:04:16 +0100 Subject: [PATCH 026/242] [canvas] Add new extractor for Dagelijkse Kost (#28119) --- youtube_dl/extractor/canvas.py | 56 ++++++++++++++++++++++++++++-- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 8b76a0200..eefbab241 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -7,19 +7,21 @@ from .common import InfoExtractor from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( - extract_attributes, ExtractorError, - strip_or_none, + clean_html, + extract_attributes, float_or_none, + get_element_by_class, int_or_none, merge_dicts, str_or_none, + strip_or_none, url_or_none, ) class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P[^/?#&]+)' + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'md5': '68993eda72ef62386a15ea2cf3c93107', @@ -332,3 +334,51 @@ class VrtNUIE(GigyaBaseIE): 'display_id': display_id, 'season_number': int_or_none(page.get('episode_season')), }) + + +class DagelijkseKostIE(InfoExtractor): + IE_DESC = 'dagelijksekost.een.be' + _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P[^/?#&]+)' + _TEST = { + 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', + 'md5': '30bfffc323009a3e5f689bef6efa2365', + 'info_dict': { + 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', + 'display_id': 'hachis-parmentier-met-witloof', + 'ext': 'mp4', + 'title': 'Hachis parmentier met witloof', + 'description': 'md5:9960478392d87f63567b5b117688cdc5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 283.02, + }, + 'expected_warnings': ['is not a supported codec'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = strip_or_none(get_element_by_class( + 'dish-metadata__title', webpage + ) or self._html_search_meta( + 'twitter:title', webpage)) + + description = clean_html(get_element_by_class( + 'dish-description', webpage) + ) or self._html_search_meta( + ('description', 'twitter:description', 'og:description'), + webpage) + + video_id = self._html_search_regex( + r'data-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', + group='id') + + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 84998316c..e4c475fd8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -163,6 +163,7 @@ from .canvas import ( CanvasIE, CanvasEenIE, VrtNUIE, + DagelijkseKostIE, ) from .carambatv import ( CarambaTVIE, From f94d76499362017f673520286bc3848916735275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Feb 2021 05:03:15 +0700 Subject: [PATCH 027/242] [ard] Improve formats extraction (closes #28155) --- youtube_dl/extractor/ard.py | 44 +++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6bf5b3f13..143fc51e9 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -284,20 +284,42 @@ class ARDIE(InfoExtractor): formats = [] for a in video_node.findall('.//asset'): + file_name = xpath_text(a, './fileName', default=None) + if not file_name: + continue + format_type = a.attrib.get('type') + format_url = url_or_none(file_name) + if format_url: + ext = determine_ext(file_name) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_type or 'hls', fatal=False)) + continue + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), + display_id, f4m_id=format_type or 'hds', fatal=False)) + continue f = { - 'format_id': a.attrib['type'], - 'width': int_or_none(a.find('./frameWidth').text), - 'height': int_or_none(a.find('./frameHeight').text), - 'vbr': int_or_none(a.find('./bitrateVideo').text), - 'abr': int_or_none(a.find('./bitrateAudio').text), - 'vcodec': a.find('./codecVideo').text, - 'tbr': int_or_none(a.find('./totalBitrate').text), + 'format_id': format_type, + 'width': int_or_none(xpath_text(a, './frameWidth')), + 'height': int_or_none(xpath_text(a, './frameHeight')), + 'vbr': int_or_none(xpath_text(a, './bitrateVideo')), + 'abr': int_or_none(xpath_text(a, './bitrateAudio')), + 'vcodec': xpath_text(a, './codecVideo'), + 'tbr': int_or_none(xpath_text(a, './totalBitrate')), } - if a.find('./serverPrefix').text: - f['url'] = a.find('./serverPrefix').text - f['playpath'] = a.find('./fileName').text + server_prefix = xpath_text(a, './serverPrefix', default=None) + if server_prefix: + f.update({ + 'url': server_prefix, + 'playpath': file_name, + }) else: - f['url'] = a.find('./fileName').text + if not format_url: + continue + f['url'] = format_url formats.append(f) self._sort_formats(formats) From 6d32c6c6d3b0588b193eaeb4178592219c3b4df8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Feb 2021 16:22:45 +0100 Subject: [PATCH 028/242] [xboxclips] fix extraction(closes #27151) --- youtube_dl/extractor/xboxclips.py | 43 +++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py index d9c277bc3..25f487e1e 100644 --- a/youtube_dl/extractor/xboxclips.py +++ b/youtube_dl/extractor/xboxclips.py @@ -1,40 +1,55 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( int_or_none, + month_by_abbreviation, parse_filesize, - unified_strdate, ) class XboxClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P[\w-]{36})' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', 'info_dict': { 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', 'ext': 'mp4', - 'title': 'Iabdulelah playing Titanfall', + 'title': 'iAbdulElah playing Titanfall', 'filesize_approx': 26800000, 'upload_date': '20140807', 'duration': 56, } - } + }, { + 'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) + if '/video.php' in url: + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0]) + webpage = self._download_webpage(url, video_id) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] - video_url = self._html_search_regex( - r'>(?:Link|Download): ]+href="([^"]+)"', webpage, 'video URL') - title = self._html_search_regex( - r'XboxClips \| ([^<]+)', webpage, 'title') - upload_date = unified_strdate(self._html_search_regex( - r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False)) + title = self._html_search_meta(['og:title', 'twitter:title'], webpage) + upload_date = None + mobj = re.search( + r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})', + webpage) + if mobj: + upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1)) filesize = parse_filesize(self._html_search_regex( r'>Size: ([^<]+)<', webpage, 'file size', fatal=False)) duration = int_or_none(self._html_search_regex( @@ -42,12 +57,12 @@ class XboxClipsIE(InfoExtractor): view_count = int_or_none(self._html_search_regex( r'>Views: (\d+)<', webpage, 'view count', fatal=False)) - return { + info.update({ 'id': video_id, - 'url': video_url, 'title': title, 'upload_date': upload_date, 'filesize_approx': filesize, 'duration': duration, 'view_count': view_count, - } + }) + return info From d8085580f63ad3b146a31712ff76cf41d5a4558a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Feb 2021 19:48:26 +0100 Subject: [PATCH 029/242] [kakao] improve info extraction and detect geo restriction(closes #26577) --- youtube_dl/extractor/kakao.py | 64 ++++++++++++++++------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py index 32935bb28..31ce7a85c 100644 --- a/youtube_dl/extractor/kakao.py +++ b/youtube_dl/extractor/kakao.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, int_or_none, + str_or_none, strip_or_none, + try_get, unified_timestamp, update_url_query, ) @@ -23,7 +26,7 @@ class KakaoIE(InfoExtractor): 'id': '301965083', 'ext': 'mp4', 'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』', - 'uploader_id': 2671005, + 'uploader_id': '2671005', 'uploader': '그랑그랑이', 'timestamp': 1488160199, 'upload_date': '20170227', @@ -36,11 +39,15 @@ class KakaoIE(InfoExtractor): 'ext': 'mp4', 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', - 'uploader_id': 2653210, + 'uploader_id': '2653210', 'uploader': '쇼! 음악중심', 'timestamp': 1485684628, 'upload_date': '20170129', } + }, { + # geo restricted + 'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491', + 'only_matching': True, }] def _real_extract(self, url): @@ -68,8 +75,7 @@ class KakaoIE(InfoExtractor): 'fields': ','.join([ '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title', 'description', 'channelId', 'createTime', 'duration', 'playCount', - 'likeCount', 'commentCount', 'tagList', 'channel', 'name', - 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault', + 'likeCount', 'commentCount', 'tagList', 'channel', 'name', 'thumbnailUrl', 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label']) } @@ -82,24 +88,28 @@ class KakaoIE(InfoExtractor): title = clip.get('title') or clip_link.get('displayTitle') - query['tid'] = impress.get('tid', '') + query.update({ + 'fields': '-*,code,message,url', + 'tid': impress.get('tid') or '', + }) formats = [] - for fmt in clip.get('videoOutputList', []): + for fmt in (clip.get('videoOutputList') or []): try: profile_name = fmt['profile'] if profile_name == 'AUDIO': continue - query.update({ - 'profile': profile_name, - 'fields': '-*,url', - }) - fmt_url_json = self._download_json( - api_base + 'raw/videolocation', display_id, - 'Downloading video URL for profile %s' % profile_name, - query=query, headers=player_header, fatal=False) - - if fmt_url_json is None: + query['profile'] = profile_name + try: + fmt_url_json = self._download_json( + api_base + 'raw/videolocation', display_id, + 'Downloading video URL for profile %s' % profile_name, + query=query, headers=player_header) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + resp = self._parse_json(e.cause.read().decode(), video_id) + if resp.get('code') == 'GeoBlocked': + self.raise_geo_restricted() continue fmt_url = fmt_url_json['url'] @@ -116,27 +126,13 @@ class KakaoIE(InfoExtractor): pass self._sort_formats(formats) - thumbs = [] - for thumb in clip.get('clipChapterThumbnailList', []): - thumbs.append({ - 'url': thumb.get('thumbnailUrl'), - 'id': compat_str(thumb.get('timeInSec')), - 'preference': -1 if thumb.get('isDefault') else 0 - }) - top_thumbnail = clip.get('thumbnailUrl') - if top_thumbnail: - thumbs.append({ - 'url': top_thumbnail, - 'preference': 10, - }) - return { 'id': display_id, 'title': title, 'description': strip_or_none(clip.get('description')), - 'uploader': clip_link.get('channel', {}).get('name'), - 'uploader_id': clip_link.get('channelId'), - 'thumbnails': thumbs, + 'uploader': try_get(clip_link, lambda x: x['channel']['name']), + 'uploader_id': str_or_none(clip_link.get('channelId')), + 'thumbnail': clip.get('thumbnailUrl'), 'timestamp': unified_timestamp(clip_link.get('createTime')), 'duration': int_or_none(clip.get('duration')), 'view_count': int_or_none(clip.get('playCount')), From be2e9b76eea73b073f00871ea831ee3f4a1000b3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Feb 2021 22:10:06 +0100 Subject: [PATCH 030/242] [videopress] add support for video.wordpress.com --- youtube_dl/extractor/videopress.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py index e5f964d39..6376ff096 100644 --- a/youtube_dl/extractor/videopress.py +++ b/youtube_dl/extractor/videopress.py @@ -4,21 +4,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, + int_or_none, parse_age_limit, qualities, random_birthday, - try_get, unified_timestamp, urljoin, ) class VideoPressIE(InfoExtractor): - _VALID_URL = r'https?://videopress\.com/embed/(?P[\da-zA-Z]+)' + _ID_REGEX = r'[\da-zA-Z]{8}' + _PATH_REGEX = r'video(?:\.word)?press\.com/embed/' + _VALID_URL = r'https?://%s(?P%s)' % (_PATH_REGEX, _ID_REGEX) _TESTS = [{ 'url': 'https://videopress.com/embed/kUJmAcSf', 'md5': '706956a6c875873d51010921310e4bc6', @@ -36,35 +37,36 @@ class VideoPressIE(InfoExtractor): # 17+, requires birth_* params 'url': 'https://videopress.com/embed/iH3gstfZ', 'only_matching': True, + }, { + 'url': 'https://video.wordpress.com/embed/kUJmAcSf', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r']+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)', + r']+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX), webpage) def _real_extract(self, url): video_id = self._match_id(url) query = random_birthday('birth_year', 'birth_month', 'birth_day') + query['fields'] = 'description,duration,file_url_base,files,height,original,poster,rating,title,upload_date,width' video = self._download_json( 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, video_id, query=query) title = video['title'] - def base_url(scheme): - return try_get( - video, lambda x: x['file_url_base'][scheme], compat_str) - - base_url = base_url('https') or base_url('http') + file_url_base = video.get('file_url_base') or {} + base_url = file_url_base.get('https') or file_url_base.get('http') QUALITIES = ('std', 'dvd', 'hd') quality = qualities(QUALITIES) formats = [] - for format_id, f in video['files'].items(): + for format_id, f in (video.get('files') or {}).items(): if not isinstance(f, dict): continue for ext, path in f.items(): @@ -75,12 +77,14 @@ class VideoPressIE(InfoExtractor): 'ext': determine_ext(path, ext), 'quality': quality(format_id), }) - original_url = try_get(video, lambda x: x['original'], compat_str) + original_url = video.get('original') if original_url: formats.append({ 'url': original_url, 'format_id': 'original', 'quality': len(QUALITIES), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), }) self._sort_formats(formats) From 4b5410c5c841b826965ea76d62bea82a26a9e1b8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Feb 2021 13:06:54 +0100 Subject: [PATCH 031/242] [ccma] fix timestamp parsing in python 2 --- youtube_dl/extractor/ccma.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py index 4db51e650..e6ae49352 100644 --- a/youtube_dl/extractor/ccma.py +++ b/youtube_dl/extractor/ccma.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import calendar import datetime import re from .common import InfoExtractor from ..utils import ( clean_html, + extract_timezone, int_or_none, parse_duration, parse_resolution, @@ -97,8 +99,9 @@ class CCMAIE(InfoExtractor): timestamp = None data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) try: - timestamp = datetime.datetime.strptime( - data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp() + timezone, data_utc = extract_timezone(data_utc) + timestamp = calendar.timegm((datetime.datetime.strptime( + data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) except TypeError: pass From 07eb8f19169c58bce0e784607ea350ae16ed5363 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 16 Feb 2021 05:57:53 +0700 Subject: [PATCH 032/242] [youtube] Fix controversial videos when authenticated with cookies (closes #28174) --- youtube_dl/extractor/youtube.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c78996629..7db4503e0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1042,6 +1042,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + # controversial video, only works with bpctr when authenticated with cookies + 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg', + 'only_matching': True, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1405,7 +1410,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = self._match_id(url) base_url = self.http_scheme() + '//www.youtube.com/' webpage_url = base_url + 'watch?v=' + video_id - webpage = self._download_webpage(webpage_url, video_id, fatal=False) + webpage = self._download_webpage( + webpage_url + '&bpctr=9999999999', video_id, fatal=False) player_response = None if webpage: From 56c63c8c02d7b9aabbced8d150badb6b520825d2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 16 Feb 2021 10:08:43 +0100 Subject: [PATCH 033/242] [zhihu] Add new extractor(closes #28177) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/zhihu.py | 69 ++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 youtube_dl/extractor/zhihu.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e4c475fd8..4347f1b74 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1603,5 +1603,6 @@ from .zattoo import ( ZattooLiveIE, ) from .zdf import ZDFIE, ZDFChannelIE +from .zhihu import ZhihuIE from .zingmp3 import ZingMp3IE from .zype import ZypeIE diff --git a/youtube_dl/extractor/zhihu.py b/youtube_dl/extractor/zhihu.py new file mode 100644 index 000000000..d1ed55be3 --- /dev/null +++ b/youtube_dl/extractor/zhihu.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import float_or_none, int_or_none + + +class ZhihuIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zhihu\.com/zvideo/(?P[0-9]+)' + _TEST = { + 'url': 'https://www.zhihu.com/zvideo/1342930761977176064', + 'md5': 'c8d4c9cd72dd58e6f9bc9c2c84266464', + 'info_dict': { + 'id': '1342930761977176064', + 'ext': 'mp4', + 'title': '写春联也太难了吧!', + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': '桥半舫', + 'timestamp': 1612959715, + 'upload_date': '20210210', + 'uploader_id': '244ecb13b0fd7daf92235288c8ca3365', + 'duration': 146.333, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + zvideo = self._download_json( + 'https://www.zhihu.com/api/v4/zvideos/' + video_id, video_id) + title = zvideo['title'] + video = zvideo.get('video') or {} + + formats = [] + for format_id, q in (video.get('playlist') or {}).items(): + play_url = q.get('url') or q.get('play_url') + if not play_url: + continue + formats.append({ + 'asr': int_or_none(q.get('sample_rate')), + 'filesize': int_or_none(q.get('size')), + 'format_id': format_id, + 'fps': int_or_none(q.get('fps')), + 'height': int_or_none(q.get('height')), + 'tbr': float_or_none(q.get('bitrate')), + 'url': play_url, + 'width': int_or_none(q.get('width')), + }) + self._sort_formats(formats) + + author = zvideo.get('author') or {} + url_token = author.get('url_token') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('thumbnail') or zvideo.get('image_url'), + 'uploader': author.get('name'), + 'timestamp': int_or_none(zvideo.get('published_at')), + 'uploader_id': author.get('id'), + 'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None, + 'duration': float_or_none(video.get('duration')), + 'view_count': int_or_none(zvideo.get('play_count')), + 'like_count': int_or_none(zvideo.get('liked_count')), + 'comment_count': int_or_none(zvideo.get('comment_count')), + } From 844e4cbc547f2a2f76053786522bdd6b53bf9ae1 Mon Sep 17 00:00:00 2001 From: Stephen Stair Date: Sun, 16 Aug 2020 17:07:14 -0700 Subject: [PATCH 034/242] [storyfire] Add new extractor(closes #25628)(closes #26349) --- youtube_dl/extractor/extractors.py | 5 + youtube_dl/extractor/storyfire.py | 151 +++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 youtube_dl/extractor/storyfire.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4347f1b74..51f6d38e9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1141,6 +1141,11 @@ from .srgssr import ( from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE diff --git a/youtube_dl/extractor/storyfire.py b/youtube_dl/extractor/storyfire.py new file mode 100644 index 000000000..9c698626f --- /dev/null +++ b/youtube_dl/extractor/storyfire.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools + +from .common import InfoExtractor +from ..utils import ( + # HEADRequest, + int_or_none, + OnDemandPagedList, + smuggle_url, +) + + +class StoryFireBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/' + + def _call_api(self, path, video_id, resource, query=None): + return self._download_json( + 'https://storyfire.com/app/%s/%s' % (path, video_id), video_id, + 'Downloading %s JSON metadata' % resource, query=query) + + def _parse_video(self, video): + title = video['title'] + vimeo_id = self._search_regex( + r'https?://player\.vimeo\.com/external/(\d+)', + video['vimeoVideoURL'], 'vimeo id') + + # video_url = self._request_webpage( + # HEADRequest(video['vimeoVideoURL']), video_id).geturl() + # formats = [] + # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]: + # formats.extend(self._extract_m3u8_formats( + # v_url, video_id, 'mp4', 'm3u8_native', + # m3u8_id='hls' + suffix, fatal=False)) + # formats.extend(self._extract_mpd_formats( + # v_url.replace('.m3u8', '.mpd'), video_id, + # mpd_id='dash' + suffix, fatal=False)) + # self._sort_formats(formats) + + uploader_id = video.get('hostID') + + return { + '_type': 'url_transparent', + 'id': vimeo_id, + 'title': title, + 'description': video.get('description'), + 'url': smuggle_url( + 'https://player.vimeo.com/video/' + vimeo_id, { + 'http_headers': { + 'Referer': 'https://storyfire.com/', + } + }), + # 'formats': formats, + 'thumbnail': video.get('storyImage'), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likesCount')), + 'comment_count': int_or_none(video.get('commentsCount')), + 'duration': int_or_none(video.get('videoDuration')), + 'timestamp': int_or_none(video.get('publishDate')), + 'uploader': video.get('username'), + 'uploader_id': uploader_id, + 'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None, + 'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')), + } + + +class StoryFireIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P[0-9a-f]{24})' + _TEST = { + 'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181', + 'md5': 'caec54b9e4621186d6079c7ec100c1eb', + 'info_dict': { + 'id': '378954662', + 'ext': 'mp4', + 'title': 'Buzzfeed Teaches You About Memes', + 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + 'timestamp': 1576129028, + 'description': 'md5:0b4e28021548e144bed69bb7539e62ea', + 'uploader': 'whang!', + 'upload_date': '20191212', + 'duration': 418, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'] + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._call_api( + 'generic/video-detail', video_id, 'video')['video'] + return self._parse_video(video) + + +class StoryFireUserIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P[^/]+)/video' + _TEST = { + 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video', + 'info_dict': { + 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2', + }, + 'playlist_mincount': 151, + } + _PAGE_SIZE = 20 + + def _fetch_page(self, user_id, page): + videos = self._call_api( + 'publicVideos', user_id, 'page %d' % (page + 1), { + 'skip': page * self._PAGE_SIZE, + })['videos'] + for video in videos: + yield self._parse_video(video) + + def _real_extract(self, url): + user_id = self._match_id(url) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, user_id), self._PAGE_SIZE) + return self.playlist_result(entries, user_id) + + +class StoryFireSeriesIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/', + 'info_dict': { + 'id': '-Lq6MsuIHLODO6d2dDkr', + }, + 'playlist_mincount': 13, + }, { + 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/', + 'info_dict': { + 'id': 'the_mortal_one', + }, + 'playlist_count': 0, + }] + + def _extract_videos(self, stories): + for story in stories.values(): + if story.get('hasVideo'): + yield self._parse_video(story) + + def _real_extract(self, url): + series_id = self._match_id(url) + stories = self._call_api( + 'seriesStories', series_id, 'series stories') + return self.playlist_result(self._extract_videos(stories), series_id) From 646052e416577cc805b7ba169c49158716541570 Mon Sep 17 00:00:00 2001 From: Max Date: Tue, 16 Feb 2021 15:22:51 -0500 Subject: [PATCH 035/242] [postprocessor/embedthumbnail] Recognize atomicparsley binary in lowercase (#28112) --- youtube_dl/postprocessor/embedthumbnail.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 5a3359588..3990908b6 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -89,10 +89,14 @@ class EmbedThumbnailPP(FFmpegPostProcessor): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) elif info['ext'] in ['m4a', 'mp4']: - if not check_executable('AtomicParsley', ['-v']): + atomicparsley = next((x + for x in ['AtomicParsley', 'atomicparsley'] + if check_executable(x, ['-v'])), None) + + if atomicparsley is None: raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.') - cmd = [encodeFilename('AtomicParsley', True), + cmd = [encodeFilename(atomicparsley, True), encodeFilename(filename, True), encodeArgument('--artwork'), encodeFilename(thumbnail_filename, True), From a363fb5d28da7c1b651e6de98b9e799544a4df73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Feb 2021 04:03:54 +0700 Subject: [PATCH 036/242] [yandexmusic:playlist] Request missing tracks in chunks (closes #27355, closes #28184) --- youtube_dl/extractor/yandexmusic.py | 35 +++++++++++++++++++---------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 7893f363e..84969f8e1 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -1,8 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re import hashlib +import itertools +import re from .common import InfoExtractor from ..compat import compat_str @@ -209,17 +210,27 @@ class YandexMusicPlaylistBaseIE(YandexMusicBaseIE): missing_track_ids = [ track_id for track_id in track_ids if track_id not in present_track_ids] - missing_tracks = self._call_api( - 'track-entries', tld, url, item_id, - 'Downloading missing tracks JSON', { - 'entries': ','.join(missing_track_ids), - 'lang': tld, - 'external-domain': 'music.yandex.%s' % tld, - 'overembed': 'false', - 'strict': 'true', - }) - if missing_tracks: - tracks.extend(missing_tracks) + # Request missing tracks in chunks to avoid exceeding max HTTP header size, + # see https://github.com/ytdl-org/youtube-dl/issues/27355 + _TRACKS_PER_CHUNK = 250 + for chunk_num in itertools.count(0): + start = chunk_num * _TRACKS_PER_CHUNK + end = start + _TRACKS_PER_CHUNK + missing_track_ids_req = missing_track_ids[start:end] + assert missing_track_ids_req + missing_tracks = self._call_api( + 'track-entries', tld, url, item_id, + 'Downloading missing tracks JSON chunk %d' % (chunk_num + 1), { + 'entries': ','.join(missing_track_ids_req), + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + 'strict': 'true', + }) + if missing_tracks: + tracks.extend(missing_tracks) + if end >= len(missing_track_ids): + break return tracks From 8980f53b4227bc213048fce52c634830dd25e4bb Mon Sep 17 00:00:00 2001 From: PrinceOfPuppers Date: Tue, 2 Feb 2021 01:46:39 -0500 Subject: [PATCH 037/242] [youtube] Fix uploader extraction in flat playlist mode (#28045) --- youtube_dl/extractor/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7db4503e0..e0b15f859 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -308,7 +308,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'^([\d,]+)', re.sub(r'\s', '', view_count_text), 'view count', default=None)) uploader = try_get( - renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) + renderer, + (lambda x: x['ownerText']['runs'][0]['text'], + lambda x: x['shortBylineText']['runs'][0]['text']), compat_str) return { '_type': 'url_transparent', 'ie_key': YoutubeIE.ie_key(), From 70baa7bfae8890c8274af7f3c7e2a704d300a326 Mon Sep 17 00:00:00 2001 From: PrinceOfPuppers Date: Mon, 15 Feb 2021 14:38:41 -0500 Subject: [PATCH 038/242] [test_youtube_lists] Actualize youtube flat playlist test (closes #28045) --- test/test_youtube_lists.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index c4f0abbea..cf2fdf14f 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -12,6 +12,7 @@ from test.helper import FakeYDL from youtube_dl.extractor import ( YoutubePlaylistIE, + YoutubeTabIE, YoutubeIE, ) @@ -57,14 +58,22 @@ class TestYoutubeLists(unittest.TestCase): entries = result['entries'] self.assertEqual(len(entries), 100) - def test_youtube_flat_playlist_titles(self): + def test_youtube_flat_playlist_extraction(self): dl = FakeYDL() dl.params['extract_flat'] = True - ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PL-KKIb8rvtMSrAO9YFbeM6UQrAqoFTUWv') + ie = YoutubeTabIE(dl) + result = ie.extract('https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc') self.assertIsPlaylist(result) - for entry in result['entries']: - self.assertTrue(entry.get('title')) + entries = list(result['entries']) + self.assertTrue(len(entries) == 1) + video = entries[0] + self.assertEqual(video['_type'], 'url_transparent') + self.assertEqual(video['ie_key'], 'Youtube') + self.assertEqual(video['id'], 'BaW_jenozKc') + self.assertEqual(video['url'], 'BaW_jenozKc') + self.assertEqual(video['title'], 'youtube-dl test video "\'/\\ä↭𝕐') + self.assertEqual(video['duration'], 10) + self.assertEqual(video['uploader'], 'Philipp Hagemeister') if __name__ == '__main__': From e20ec43094c09c41d71cef512c882a9d66163cd2 Mon Sep 17 00:00:00 2001 From: dmsummers Date: Thu, 20 Feb 2020 14:33:05 -0600 Subject: [PATCH 039/242] [simplecast] Add new extractor(closes #24107) --- youtube_dl/extractor/extractors.py | 5 + youtube_dl/extractor/generic.py | 16 +++ youtube_dl/extractor/simplecast.py | 160 +++++++++++++++++++++++++++++ 3 files changed, 181 insertions(+) create mode 100644 youtube_dl/extractor/simplecast.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 51f6d38e9..60c032c7d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1057,6 +1057,11 @@ from .shared import ( VivoIE, ) from .showroomlive import ShowRoomLiveIE +from .simplecast import ( + SimplecastIE, + SimplecastEpisodeIE, + SimplecastPodcastIE, +) from .sina import SinaIE from .sixplay import SixPlayIE from .skyit import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 09e680c96..c2b1b3bdf 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -129,6 +129,7 @@ from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE from .arcpublishing import ArcPublishingIE from .medialaan import MedialaanIE +from .simplecast import SimplecastIE class GenericIE(InfoExtractor): @@ -2238,6 +2239,15 @@ class GenericIE(InfoExtractor): 'duration': 159, }, }, + { + # Simplecast player embed + 'url': 'https://www.bio.org/podcast', + 'info_dict': { + 'id': 'podcast', + 'title': 'I AM BIO Podcast | BIO', + }, + 'playlist_mincount': 52, + }, ] def report_following_redirect(self, new_url): @@ -2792,6 +2802,12 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') + # Look for Simplecast embeds + simplecast_urls = SimplecastIE._extract_urls(webpage) + if simplecast_urls: + return self.playlist_from_matches( + simplecast_urls, video_id, video_title) + # Look for BBC iPlayer embed matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) if matches: diff --git a/youtube_dl/extractor/simplecast.py b/youtube_dl/extractor/simplecast.py new file mode 100644 index 000000000..2d0b3c06d --- /dev/null +++ b/youtube_dl/extractor/simplecast.py @@ -0,0 +1,160 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + strip_or_none, + try_get, + urlencode_postdata, +) + + +class SimplecastBaseIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _API_BASE = 'https://api.simplecast.com/' + + def _call_api(self, path_tmpl, video_id): + return self._download_json( + self._API_BASE + path_tmpl % video_id, video_id) + + def _call_search_api(self, resource, resource_id, resource_url): + return self._download_json( + 'https://api.simplecast.com/%ss/search' % resource, resource_id, + data=urlencode_postdata({'url': resource_url})) + + def _parse_episode(self, episode): + episode_id = episode['id'] + title = episode['title'].strip() + audio_file = episode.get('audio_file') or {} + audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url'] + + season = episode.get('season') or {} + season_href = season.get('href') + season_id = None + if season_href: + season_id = self._search_regex( + r'https?://api.simplecast.com/seasons/(%s)' % self._UUID_REGEX, + season_href, 'season id', default=None) + + webpage_url = episode.get('episode_url') + channel_url = None + if webpage_url: + channel_url = self._search_regex( + r'(https?://[^/]+\.simplecast\.com)', + webpage_url, 'channel url', default=None) + + return { + 'id': episode_id, + 'display_id': episode.get('slug'), + 'title': title, + 'url': clean_podcast_url(audio_file_url), + 'webpage_url': webpage_url, + 'channel_url': channel_url, + 'series': try_get(episode, lambda x: x['podcast']['title']), + 'season_number': int_or_none(season.get('number')), + 'season_id': season_id, + 'thumbnail': episode.get('image_url'), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode.get('number')), + 'description': strip_or_none(episode.get('description')), + 'timestamp': parse_iso8601(episode.get('published_at')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')), + } + + +class SimplecastIE(SimplecastBaseIE): + IE_NAME = 'simplecast' + _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P%s)' % SimplecastBaseIE._UUID_REGEX + _COMMON_TEST_INFO = { + 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays', + 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'ext': 'mp3', + 'title': 'Errant Signal - Chris Franklin & New Wave Video Essays', + 'episode_number': 1, + 'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'description': 'md5:34752789d3d2702e2d2c975fbd14f357', + 'season_number': 1, + 'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13', + 'series': 'The RE:BIND.io Podcast', + 'duration': 5343, + 'timestamp': 1580979475, + 'upload_date': '20200206', + 'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$', + } + _TESTS = [{ + 'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': _COMMON_TEST_INFO, + }, { + 'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'''(?x)]+src=["\'] + ( + https?://(?:embed\.simplecast\.com/[0-9a-f]{8}| + player\.simplecast\.com/%s + ))''' % SimplecastBaseIE._UUID_REGEX, webpage) + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api('episodes/%s', episode_id) + return self._parse_episode(episode) + + +class SimplecastEpisodeIE(SimplecastBaseIE): + IE_NAME = 'simplecast:episode' + _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': SimplecastIE._COMMON_TEST_INFO, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + episode = self._call_search_api( + 'episode', mobj.group(1), mobj.group(0)) + return self._parse_episode(episode) + + +class SimplecastPodcastIE(SimplecastBaseIE): + IE_NAME = 'simplecast:podcast' + _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)' + _TESTS = [{ + 'url': 'https://the-re-bind-io-podcast.simplecast.com', + 'playlist_mincount': 33, + 'info_dict': { + 'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c', + 'title': 'The RE:BIND.io Podcast', + }, + }, { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes', + 'only_matching': True, + }] + + def _real_extract(self, url): + subdomain = self._match_id(url) + site = self._call_search_api('site', subdomain, url) + podcast = site['podcast'] + podcast_id = podcast['id'] + podcast_title = podcast.get('title') + + def entries(): + episodes = self._call_api('podcasts/%s/episodes', podcast_id) + for episode in (episodes.get('collection') or []): + info = self._parse_episode(episode) + info['series'] = podcast_title + yield info + + return self.playlist_result(entries(), podcast_id, podcast_title) From a7356dffe90ed68958d839da073f1321f87a4feb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 17 Feb 2021 18:33:33 +0100 Subject: [PATCH 040/242] [dplay] Add support for discoveryplus.com (closes #24698) --- youtube_dl/extractor/dplay.py | 123 ++++++++++++++++++++++------- youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 99 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 47501dbe6..540505719 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -151,56 +152,79 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] + def _process_errors(self, e, geo_countries): + info = self._parse_json(e.cause.read().decode('utf-8'), None) + error = info['errors'][0] + error_code = error.get('code') + if error_code == 'access.denied.geoblocked': + self.raise_geo_restricted(countries=geo_countries) + elif error_code in ('access.denied.missingpackage', 'invalid.token'): + raise ExtractorError( + 'This video is only available for registered users. You may want to use --cookies.', expected=True) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['Authorization'] = 'Bearer ' + self._download_json( + disco_base + 'token', display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + + def _download_video_playback_info(self, disco_base, video_id, headers): + streaming = self._download_json( + disco_base + 'playback/videoPlaybackInfo/' + video_id, + video_id, headers=headers)['data']['attributes']['streaming'] + streaming_list = [] + for format_id, format_dict in streaming.items(): + streaming_list.append({ + 'type': format_id, + 'url': format_dict.get('url'), + }) + return streaming_list + def _get_disco_api_info(self, url, display_id, disco_host, realm, country): geo_countries = [country.upper()] self._initialize_geo_bypass({ 'countries': geo_countries, }) disco_base = 'https://%s/' % disco_host - token = self._download_json( - disco_base + 'token', display_id, 'Downloading token', - query={ - 'realm': realm, - })['data']['attributes']['token'] headers = { 'Referer': url, - 'Authorization': 'Bearer ' + token, } - video = self._download_json( - disco_base + 'content/videos/' + display_id, display_id, - headers=headers, query={ - 'fields[channel]': 'name', - 'fields[image]': 'height,src,width', - 'fields[show]': 'name', - 'fields[tag]': 'name', - 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', - 'include': 'images,primaryChannel,show,tags' - }) + self._update_disco_api_headers(headers, disco_base, display_id, realm) + try: + video = self._download_json( + disco_base + 'content/videos/' + display_id, display_id, + headers=headers, query={ + 'fields[channel]': 'name', + 'fields[image]': 'height,src,width', + 'fields[show]': 'name', + 'fields[tag]': 'name', + 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', + 'include': 'images,primaryChannel,show,tags' + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + self._process_errors(e, geo_countries) + raise video_id = video['data']['id'] info = video['data']['attributes'] title = info['name'].strip() formats = [] try: - streaming = self._download_json( - disco_base + 'playback/videoPlaybackInfo/' + video_id, - display_id, headers=headers)['data']['attributes']['streaming'] + streaming = self._download_video_playback_info( + disco_base, video_id, headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - info = self._parse_json(e.cause.read().decode('utf-8'), display_id) - error = info['errors'][0] - error_code = error.get('code') - if error_code == 'access.denied.geoblocked': - self.raise_geo_restricted(countries=geo_countries) - elif error_code == 'access.denied.missingpackage': - self.raise_login_required() - raise ExtractorError(info['errors'][0]['detail'], expected=True) + self._process_errors(e, geo_countries) raise - for format_id, format_dict in streaming.items(): + for format_dict in streaming: if not isinstance(format_dict, dict): continue format_url = format_dict.get('url') if not format_url: continue + format_id = format_dict.get('type') ext = determine_ext(format_url) if format_id == 'dash' or ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -268,3 +292,46 @@ class DPlayIE(InfoExtractor): host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( url, display_id, host, 'dplay' + country, country) + + +class DiscoveryPlusIE(DPlayIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video/(?P[^/]+/[^/]+)' + _TESTS = [{ + 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', + 'info_dict': { + 'id': '1140794', + 'display_id': 'property-brothers-forever-home/food-and-family', + 'ext': 'mp4', + 'title': 'Food and Family', + 'description': 'The brothers help a Richmond family expand their single-level home.', + 'duration': 2583.113, + 'timestamp': 1609304400, + 'upload_date': '20201230', + 'creator': 'HGTV', + 'series': 'Property Brothers: Forever Home', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }] + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0' + + def _download_video_playback_info(self, disco_base, video_id, headers): + return self._download_json( + disco_base + 'playback/v3/videoPlaybackInfo', + video_id, headers=headers, data=json.dumps({ + 'deviceInfo': { + 'adBlocker': False, + }, + 'videoId': video_id, + 'wisteriaProperties': { + 'platform': 'desktop', + }, + }).encode('utf-8'))['data']['attributes']['streaming'] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 60c032c7d..acf8cf73b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -288,7 +288,10 @@ from .douyutv import ( DouyuShowIE, DouyuTVIE, ) -from .dplay import DPlayIE +from .dplay import ( + DPlayIE, + DiscoveryPlusIE, +) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE From 3997efb65ef16dbd8c4792e79e797cbcab0fbec1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 17 Feb 2021 19:50:04 +0100 Subject: [PATCH 041/242] [dplay] add support for de.hgtv.com (closes #28182) --- youtube_dl/extractor/dplay.py | 37 +++++++++++++++++++++++++++--- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 540505719..0f0632f26 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -11,11 +11,13 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + strip_or_none, unified_timestamp, ) class DPlayIE(InfoExtractor): + _PATH_REGEX = r'/(?P[^/]+/[^/?#]+)' _VALID_URL = r'''(?x)https?:// (?P (?:www\.)?(?Pd @@ -25,7 +27,7 @@ class DPlayIE(InfoExtractor): ) )| (?Pes|it)\.dplay\.com - )/[^/]+/(?P[^/]+/[^/?#]+)''' + )/[^/]+''' + _PATH_REGEX _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL @@ -272,7 +274,7 @@ class DPlayIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'title': title, - 'description': info.get('description'), + 'description': strip_or_none(info.get('description')), 'duration': float_or_none(info.get('videoDuration'), 1000), 'timestamp': unified_timestamp(info.get('publishStart')), 'series': series, @@ -295,7 +297,7 @@ class DPlayIE(InfoExtractor): class DiscoveryPlusIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video/(?P[^/]+/[^/]+)' + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', 'info_dict': { @@ -335,3 +337,32 @@ class DiscoveryPlusIE(DPlayIE): display_id = self._match_id(url) return self._get_disco_api_info( url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us') + + +class HGTVDeIE(DPlayIE): + _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/', + 'info_dict': { + 'id': '151205', + 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette', + 'ext': 'mp4', + 'title': 'Wer braucht schon eine Toilette', + 'description': 'md5:05b40a27e7aed2c9172de34d459134e2', + 'duration': 1177.024, + 'timestamp': 1595705400, + 'upload_date': '20200725', + 'creator': 'HGTV', + 'series': 'Tiny House - klein, aber oho', + 'season_number': 3, + 'episode_number': 3, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index acf8cf73b..62819ddcf 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -291,6 +291,7 @@ from .douyutv import ( from .dplay import ( DPlayIE, DiscoveryPlusIE, + HGTVDeIE, ) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE From 08c2fbb8443de3949af205d08015e5c6048d2e86 Mon Sep 17 00:00:00 2001 From: bopol Date: Wed, 17 Feb 2021 22:29:32 +0100 Subject: [PATCH 042/242] [youtube] Add support for redirect.invidious.io (#28193) Co-authored-by: Sergey M --- youtube_dl/extractor/youtube.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e0b15f859..f9e554ca9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -335,8 +335,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| - # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances + # invidious-redirect websites + (?:www\.)?redirect\.invidious\.io/| (?:(?:www|dev)\.)?invidio\.us/| + # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md (?:(?:www|no)\.)?invidiou\.sh/| (?:(?:www|fi)\.)?invidious\.snopyta\.org/| (?:www\.)?invidious\.kabi\.tk/| @@ -906,6 +908,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://invidio.us/watch?v=BaW_jenozKc', 'only_matching': True, }, + { + 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc', + 'only_matching': True, + }, + { + # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m + 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA', + 'only_matching': True, + }, { # DRM protected 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', From 9fc5eafb8e384453a49f7cfe73147be491f0b19d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 18 Feb 2021 04:59:56 +0700 Subject: [PATCH 043/242] [youtube] Improve _VALID_URL (refs #28193) --- youtube_dl/extractor/youtube.py | 99 ++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f9e554ca9..ff32758df 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -326,54 +326,57 @@ class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' + _INVIDIOUS_SITES = ( + # invidious-redirect websites + r'(?:www\.)?redirect\.invidious\.io', + r'(?:(?:www|dev)\.)?invidio\.us', + # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md + r'(?:(?:www|no)\.)?invidiou\.sh', + r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', + r'(?:www\.)?invidious\.kabi\.tk', + r'(?:www\.)?invidious\.13ad\.de', + r'(?:www\.)?invidious\.mastodon\.host', + r'(?:www\.)?invidious\.zapashcanon\.fr', + r'(?:www\.)?invidious\.kavin\.rocks', + r'(?:www\.)?invidious\.tube', + r'(?:www\.)?invidiou\.site', + r'(?:www\.)?invidious\.site', + r'(?:www\.)?invidious\.xyz', + r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?tube\.poal\.co', + r'(?:www\.)?tube\.connect\.cafe', + r'(?:www\.)?vid\.wxzm\.sx', + r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?yewtu\.be', + r'(?:www\.)?yt\.elukerio\.org', + r'(?:www\.)?yt\.lelux\.fi', + r'(?:www\.)?invidious\.ggc-project\.de', + r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?invidious\.13ad\.de', + r'(?:www\.)?invidious\.toot\.koeln', + r'(?:www\.)?invidious\.fdn\.fr', + r'(?:www\.)?watch\.nettohikari\.com', + r'(?:www\.)?kgg2m7yk5aybusll\.onion', + r'(?:www\.)?qklhadlycap4cnod\.onion', + r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', + r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', + r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', + r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', + r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', + r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', + ) _VALID_URL = r"""(?x)^ ( (?:https?://|//) # http(s):// or protocol-independent URL - (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/| - (?:www\.)?deturl\.com/www\.youtube\.com/| - (?:www\.)?pwnyoutube\.com/| - (?:www\.)?hooktube\.com/| - (?:www\.)?yourepeat\.com/| - tube\.majestyc\.net/| - # invidious-redirect websites - (?:www\.)?redirect\.invidious\.io/| - (?:(?:www|dev)\.)?invidio\.us/| - # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md - (?:(?:www|no)\.)?invidiou\.sh/| - (?:(?:www|fi)\.)?invidious\.snopyta\.org/| - (?:www\.)?invidious\.kabi\.tk/| - (?:www\.)?invidious\.13ad\.de/| - (?:www\.)?invidious\.mastodon\.host/| - (?:www\.)?invidious\.zapashcanon\.fr/| - (?:www\.)?invidious\.kavin\.rocks/| - (?:www\.)?invidious\.tube/| - (?:www\.)?invidiou\.site/| - (?:www\.)?invidious\.site/| - (?:www\.)?invidious\.xyz/| - (?:www\.)?invidious\.nixnet\.xyz/| - (?:www\.)?invidious\.drycat\.fr/| - (?:www\.)?tube\.poal\.co/| - (?:www\.)?tube\.connect\.cafe/| - (?:www\.)?vid\.wxzm\.sx/| - (?:www\.)?vid\.mint\.lgbt/| - (?:www\.)?yewtu\.be/| - (?:www\.)?yt\.elukerio\.org/| - (?:www\.)?yt\.lelux\.fi/| - (?:www\.)?invidious\.ggc-project\.de/| - (?:www\.)?yt\.maisputain\.ovh/| - (?:www\.)?invidious\.13ad\.de/| - (?:www\.)?invidious\.toot\.koeln/| - (?:www\.)?invidious\.fdn\.fr/| - (?:www\.)?watch\.nettohikari\.com/| - (?:www\.)?kgg2m7yk5aybusll\.onion/| - (?:www\.)?qklhadlycap4cnod\.onion/| - (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| - (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/| - (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| - (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| - (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/| - (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/| - youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains + (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com| + (?:www\.)?deturl\.com/www\.youtube\.com| + (?:www\.)?pwnyoutube\.com| + (?:www\.)?hooktube\.com| + (?:www\.)?yourepeat\.com| + tube\.majestyc\.net| + %(invidious)s| + youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ @@ -388,6 +391,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): youtu\.be| # just youtu.be/xxxx vid\.plus| # or vid.plus/xxxx zwearz\.com/watch| # or zwearz.com/watch/xxxx + %(invidious)s )/ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) @@ -400,7 +404,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) ) (?(1).+)? # if we found the ID, everything can follow - $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + $""" % { + 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, + 'invidious': '|'.join(_INVIDIOUS_SITES), + } _PLAYER_INFO_RE = ( r'/s/player/(?P[a-zA-Z0-9_-]{8,})/player', r'/(?P[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', From 40edffae3d9f86ca696dda6c8a4c9c0497cb6d76 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 19 Feb 2021 11:55:14 +0100 Subject: [PATCH 044/242] [ninegag] unscape title(#28201) --- youtube_dl/extractor/ninegag.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index 440f865bc..14390823b 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -2,10 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError, + determine_ext, int_or_none, try_get, + unescapeHTML, url_or_none, ) @@ -14,7 +15,7 @@ class NineGagIE(InfoExtractor): IE_NAME = '9gag' _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[^/?&#]+)' - _TEST = { + _TESTS = [{ 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { 'id': 'ae5Ag7B', @@ -29,7 +30,11 @@ class NineGagIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, } - } + }, { + # HTML escaped title + 'url': 'https://9gag.com/gag/av5nvyb', + 'only_matching': True, + }] def _real_extract(self, url): post_id = self._match_id(url) @@ -43,7 +48,7 @@ class NineGagIE(InfoExtractor): 'The given url does not contain a video', expected=True) - title = post['title'] + title = unescapeHTML(post['title']) duration = None formats = [] From b92bb0e02a09930cad3c4f6a406eb503c941af61 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 19 Feb 2021 16:00:22 +0100 Subject: [PATCH 045/242] [viki] improve extraction(closes #26522)(closes #28203) - extract uploader_url and episode_number - report login required error - extract 480p formats - fix API v4 calls --- youtube_dl/extractor/viki.py | 69 +++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index a311f21ef..2e9cbf148 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -21,6 +21,7 @@ from ..utils import ( parse_iso8601, sanitized_Request, std_headers, + try_get, ) @@ -30,7 +31,7 @@ class VikiBaseIE(InfoExtractor): _API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s' _APP = '100005a' - _APP_VERSION = '2.2.5.1428709186' + _APP_VERSION = '6.0.0' _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' _GEO_BYPASS = False @@ -41,7 +42,7 @@ class VikiBaseIE(InfoExtractor): _ERRORS = { 'geo': 'Sorry, this content is not available in your region.', 'upcoming': 'Sorry, this content is not yet available.', - # 'paywall': 'paywall', + 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', } def _prepare_call(self, path, timestamp=None, post_data=None): @@ -62,7 +63,8 @@ class VikiBaseIE(InfoExtractor): def _call_api(self, path, video_id, note, timestamp=None, post_data=None): resp = self._download_json( - self._prepare_call(path, timestamp, post_data), video_id, note) + self._prepare_call(path, timestamp, post_data), video_id, note, + headers={'x-viki-app-ver': self._APP_VERSION}) error = resp.get('error') if error: @@ -82,11 +84,13 @@ class VikiBaseIE(InfoExtractor): expected=True) def _check_errors(self, data): - for reason, status in data.get('blocking', {}).items(): + for reason, status in (data.get('blocking') or {}).items(): if status and reason in self._ERRORS: message = self._ERRORS[reason] if reason == 'geo': self.raise_geo_restricted(msg=message) + elif reason == 'paywall': + self.raise_login_required(message) raise ExtractorError('%s said: %s' % ( self.IE_NAME, message), expected=True) @@ -131,13 +135,19 @@ class VikiIE(VikiBaseIE): 'info_dict': { 'id': '1023585v', 'ext': 'mp4', - 'title': 'Heirs Episode 14', - 'uploader': 'SBS', - 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e', + 'title': 'Heirs - Episode 14', + 'uploader': 'SBS Contents Hub', + 'timestamp': 1385047627, 'upload_date': '20131121', 'age_limit': 13, + 'duration': 3570, + 'episode_number': 14, + }, + 'params': { + 'format': 'bestvideo', }, 'skip': 'Blocked in the US', + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # clip 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', @@ -153,7 +163,8 @@ class VikiIE(VikiBaseIE): 'uploader': 'Arirang TV', 'like_count': int, 'age_limit': 0, - } + }, + 'skip': 'Sorry. There was an error loading this video', }, { 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi', 'info_dict': { @@ -171,7 +182,7 @@ class VikiIE(VikiBaseIE): }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '94e0e34fd58f169f40c184f232356cfe', + 'md5': '0a53dc252e6e690feccd756861495a8c', 'info_dict': { 'id': '44699v', 'ext': 'mp4', @@ -183,6 +194,10 @@ class VikiIE(VikiBaseIE): 'uploader': 'group8', 'like_count': int, 'age_limit': 13, + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', }, 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { @@ -209,7 +224,7 @@ class VikiIE(VikiBaseIE): }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': 'adf9e321a0ae5d0aace349efaaff7691', + 'md5': '41faaba0de90483fb4848952af7c7d0d', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -220,6 +235,10 @@ class VikiIE(VikiBaseIE): 'title': 'Love In Magic', 'age_limit': 13, }, + 'params': { + 'format': 'bestvideo', + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }] def _real_extract(self, url): @@ -229,36 +248,33 @@ class VikiIE(VikiBaseIE): 'https://www.viki.com/api/videos/' + video_id, video_id, 'Downloading video JSON', headers={ 'x-client-user-agent': std_headers['User-Agent'], - 'x-viki-app-ver': '4.0.57', + 'x-viki-app-ver': '3.0.0', }) video = resp['video'] self._check_errors(video) title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) + episode_number = int_or_none(video.get('number')) if not title: - title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id - container_titles = video.get('container', {}).get('titles', {}) + title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id + container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} container_title = self.dict_selection(container_titles, 'en') title = '%s - %s' % (container_title, title) description = self.dict_selection(video.get('descriptions', {}), 'en') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('created_at')) - uploader = video.get('author') - like_count = int_or_none(video.get('likes', {}).get('count')) - age_limit = parse_age_limit(video.get('rating')) + like_count = int_or_none(try_get(video, lambda x: x['likes']['count'])) thumbnails = [] - for thumbnail_id, thumbnail in video.get('images', {}).items(): + for thumbnail_id, thumbnail in (video.get('images') or {}).items(): thumbnails.append({ 'id': thumbnail_id, 'url': thumbnail.get('url'), }) subtitles = {} - for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): + for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items(): subtitles[subtitle_lang] = [{ 'ext': subtitles_format, 'url': self._prepare_call( @@ -269,13 +285,15 @@ class VikiIE(VikiBaseIE): 'id': video_id, 'title': title, 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': uploader, + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader': video.get('author'), + 'uploader_url': video.get('author_url'), 'like_count': like_count, - 'age_limit': age_limit, + 'age_limit': parse_age_limit(video.get('rating')), 'thumbnails': thumbnails, 'subtitles': subtitles, + 'episode_number': episode_number, } formats = [] @@ -360,7 +378,7 @@ class VikiChannelIE(VikiBaseIE): 'info_dict': { 'id': '50c', 'title': 'Boys Over Flowers', - 'description': 'md5:ecd3cff47967fe193cff37c0bec52790', + 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', }, 'playlist_mincount': 71, }, { @@ -371,6 +389,7 @@ class VikiChannelIE(VikiBaseIE): 'description': 'md5:05bf5471385aa8b21c18ad450e350525', }, 'playlist_count': 127, + 'skip': 'Page not found', }, { 'url': 'http://www.viki.com/news/24569c-showbiz-korea', 'only_matching': True, From cf2dbec6301177a1fddf72862de05fa912d9869d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 19 Feb 2021 21:13:56 +0100 Subject: [PATCH 046/242] [vimeo] add support for unlisted video source format extraction --- youtube_dl/extractor/vimeo.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 15cd06268..bd2663fe0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -226,10 +226,12 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'is_live': is_live, } - def _extract_original_format(self, url, video_id): + def _extract_original_format(self, url, video_id, unlisted_hash=None): + query = {'action': 'load_download_config'} + if unlisted_hash: + query['unlisted_hash'] = unlisted_hash download_data = self._download_json( - url, video_id, fatal=False, - query={'action': 'load_download_config'}, + url, video_id, fatal=False, query=query, headers={'X-Requested-With': 'XMLHttpRequest'}) if download_data: source_file = download_data.get('source_file') @@ -509,6 +511,11 @@ class VimeoIE(VimeoBaseInfoExtractor): { 'url': 'https://vimeo.com/160743502/abd0e13fb4', 'only_matching': True, + }, + { + # requires passing unlisted_hash(a52724358e) to load_download_config request + 'url': 'https://vimeo.com/392479337/a52724358e', + 'only_matching': True, } # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header @@ -673,7 +680,8 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password(redirect_url, video_id, headers) - vod = config.get('video', {}).get('vod', {}) + video = config.get('video') or {} + vod = video.get('vod') or {} def is_rented(): if '>You rented this title.<' in webpage: @@ -733,7 +741,7 @@ class VimeoIE(VimeoBaseInfoExtractor): formats = [] source_format = self._extract_original_format( - 'https://vimeo.com/' + video_id, video_id) + 'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash')) if source_format: formats.append(source_format) From 21e872b19ada61337770160a124c4387d6c77e08 Mon Sep 17 00:00:00 2001 From: Isaac-the-Man Date: Sun, 10 Jan 2021 10:37:54 -0500 Subject: [PATCH 047/242] [samplefocus] Add new extractor(closes #27763) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/samplefocus.py | 100 ++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 youtube_dl/extractor/samplefocus.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 62819ddcf..1a39c25c5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1029,6 +1029,7 @@ from .safari import ( SafariApiIE, SafariCourseIE, ) +from .samplefocus import SampleFocusIE from .sapo import SapoIE from .savefrom import SaveFromIE from .sbs import SBSIE diff --git a/youtube_dl/extractor/samplefocus.py b/youtube_dl/extractor/samplefocus.py new file mode 100644 index 000000000..806c3c354 --- /dev/null +++ b/youtube_dl/extractor/samplefocus.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + get_element_by_attribute, + int_or_none, +) + + +class SampleFocusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?samplefocus\.com/samples/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://samplefocus.com/samples/lil-peep-sad-emo-guitar', + 'md5': '48c8d62d60be467293912e0e619a5120', + 'info_dict': { + 'id': '40316', + 'display_id': 'lil-peep-sad-emo-guitar', + 'ext': 'mp3', + 'title': 'Lil Peep Sad Emo Guitar', + 'thumbnail': r're:^https?://.+\.png', + 'license': 'Standard License', + 'uploader': 'CapsCtrl', + 'uploader_id': 'capsctrl', + 'like_count': int, + 'comment_count': int, + 'categories': ['Samples', 'Guitar', 'Electric guitar'], + }, + }, { + 'url': 'https://samplefocus.com/samples/dababy-style-bass-808', + 'only_matching': True + }, { + 'url': 'https://samplefocus.com/samples/young-chop-kick', + 'only_matching': True + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + sample_id = self._search_regex( + r']+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P\d+)', + webpage, 'sample id', group='id') + + title = self._og_search_title(webpage, fatal=False) or self._html_search_regex( + r'

(.+?)

', webpage, 'title') + + mp3_url = self._search_regex( + r']+id=(["\'])sample_mp3\1[^>]+value=(["\'])(?P(?:(?!\2).)+)', + webpage, 'mp3', fatal=False, group='url') or extract_attributes(self._search_regex( + r']+itemprop=(["\'])contentUrl\1[^>]*>', + webpage, 'mp3 url', group=0))['content'] + + thumbnail = self._og_search_thumbnail(webpage) or self._html_search_regex( + r']+class=(?:["\'])waveform responsive-img[^>]+src=(["\'])(?P(?:(?!\1).)+)', + webpage, 'mp3', fatal=False, group='url') + + comments = [] + for author_id, author, body in re.findall(r'(?s)]+class="comment-author">]+href="/users/([^"]+)">([^"]+).+?]+class="comment-body">([^>]+)

', webpage): + comments.append({ + 'author': author, + 'author_id': author_id, + 'text': body, + }) + + uploader_id = uploader = None + mobj = re.search(r'>By ]+href="/users/([^"]+)"[^>]*>([^<]+)', webpage) + if mobj: + uploader_id, uploader = mobj.groups() + + breadcrumb = get_element_by_attribute('typeof', 'BreadcrumbList', webpage) + categories = [] + if breadcrumb: + for _, name in re.findall(r']+property=(["\'])name\1[^>]*>([^<]+)', breadcrumb): + categories.append(name) + + def extract_count(klass): + return int_or_none(self._html_search_regex( + r']+class=(?:["\'])?%s-count[^>]*>(\d+)' % klass, + webpage, klass, fatal=False)) + + return { + 'id': sample_id, + 'title': title, + 'url': mp3_url, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'license': self._html_search_regex( + r']+href=(["\'])/license\1[^>]*>(?P[^<]+)<', + webpage, 'license', fatal=False, group='license'), + 'uploader_id': uploader_id, + 'like_count': extract_count('sample-%s-favorites' % sample_id), + 'comment_count': extract_count('comments'), + 'comments': comments, + 'categories': categories, + } From 3037ab00c7ddbe4bedaff51420e4ea1e8d0ccccb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Feb 2021 10:24:58 +0100 Subject: [PATCH 048/242] [youtube] fixup m4a_dash formats(closes #28165) --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ff32758df..72d9fbbc6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1603,6 +1603,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Youtube throttles chunks >~10M 'http_chunk_size': 10485760, } + if dct.get('ext'): + dct['container'] = dct['ext'] + '_dash' formats.append(dct) hls_manifest_url = streaming_data.get('hlsManifestUrl') From f90d825a6be852b6a3fa39b0948cc9b94154963e Mon Sep 17 00:00:00 2001 From: SirCipherz Date: Sun, 21 Feb 2021 16:05:33 +0000 Subject: [PATCH 049/242] [peertube] Add support for canard.tube (#28190) --- youtube_dl/extractor/peertube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index c2ca71c71..32ff51653 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -413,7 +413,8 @@ class PeerTubeIE(InfoExtractor): peertube3\.cpy\.re| peertube2\.cpy\.re| videos\.tcit\.fr| - peertube\.cpy\.re + peertube\.cpy\.re| + canard\.tube )''' _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' _API_BASE = 'https://%s/api/v1/videos/%s/%s' From 696183e1333aa8f2f1241e149759edf410f94c79 Mon Sep 17 00:00:00 2001 From: piplongrun Date: Sun, 21 Feb 2021 17:19:37 +0100 Subject: [PATCH 050/242] [youporn] Extract duration (#28019) Co-authored-by: Sergey M --- youtube_dl/extractor/youporn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 534270bac..2b5771828 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -25,6 +25,7 @@ class YouPornIE(InfoExtractor): 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 210, 'uploader': 'Ask Dan And Jennifer', 'upload_date': '20101217', 'average_rating': int, @@ -153,6 +154,8 @@ class YouPornIE(InfoExtractor): thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration', fatal=False)) uploader = self._html_search_regex( r'(?s)]+class=["\']submitByLink["\'][^>]*>(.+?)', @@ -194,6 +197,7 @@ class YouPornIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, 'uploader': uploader, 'upload_date': upload_date, 'average_rating': average_rating, From 919d7646004ad8480016b9dec0f6033759244520 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 21 Feb 2021 23:21:38 +0700 Subject: [PATCH 051/242] [youporn] Skip test --- youtube_dl/extractor/youporn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 2b5771828..33114363d 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -55,6 +55,7 @@ class YouPornIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404', }, { 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'only_matching': True, From 36abc16c3cf050a3368367038d40cce27504c28a Mon Sep 17 00:00:00 2001 From: Adrian Heine Date: Sat, 9 Jan 2021 22:06:24 +0100 Subject: [PATCH 052/242] [apa] Fix extraction --- youtube_dl/extractor/apa.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py index 98ccdaa4a..1dd35dd9c 100644 --- a/youtube_dl/extractor/apa.py +++ b/youtube_dl/extractor/apa.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( determine_ext, js_to_json, - url_or_none, ) @@ -17,14 +16,10 @@ class APAIE(InfoExtractor): 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', 'info_dict': { - 'id': 'jjv85FdZ', + 'id': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'ext': 'mp4', - 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'title': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 254, - 'timestamp': 1519211149, - 'upload_date': '20180221', }, }, { 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', @@ -48,7 +43,7 @@ class APAIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage('https://uvp.apa.at/player/%s' % video_id, video_id) jwplatform_id = self._search_regex( r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, @@ -59,18 +54,12 @@ class APAIE(InfoExtractor): 'jwplatform:' + jwplatform_id, ie='JWPlatform', video_id=video_id) - sources = self._parse_json( - self._search_regex( - r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), - video_id, transform_source=js_to_json) + sources = self._parse_json("{" + self._search_regex( + r'("hls"\s*:\s*"[^"]+"\s*,\s*"progressive"\s*:\s*"[^"]+")', webpage, 'sources') + + "}", video_id, transform_source=js_to_json) formats = [] - for source in sources: - if not isinstance(source, dict): - continue - source_url = url_or_none(source.get('file')) - if not source_url: - continue + for (format, source_url) in sources.items(): ext = determine_ext(source_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -83,7 +72,7 @@ class APAIE(InfoExtractor): self._sort_formats(formats) thumbnail = self._search_regex( - r'image\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + r'"poster"\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'thumbnail', fatal=False, group='url') return { From aa9118a373a6e9cfb9fda24533df86286eccc468 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 22 Feb 2021 02:29:50 +0700 Subject: [PATCH 053/242] [apa] Improve extraction (closes #27750) --- youtube_dl/extractor/apa.py | 38 ++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py index 1dd35dd9c..cbc1c0ecb 100644 --- a/youtube_dl/extractor/apa.py +++ b/youtube_dl/extractor/apa.py @@ -6,12 +6,13 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, - js_to_json, + int_or_none, + url_or_none, ) class APAIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'(?Phttps?://[^/]+\.apa\.at)/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', @@ -41,9 +42,11 @@ class APAIE(InfoExtractor): webpage)] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id, base_url = mobj.group('id', 'base_url') - webpage = self._download_webpage('https://uvp.apa.at/player/%s' % video_id, video_id) + webpage = self._download_webpage( + '%s/player/%s' % (base_url, video_id), video_id) jwplatform_id = self._search_regex( r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, @@ -54,30 +57,39 @@ class APAIE(InfoExtractor): 'jwplatform:' + jwplatform_id, ie='JWPlatform', video_id=video_id) - sources = self._parse_json("{" + self._search_regex( - r'("hls"\s*:\s*"[^"]+"\s*,\s*"progressive"\s*:\s*"[^"]+")', webpage, 'sources') - + "}", video_id, transform_source=js_to_json) + def extract(field, name=None): + return self._search_regex( + r'\b%s["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' % field, + webpage, name or field, default=None, group='value') + + title = extract('title') or video_id + description = extract('description') + thumbnail = extract('poster', 'thumbnail') formats = [] - for (format, source_url) in sources.items(): + for format_id in ('hls', 'progressive'): + source_url = url_or_none(extract(format_id)) + if not source_url: + continue ext = determine_ext(source_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) else: + height = int_or_none(self._search_regex( + r'(\d+)\.mp4', source_url, 'height', default=None)) formats.append({ 'url': source_url, + 'format_id': format_id, + 'height': height, }) self._sort_formats(formats) - thumbnail = self._search_regex( - r'"poster"\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'thumbnail', fatal=False, group='url') - return { 'id': video_id, - 'title': video_id, + 'title': title, + 'description': description, 'thumbnail': thumbnail, 'formats': formats, } From 44b2d5f5fc80a291b093c8bf20e2ad7ac58b3536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 22 Feb 2021 02:40:00 +0700 Subject: [PATCH 054/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/ChangeLog b/ChangeLog index 384bd19c2..69ce51890 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,40 @@ +version + +Core ++ [postprocessor/embedthumbnail] Recognize atomicparsley binary in lowercase + (#28112) + +Extractors +* [apa] Fix and improve extraction (#27750) ++ [youporn] Extract duration (#28019) ++ [peertube] Add support for canard.tube (#28190) +* [youtube] Fixup m4a_dash formats (#28165) ++ [samplefocus] Add support for samplefocus.com (#27763) ++ [vimeo] Add support for unlisted video source format extraction +* [viki] Improve extraction (#26522, #28203) + * Extract uploader URL and episode number + * Report login required error + + Extract 480p formats + * Fix API v4 calls +* [ninegag] Unescape title (#28201) +* [youtube] Improve URL regular expression (#28193) ++ [youtube] Add support for redirect.invidious.io (#28193) ++ [dplay] Add support for de.hgtv.com (#28182) ++ [dplay] Add support for discoveryplus.com (#24698) ++ [simplecast] Add support for simplecast.com (#24107) +* [youtube] Fix uploader extraction in flat playlist mode (#28045) +* [yandexmusic:playlist] Request missing tracks in chunks (#27355, #28184) ++ [storyfire] Add support for storyfire.com (#25628, #26349) ++ [zhihu] Add support for zhihu.com (#28177) +* [youtube] Fix controversial videos when authenticated with cookies (#28174) +* [ccma] Fix timestamp parsing in python 2 ++ [videopress] Add support for video.wordpress.com +* [kakao] Improve info extraction and detect geo restriction (#26577) +* [xboxclips] Fix extraction (#27151) +* [ard] Improve formats extraction (#28155) ++ [canvas] Add support for dagelijksekost.een.be (#28119) + + version 2021.02.10 Extractors From 0a04e03a0245d78593844e7b7930920051b9cc27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 22 Feb 2021 02:42:16 +0700 Subject: [PATCH 055/242] release 2021.02.22 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 11 +++++++++++ youtube_dl/version.py | 2 +- 8 files changed, 25 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index ea0a59dca..60879f0ac 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.02.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.10 + [debug] youtube-dl version 2021.02.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index d24855c72..b38d39ab4 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.02.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 8b96a2883..3235de44b 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.02.22** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index e46971047..a3255623a 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.02.22** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.10 + [debug] youtube-dl version 2021.02.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index a9ca379ca..124b020c3 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.10** +- [ ] I've verified that I'm running youtube-dl version **2021.02.22** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 69ce51890..2912d776c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.02.22 Core + [postprocessor/embedthumbnail] Recognize atomicparsley binary in lowercase diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1373cc4f6..2452c1f7f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -212,6 +212,7 @@ - **curiositystream** - **curiositystream:collection** - **CWTV** + - **DagelijkseKost**: dagelijksekost.een.be - **DailyMail** - **dailymotion** - **dailymotion:playlist** @@ -233,6 +234,7 @@ - **DiscoveryGo** - **DiscoveryGoPlaylist** - **DiscoveryNetworksDe** + - **DiscoveryPlus** - **DiscoveryVR** - **Disney** - **dlive:stream** @@ -353,6 +355,7 @@ - **HentaiStigma** - **hetklokhuis** - **hgtv.com:show** + - **HGTVDe** - **HiDive** - **HistoricFilms** - **history:player** @@ -803,6 +806,7 @@ - **safari:course**: safaribooksonline.com online courses - **SAKTV** - **SaltTV** + - **SampleFocus** - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au @@ -825,6 +829,9 @@ - **ShahidShow** - **Shared**: shared.sx - **ShowRoomLive** + - **simplecast** + - **simplecast:episode** + - **simplecast:podcast** - **Sina** - **sky.it** - **sky:news** @@ -877,6 +884,9 @@ - **Steam** - **Stitcher** - **StitcherShow** + - **StoryFire** + - **StoryFireSeries** + - **StoryFireUser** - **Streamable** - **streamcloud.eu** - **StreamCZ** @@ -1198,5 +1208,6 @@ - **ZattooLive** - **ZDF** - **ZDFChannel** + - **Zhihu** - **zingmp3**: mp3.zing.vn - **Zype** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 79d2be625..f89530293 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.10' +__version__ = '2021.02.22' From 2090dbdc8c51d18760957e248f5ff152209f9236 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Feb 2021 23:08:40 +0100 Subject: [PATCH 056/242] [youtube] fix get_video_info request --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 72d9fbbc6..2496d27f1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1450,7 +1450,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Refetching age-gated info webpage', 'unable to download video info webpage', query={ 'video_id': video_id, - 'eurl': 'https://www.youtube.com/embed/' + video_id, + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, }, fatal=False)), lambda x: x['player_response'][0], compat_str) or '{}', video_id) From 7422a2194fcbc179083c6927a2fcca278fed39c5 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 15 Oct 2020 14:24:17 +0200 Subject: [PATCH 057/242] [gedidigital] Add new extractor(closes #7347)(closes #26946) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/gedidigital.py | 161 ++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 youtube_dl/extractor/gedidigital.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1a39c25c5..dc6a06771 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -421,6 +421,7 @@ from .gamestar import GameStarIE from .gaskrank import GaskrankIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE from .generic import GenericIE from .gfycat import GfycatIE from .giantbomb import GiantBombIE diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py new file mode 100644 index 000000000..1b47a4e27 --- /dev/null +++ b/youtube_dl/extractor/gedidigital.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, +) + + +class GediDigitalIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://video\. + (?: + (?: + (?:espresso\.)?repubblica + |lastampa + |ilsecoloxix + )| + (?: + iltirreno + |messaggeroveneto + |ilpiccolo + |gazzettadimantova + |mattinopadova + |laprovinciapavese + |tribunatreviso + |nuovavenezia + |gazzettadimodena + |lanuovaferrara + |corrierealpi + |lasentinella + )\.gelocal + )\.it(?:/[^/]+){2,3}/(?P\d+)''' + _TESTS = [{ + 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', + 'md5': '84658d7fb9e55a6e57ecc77b73137494', + 'info_dict': { + 'id': '121559', + 'ext': 'mp4', + 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', + 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-full-.+?\.jpg$', + 'duration': 125, + }, + }, { + 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', + 'only_matching': True, + }, { + 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', + 'only_matching': True, + }, { + 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', + 'only_matching': True, + }, { + 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', + 'only_matching': True, + }, { + 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268', + 'only_matching': True, + }, { + 'url': 'https://video.ilpiccolo.gelocal.it/dossier/big-john/dinosauro-big-john-al-via-le-visite-guidate-a-trieste/135226/135751', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimantova.gelocal.it/locale/dal-ponte-visconteo-di-valeggio-l-and-8217sos-dei-ristoratori-aprire-anche-a-cena/137310/137818', + 'only_matching': True, + }, { + 'url': 'https://video.mattinopadova.gelocal.it/dossier/coronavirus-in-veneto/covid-a-vo-un-anno-dopo-un-cuore-tricolore-per-non-dimenticare/138402/138964', + 'only_matching': True, + }, { + 'url': 'https://video.laprovinciapavese.gelocal.it/locale/mede-zona-rossa-via-alle-vaccinazioni-per-gli-over-80/137545/138120', + 'only_matching': True, + }, { + 'url': 'https://video.tribunatreviso.gelocal.it/dossier/coronavirus-in-veneto/ecco-le-prima-vaccinazioni-di-massa-nella-marca/134485/135024', + 'only_matching': True, + }, { + 'url': 'https://video.nuovavenezia.gelocal.it/locale/camion-troppo-alto-per-il-ponte-ferroviario-perde-il-carico/135734/136266', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimodena.gelocal.it/locale/modena-scoperta-la-proteina-che-predice-il-livello-di-gravita-del-covid/139109/139796', + 'only_matching': True, + }, { + 'url': 'https://video.lanuovaferrara.gelocal.it/locale/due-bombole-di-gpl-aperte-e-abbandonate-i-vigili-bruciano-il-gas/134391/134957', + 'only_matching': True, + }, { + 'url': 'https://video.corrierealpi.gelocal.it/dossier/cortina-2021-i-mondiali-di-sci-alpino/mondiali-di-sci-il-timelapse-sulla-splendida-olympia/133760/134331', + 'only_matching': True, + }, { + 'url': 'https://video.lasentinella.gelocal.it/locale/vestigne-centra-un-auto-e-si-ribalta/138931/139466', + 'only_matching': True, + }, { + 'url': 'https://video.espresso.repubblica.it/tutti-i-video/01-ted-villa/14772', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + ['twitter:title', 'og:title'], webpage, fatal=True) + player_data = re.findall( + r"PlayerFactory\.setParam\('(?Pformat|param)',\s*'(?P[^']+)',\s*'(?P[^']+)'\);", + webpage) + + formats = [] + duration = thumb = None + for t, n, v in player_data: + if t == 'format': + if n in ('video-hds-vod-ec', 'video-hls-vod-ec', 'video-viralize', 'video-youtube-pfp'): + continue + elif n.endswith('-vod-ak'): + formats.extend(self._extract_akamai_formats( + v, video_id, {'http': 'media.gedidigital.it'})) + else: + ext = determine_ext(v) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v, video_id, 'mp4', 'm3u8_native', m3u8_id=n, fatal=False)) + continue + f = { + 'format_id': n, + 'url': v, + } + if ext == 'mp3': + abr = int_or_none(self._search_regex( + r'-mp3-audio-(\d+)', v, 'abr', default=None)) + f.update({ + 'abr': abr, + 'tbr': abr, + 'vcodec': 'none' + }) + else: + mobj = re.match(r'^video-rrtv-(\d+)(?:-(\d+))?$', n) + if mobj: + f.update({ + 'height': int(mobj.group(1)), + 'vbr': int_or_none(mobj.group(2)), + }) + if not f.get('vbr'): + f['vbr'] = int_or_none(self._search_regex( + r'-video-rrtv-(\d+)', v, 'abr', default=None)) + formats.append(f) + elif t == 'param': + if n in ['image_full', 'image']: + thumb = v + elif n == 'videoDuration': + duration = int_or_none(v) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta( + ['twitter:description', 'og:description', 'description'], webpage), + 'thumbnail': thumb or self._og_search_thumbnail(webpage), + 'formats': formats, + 'duration': duration, + } From d81421af4b4c3f8f6e197ad4a06fcdb948484c24 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 22 Feb 2021 23:02:15 +0100 Subject: [PATCH 058/242] [gedidigital] improve asset id matching --- youtube_dl/extractor/gedidigital.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py index 1b47a4e27..6c4153b40 100644 --- a/youtube_dl/extractor/gedidigital.py +++ b/youtube_dl/extractor/gedidigital.py @@ -32,7 +32,7 @@ class GediDigitalIE(InfoExtractor): |corrierealpi |lasentinella )\.gelocal - )\.it(?:/[^/]+){2,3}/(?P\d+)''' + )\.it(?:/[^/]+){2,3}?/(?P\d+)(?:[/?&#]|$)''' _TESTS = [{ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', 'md5': '84658d7fb9e55a6e57ecc77b73137494', From 8cb4b71909e720a758a17dd519d198e77884a14a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 23 Feb 2021 18:37:06 +0700 Subject: [PATCH 059/242] [tmz] Fix and improve extraction (closes #24603, closes #24687, closes #28211) --- youtube_dl/extractor/tmz.py | 97 +++++++++++++++++++++++++++++-------- 1 file changed, 76 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py index 419f9d92e..3d1bf75ff 100644 --- a/youtube_dl/extractor/tmz.py +++ b/youtube_dl/extractor/tmz.py @@ -2,55 +2,110 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .jwplatform import JWPlatformIE +from .kaltura import KalturaIE +from ..utils import ( + int_or_none, + unified_timestamp, +) class TMZIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.tmz.com/videos/0_okj015ty/', - 'md5': '4d22a51ef205b6c06395d8394f72d560', + 'url': 'http://www.tmz.com/videos/0-cegprt2p/', + 'md5': '31f9223e20eef55954973359afa61a20', 'info_dict': { - 'id': '0_okj015ty', + 'id': 'P6YjLBLk', 'ext': 'mp4', - 'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!', - 'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie??? Or is she just showing off her amazing boobs?', - 'timestamp': 1394747163, - 'uploader_id': 'batchUser', - 'upload_date': '20140313', - } + 'title': "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet", + 'description': 'md5:b714359fc18607715ebccbd2da8ff488', + 'timestamp': 1467831837, + 'upload_date': '20160706', + }, + 'add_ie': [JWPlatformIE.ie_key()], }, { - 'url': 'http://www.tmz.com/videos/0-cegprt2p/', + 'url': 'http://www.tmz.com/videos/0_okj015ty/', + 'only_matching': True, + }, { + 'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/', + 'only_matching': True, + }, { + 'url': 'https://www.tmz.com/videos/2021-02-19-021921-floyd-mayweather-1043872/', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url).replace('-', '_') - return self.url_result('kaltura:591531:%s' % video_id, 'Kaltura', video_id) + + webpage = self._download_webpage(url, video_id, fatal=False) + if webpage: + tmz_video_id = self._search_regex( + r'nodeRef\s*:\s*["\']tmz:video:([\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12})', + webpage, 'video id', default=None) + video = self._download_json( + 'https://www.tmz.com/_/video/%s' % tmz_video_id, video_id, + fatal=False) + if video: + message = video['message'] + info = { + '_type': 'url_transparent', + 'title': message.get('title'), + 'description': message.get('description'), + 'timestamp': unified_timestamp(message.get('published_at')), + 'duration': int_or_none(message.get('duration')), + } + jwplatform_id = message.get('jwplayer_media_id') + if jwplatform_id: + info.update({ + 'url': 'jwplatform:%s' % jwplatform_id, + 'ie_key': JWPlatformIE.ie_key(), + }) + else: + kaltura_entry_id = message.get('kaltura_entry_id') or video_id + kaltura_partner_id = message.get('kaltura_partner_id') or '591531' + info.update({ + 'url': 'kaltura:%s:%s' % (kaltura_partner_id, kaltura_entry_id), + 'ie_key': KalturaIE.ie_key(), + }) + return info + + return self.url_result( + 'kaltura:591531:%s' % video_id, KalturaIE.ie_key(), video_id) class TMZArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P[^/]+)/?' + _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P[^/?#&]+)' _TEST = { 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', - 'md5': '3316ff838ae5bb7f642537825e1e90d2', 'info_dict': { - 'id': '0_6snoelag', - 'ext': 'mov', + 'id': 'PAKZa97W', + 'ext': 'mp4', 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', - 'timestamp': 1429467813, + 'timestamp': 1429466400, 'upload_date': '20150419', - 'uploader_id': 'batchUser', - } + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [JWPlatformIE.ie_key()], } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + + tmz_url = self._search_regex( + r'clickLink\s*\(\s*["\'](?P%s)' % TMZIE._VALID_URL, webpage, + 'video id', default=None, group='url') + if tmz_url: + return self.url_result(tmz_url, ie=TMZIE.ie_key()) + embedded_video_info = self._parse_json(self._html_search_regex( r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'), video_id) - return self.url_result( - 'http://www.tmz.com/videos/%s/' % embedded_video_info['id']) + 'http://www.tmz.com/videos/%s/' % embedded_video_info['id'], + ie=TMZIE.ie_key()) From 295860ff00c5d8caf94badd4f04671f6a631fcae Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 23 Feb 2021 12:39:46 +0100 Subject: [PATCH 060/242] [tf1] improve extraction(closes #27980)(closes #28040) --- youtube_dl/extractor/tf1.py | 123 +++++++++++++++++------------------- youtube_dl/extractor/wat.py | 91 +++++++++++--------------- 2 files changed, 97 insertions(+), 117 deletions(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 55e2a0721..23c2808a1 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -1,92 +1,87 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor -from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) class TF1IE(InfoExtractor): - """TF1 uses the wat.tv player.""" - _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?tf1\.fr/[^/]+/(?P[^/]+)/videos/(?P[^/?&#]+)\.html' _TESTS = [{ - 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', - 'info_dict': { - 'id': '10635995', - 'ext': 'mp4', - 'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle', - 'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, - 'expected_warnings': ['HTTP Error 404'], - }, { - 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', + 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html', 'info_dict': { - 'id': 'le-grand-mysterioso-chuggington-7085291-739', + 'id': '13641379', 'ext': 'mp4', - 'title': 'Le grand Mystérioso - Chuggington', - 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.', - 'upload_date': '20150103', + 'title': 'md5:f392bc52245dc5ad43771650c96fb620', + 'description': 'md5:a02cdb217141fb2d469d6216339b052f', + 'upload_date': '20190611', + 'timestamp': 1560273989, + 'duration': 1738, + 'series': 'Quotidien avec Yann Barthès', + 'tags': ['intégrale', 'quotidien', 'Replay'], }, 'params': { # Sometimes wat serves the whole file with the --test option 'skip_download': True, + 'format': 'bestvideo', }, - 'skip': 'HTTP Error 410: Gone', }, { 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', 'only_matching': True, - }, { - 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html', - 'only_matching': True, }, { 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', 'only_matching': True, - }, { - 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html', - 'info_dict': { - 'id': '13641379', - 'ext': 'mp4', - 'title': 'md5:f392bc52245dc5ad43771650c96fb620', - 'description': 'md5:44bc54f0a21322f5b91d68e76a544eae', - 'upload_date': '20190611', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, }] def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - wat_id = None + program_slug, slug = re.match(self._VALID_URL, url).groups() + video = self._download_json( + 'https://www.tf1.fr/graphql/web', slug, query={ + 'id': '9b80783950b85247541dd1d851f9cc7fa36574af015621f853ab111a679ce26f', + 'variables': json.dumps({ + 'programSlug': program_slug, + 'slug': slug, + }) + })['data']['videoBySlug'] + wat_id = video['streamId'] - data = self._parse_json( - self._search_regex( - r'__APOLLO_STATE__\s*=\s*({.+?})\s*(?:;|)', webpage, - 'data', default='{}'), video_id, fatal=False) + tags = [] + for tag in (video.get('tags') or []): + label = tag.get('label') + if not label: + continue + tags.append(label) - if data: - try: - wat_id = next( - video.get('streamId') - for key, video in data.items() - if isinstance(video, dict) - and video.get('slug') == video_id) - if not isinstance(wat_id, compat_str) or not wat_id.isdigit(): - wat_id = None - except StopIteration: - pass + decoration = video.get('decoration') or {} - if not wat_id: - wat_id = self._html_search_regex( - (r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P\d{8})\1', - r'(["\']?)streamId\1\s*:\s*(["\']?)(?P\d+)\2'), - webpage, 'wat id', group='id') + thumbnails = [] + for source in (try_get(decoration, lambda x: x['image']['sources'], list) or []): + source_url = source.get('url') + if not source_url: + continue + thumbnails.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + }) - return self.url_result('wat:%s' % wat_id, 'Wat') + return { + '_type': 'url_transparent', + 'id': wat_id, + 'url': 'wat:' + wat_id, + 'title': video.get('title'), + 'thumbnails': thumbnails, + 'description': decoration.get('description'), + 'timestamp': parse_iso8601(video.get('date')), + 'duration': int_or_none(try_get(video, lambda x: x['publicPlayingInfos']['duration'])), + 'tags': tags, + 'series': decoration.get('programLabel'), + 'season_number': int_or_none(video.get('season')), + 'episode_number': int_or_none(video.get('episode')), + } diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index f6940b371..147931d73 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -4,9 +4,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - unified_strdate, - HEADRequest, + ExtractorError, int_or_none, + try_get, + unified_strdate, ) @@ -29,6 +30,7 @@ class WatIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': ['HTTP Error 404'], + 'skip': 'This content is no longer available', }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', @@ -40,8 +42,10 @@ class WatIE(InfoExtractor): 'upload_date': '20140816', }, 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], + 'skip': 'This content is no longer available', }, ] + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) @@ -49,71 +53,52 @@ class WatIE(InfoExtractor): # 'contentv4' is used in the website, but it also returns the related # videos, we don't need them + # video_data = self._download_json( + # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) video_data = self._download_json( - 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, + video_id, query={'context': 'MYTF1'}) video_info = video_data['media'] error_desc = video_info.get('error_desc') if error_desc: - self.report_warning( - '%s returned error: %s' % (self.IE_NAME, error_desc)) - - chapters = video_info['chapters'] - if chapters: - first_chapter = chapters[0] + if video_info.get('error_code') == 'GEOBLOCKED': + self.raise_geo_restricted(error_desc, video_info.get('geoList')) + raise ExtractorError(error_desc) - def video_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] + title = video_info['title'] - if video_id_for_chapter(first_chapter) != video_id: - self.to_screen('Multipart video detected') - entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] - return self.playlist_result(entries, video_id, video_info['title']) - # Otherwise we can continue and extract just one part, we have to use - # the video id for getting the video url - else: - first_chapter = video_info + formats = [] - title = first_chapter['title'] + def extract_formats(manifest_urls): + for f, f_url in manifest_urls.items(): + if not f_url: + continue + if f in ('dash', 'mpd'): + formats.extend(self._extract_mpd_formats( + f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), + video_id, mpd_id='dash', fatal=False)) + elif f == 'hls': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) - def extract_url(path_template, url_type): - req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) - head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type, fatal=False) - if head: - red_url = head.geturl() - if req_url != red_url: - return red_url - return None + delivery = video_data.get('delivery') or {} + extract_formats({delivery.get('format'): delivery.get('url')}) + if not formats: + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False) + if manifest_urls: + extract_formats(manifest_urls) - formats = [] - manifest_urls = self._download_json( - 'http://www.wat.tv/get/webhtml/' + video_id, video_id) - m3u8_url = manifest_urls.get('hls') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - mpd_url = manifest_urls.get('mpd') - if mpd_url: - formats.extend(self._extract_mpd_formats( - mpd_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), - video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) - date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') - upload_date = unified_strdate(date_diffusion) if date_diffusion else None - duration = None - files = video_info['files'] - if files: - duration = int_or_none(files[0].get('duration')) - return { 'id': video_id, 'title': title, - 'thumbnail': first_chapter.get('preview'), - 'description': first_chapter.get('description'), - 'view_count': int_or_none(video_info.get('views')), - 'upload_date': upload_date, - 'duration': duration, + 'thumbnail': video_info.get('preview'), + 'upload_date': unified_strdate(try_get( + video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])), + 'duration': int_or_none(video_info.get('duration')), 'formats': formats, } From 1631fca1ee1c3312027c702854d741bbb8025dcd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 23 Feb 2021 13:50:18 +0100 Subject: [PATCH 061/242] [wat] detect DRM protected videos(closes #27958) --- youtube_dl/extractor/wat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 147931d73..f1bccc2d6 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -64,7 +64,7 @@ class WatIE(InfoExtractor): if error_desc: if video_info.get('error_code') == 'GEOBLOCKED': self.raise_geo_restricted(error_desc, video_info.get('geoList')) - raise ExtractorError(error_desc) + raise ExtractorError(error_desc, expected=True) title = video_info['title'] @@ -86,6 +86,8 @@ class WatIE(InfoExtractor): delivery = video_data.get('delivery') or {} extract_formats({delivery.get('format'): delivery.get('url')}) if not formats: + if delivery.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) manifest_urls = self._download_json( 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False) if manifest_urls: From 44603290e5002153f3ebad6230cc73aef42cc2cd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 24 Feb 2021 18:34:28 +0100 Subject: [PATCH 062/242] [dplay] Extract Ad-Free uplynk URLs(#28160) --- youtube_dl/extractor/dplay.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 0f0632f26..bbb199094 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -330,6 +330,7 @@ class DiscoveryPlusIE(DPlayIE): 'videoId': video_id, 'wisteriaProperties': { 'platform': 'desktop', + 'product': 'dplus_us', }, }).encode('utf-8'))['data']['attributes']['streaming'] From 9662e4964b8d1b8d23c79f90d91b9be87d10029f Mon Sep 17 00:00:00 2001 From: nixxo Date: Wed, 24 Feb 2021 22:17:29 +0100 Subject: [PATCH 063/242] [vvvvid] extract series sublists playlist_title (#27601) (#27618) --- youtube_dl/extractor/vvvvid.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 778ce8b76..d62404cf3 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -247,9 +247,13 @@ class VVVVIDShowIE(VVVVIDIE): show_info = self._download_info( show_id, 'info/', show_title, fatal=False) + if not show_title: + base_url += "/title" + entries = [] for season in (seasons or []): episodes = season.get('episodes') or [] + playlist_title = season.get('name') or show_info.get('title') for episode in episodes: if episode.get('playable') is False: continue @@ -259,12 +263,13 @@ class VVVVIDShowIE(VVVVIDIE): continue info = self._extract_common_video_info(episode) info.update({ - '_type': 'url', + '_type': 'url_transparent', 'ie_key': VVVVIDIE.ie_key(), 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), 'description': episode.get('description'), 'season_id': season_id, + 'playlist_title': playlist_title, }) entries.append(info) From ef28e33249f650b3f8d40c3e62b9df2c6103b360 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 24 Feb 2021 22:29:35 +0100 Subject: [PATCH 064/242] [vvvvid] reduce season request payload size --- youtube_dl/extractor/vvvvid.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index d62404cf3..7c94c4ee2 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -75,12 +75,15 @@ class VVVVIDIE(InfoExtractor): 'https://www.vvvvid.it/user/login', None, headers=self.geo_verification_headers())['data']['conn_id'] - def _download_info(self, show_id, path, video_id, fatal=True): + def _download_info(self, show_id, path, video_id, fatal=True, query=None): + q = { + 'conn_id': self._conn_id, + } + if query: + q.update(query) response = self._download_json( 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), - video_id, headers=self.geo_verification_headers(), query={ - 'conn_id': self._conn_id, - }, fatal=fatal) + video_id, headers=self.geo_verification_headers(), query=q, fatal=fatal) if not (response or fatal): return if response.get('result') == 'error': @@ -98,7 +101,8 @@ class VVVVIDIE(InfoExtractor): show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() response = self._download_info( - show_id, 'season/%s' % season_id, video_id) + show_id, 'season/%s' % season_id, + video_id, query={'video_id': video_id}) vid = int(video_id) video_data = list(filter( From 3c58f9e0b9d8471212406e012727374db084932b Mon Sep 17 00:00:00 2001 From: Alexander Seiler Date: Sat, 11 Nov 2017 19:30:10 +0100 Subject: [PATCH 065/242] [srgssr] improve extraction - extract subtitle - fix extraction for new videos - update srf download domains closes #14717 closes #14725 closes #27231 closes #28238 --- youtube_dl/extractor/rts.py | 15 ++- youtube_dl/extractor/srgssr.py | 208 +++++++++++++++++++++------------ 2 files changed, 144 insertions(+), 79 deletions(-) diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index 48f17b828..aed35f8a9 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -6,11 +6,12 @@ import re from .srgssr import SRGSSRIE from ..compat import compat_str from ..utils import ( + determine_ext, int_or_none, parse_duration, parse_iso8601, unescapeHTML, - determine_ext, + urljoin, ) @@ -21,7 +22,7 @@ class RTSIE(SRGSSRIE): _TESTS = [ { 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', - 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e', + 'md5': '753b877968ad8afaeddccc374d4256a5', 'info_dict': { 'id': '3449373', 'display_id': 'les-enfants-terribles', @@ -35,6 +36,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', @@ -63,11 +65,12 @@ class RTSIE(SRGSSRIE): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], 'skip': 'Blocked outside Switzerland', }, { 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', - 'md5': '1bae984fe7b1f78e94abc74e802ed99f', + 'md5': '9bb06503773c07ce83d3cbd793cebb91', 'info_dict': { 'id': '5745356', 'display_id': 'londres-cachee-par-un-epais-smog', @@ -81,6 +84,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', @@ -160,7 +164,7 @@ class RTSIE(SRGSSRIE): media_type = 'video' if 'video' in all_info else 'audio' # check for errors - self.get_media_data('rts', media_type, media_id) + self._get_media_data('rts', media_type, media_id) info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] @@ -194,6 +198,7 @@ class RTSIE(SRGSSRIE): 'tbr': extract_bitrate(format_url), }) + download_base = 'http://rtsww%s-d.rts.ch/' % ('-a' if media_type == 'audio' else '') for media in info.get('media', []): media_url = media.get('url') if not media_url or re.match(r'https?://', media_url): @@ -205,7 +210,7 @@ class RTSIE(SRGSSRIE): format_id += '-%dk' % rate formats.append({ 'format_id': format_id, - 'url': 'http://download-video.rts.ch/' + media_url, + 'url': urljoin(download_base, media_url), 'tbr': rate or extract_bitrate(media_url), }) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index f63a1359a..ac018e740 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -4,16 +4,32 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, + float_or_none, + int_or_none, parse_iso8601, qualities, + try_get, ) class SRGSSRIE(InfoExtractor): - _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?Psrf|rts|rsi|rtr|swi):(?:[^:]+:)?(?Pvideo|audio):(?P[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'''(?x) + (?: + https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn| + srgssr + ): + (?P + srf|rts|rsi|rtr|swi + ):(?:[^:]+:)? + (?P + video|audio + ): + (?P + [0-9a-f\-]{36}|\d+ + ) + ''' _GEO_BYPASS = False _GEO_COUNTRIES = ['CH'] @@ -25,25 +41,39 @@ class SRGSSRIE(InfoExtractor): 'LEGAL': 'The video cannot be transmitted for legal reasons.', 'STARTDATE': 'This video is not yet available. Please try again later.', } + _DEFAULT_LANGUAGE_CODES = { + 'srf': 'de', + 'rts': 'fr', + 'rsi': 'it', + 'rtr': 'rm', + 'swi': 'en', + } def _get_tokenized_src(self, url, video_id, format_id): - sp = compat_urllib_parse_urlparse(url).path.split('/') token = self._download_json( - 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), + 'http://tp.srgssr.ch/akahd/token?acl=*', video_id, 'Downloading %s token' % format_id, fatal=False) or {} - auth_params = token.get('token', {}).get('authparams') + auth_params = try_get(token, lambda x: x['token']['authparams']) if auth_params: - url += '?' + auth_params + url += ('?' if '?' not in url else '&') + auth_params return url - def get_media_data(self, bu, media_type, media_id): - media_data = self._download_json( - 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), - media_id)[media_type.capitalize()] - - if media_data.get('block') and media_data['block'] in self._ERRORS: - message = self._ERRORS[media_data['block']] - if media_data['block'] == 'GEOBLOCK': + def _get_media_data(self, bu, media_type, media_id): + query = {'onlyChapters': True} if media_type == 'video' else {} + full_media_data = self._download_json( + 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' + % (bu, media_type, media_id), + media_id, query=query)['chapterList'] + try: + media_data = next( + x for x in full_media_data if x.get('id') == media_id) + except StopIteration: + raise ExtractorError('No media information found') + + block_reason = media_data.get('blockReason') + if block_reason and block_reason in self._ERRORS: + message = self._ERRORS[block_reason] + if block_reason == 'GEOBLOCK': self.raise_geo_restricted( msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( @@ -53,53 +83,75 @@ class SRGSSRIE(InfoExtractor): def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + media_data = self._get_media_data(bu, media_type, media_id) + title = media_data['title'] - media_data = self.get_media_data(bu, media_type, media_id) - - metadata = media_data['AssetMetadatas']['AssetMetadata'][0] - title = metadata['title'] - description = metadata.get('description') - created_date = media_data.get('createdDate') or metadata.get('createdDate') - timestamp = parse_iso8601(created_date) - - thumbnails = [{ - 'id': image.get('id'), - 'url': image['url'], - } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])] - - preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) formats = [] - for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []): - protocol = source.get('@protocol') - for asset in source['url']: - asset_url = asset['text'] - quality = asset['@quality'] - format_id = '%s-%s' % (protocol, quality) - if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): - asset_url = self._get_tokenized_src(asset_url, media_id, format_id) - if protocol.startswith('HTTP-HDS'): - formats.extend(self._extract_f4m_formats( - asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0', - media_id, f4m_id=format_id, fatal=False)) - elif protocol.startswith('HTTP-HLS'): - formats.extend(self._extract_m3u8_formats( - asset_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - else: - formats.append({ - 'format_id': format_id, - 'url': asset_url, - 'preference': preference(quality), - 'ext': 'flv' if protocol == 'RTMP' else None, - }) + q = qualities(['SD', 'HD']) + for source in (media_data.get('resourceList') or []): + format_url = source.get('url') + if not format_url: + continue + protocol = source.get('protocol') + quality = source.get('quality') + format_id = [] + for e in (protocol, source.get('encoding'), quality): + if e: + format_id.append(e) + format_id = '-'.join(format_id) + + if protocol in ('HDS', 'HLS'): + if source.get('tokenType') == 'AKAMAI': + format_url = self._get_tokenized_src( + format_url, media_id, format_id) + formats.extend(self._extract_akamai_formats( + format_url, media_id)) + elif protocol == 'HLS': + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif protocol in ('HTTP', 'HTTPS'): + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'quality': q(quality), + }) + + # This is needed because for audio medias the podcast url is usually + # always included, even if is only an audio segment and not the + # whole episode. + if int_or_none(media_data.get('position')) == 0: + for p in ('S', 'H'): + podcast_url = media_data.get('podcast%sdUrl' % p) + if not podcast_url: + continue + quality = p + 'D' + formats.append({ + 'format_id': 'PODCAST-' + quality, + 'url': podcast_url, + 'quality': q(quality), + }) self._sort_formats(formats) + subtitles = {} + if media_type == 'video': + for sub in (media_data.get('subtitleList') or []): + sub_url = sub.get('url') + if not sub_url: + continue + lang = sub.get('locale') or self._DEFAULT_LANGUAGE_CODES[bu] + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + return { 'id': media_id, 'title': title, - 'description': description, - 'timestamp': timestamp, - 'thumbnails': thumbnails, + 'description': media_data.get('description'), + 'timestamp': parse_iso8601(media_data.get('date')), + 'thumbnail': media_data.get('imageUrl'), + 'duration': float_or_none(media_data.get('duration'), 1000), + 'subtitles': subtitles, 'formats': formats, } @@ -119,26 +171,17 @@ class SRGSSRPlayIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', + 'md5': '6db2226ba97f62ad42ce09783680046c', 'info_dict': { 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'ext': 'mp4', 'upload_date': '20130701', 'title': 'Snowden beantragt Asyl in Russland', - 'timestamp': 1372713995, - } - }, { - # No Speichern (Save) button - 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', - 'md5': '0a274ce38fda48c53c01890651985bc6', - 'info_dict': { - 'id': '677f5829-e473-4823-ac83-a1087fe97faa', - 'ext': 'flv', - 'upload_date': '20130710', - 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', - 'description': 'md5:88604432b60d5a38787f152dec89cd56', - 'timestamp': 1373493600, + 'timestamp': 1372708215, + 'duration': 113.827, + 'thumbnail': r're:^https?://.*1383719781\.png$', }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', 'info_dict': { @@ -146,7 +189,8 @@ class SRGSSRPlayIE(InfoExtractor): 'ext': 'mp3', 'upload_date': '20151013', 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', - 'timestamp': 1444750398, + 'timestamp': 1444709160, + 'duration': 336.816, }, 'params': { # rtmp download @@ -159,19 +203,32 @@ class SRGSSRPlayIE(InfoExtractor): 'id': '6348260', 'display_id': '6348260', 'ext': 'mp4', - 'duration': 1796, + 'duration': 1796.76, 'title': 'Le 19h30', - 'description': '', - 'uploader': '19h30', 'upload_date': '20141201', 'timestamp': 1417458600, 'thumbnail': r're:^https?://.*\.image', - 'view_count': int, }, 'params': { # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', + 'info_dict': { + 'id': '42960270', + 'ext': 'mp4', + 'title': 'Why people were against tax reforms', + 'description': 'md5:7ac442c558e9630e947427469c4b824d', + 'duration': 94.0, + 'upload_date': '20170215', + 'timestamp': 1487173560, + 'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964', + 'subtitles': 'count:9', + }, + 'params': { + 'skip_download': True, + } }, { 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', 'only_matching': True, @@ -181,6 +238,10 @@ class SRGSSRPlayIE(InfoExtractor): }, { 'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260', 'only_matching': True, + }, { + # audio segment, has podcastSdUrl of the full episode + 'url': 'https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb', + 'only_matching': True, }] def _real_extract(self, url): @@ -188,5 +249,4 @@ class SRGSSRPlayIE(InfoExtractor): bu = mobj.group('bu') media_type = mobj.group('type') or mobj.group('type_2') media_id = mobj.group('id') - # other info can be extracted from url + '&layout=json' return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') From 678d46f6bbcc8426723d48c49eb25cf202753245 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 28 Feb 2021 10:42:41 +0100 Subject: [PATCH 066/242] [bandaichannel] Add new extractor(closes #21404) --- youtube_dl/extractor/bandaichannel.py | 37 +++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/bandaichannel.py diff --git a/youtube_dl/extractor/bandaichannel.py b/youtube_dl/extractor/bandaichannel.py new file mode 100644 index 000000000..d67285913 --- /dev/null +++ b/youtube_dl/extractor/bandaichannel.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from ..utils import extract_attributes + + +class BandaiChannelIE(BrightcoveNewIE): + IE_NAME = 'bandaichannel' + _VALID_URL = r'https?://(?:www\.)?b-ch\.com/titles/(?P\d+/\d+)' + _TESTS = [{ + 'url': 'https://www.b-ch.com/titles/514/001', + 'md5': 'a0f2d787baa5729bed71108257f613a4', + 'info_dict': { + 'id': '6128044564001', + 'ext': 'mp4', + 'title': 'メタルファイターMIKU 第1話', + 'timestamp': 1580354056, + 'uploader_id': '5797077852001', + 'upload_date': '20200130', + 'duration': 1387.733, + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + attrs = extract_attributes(self._search_regex( + r'(]+\bid="bcplayer"[^>]*>)', webpage, 'player')) + bc = self._download_json( + 'https://pbifcd.b-ch.com/v1/playbackinfo/ST/70/' + attrs['data-info'], + video_id, headers={'X-API-KEY': attrs['data-auth'].strip()})['bc'] + return self._parse_brightcove_metadata(bc, bc['id']) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dc6a06771..07a8af055 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -90,6 +90,7 @@ from .awaan import ( ) from .azmedien import AZMedienIE from .baidu import BaiduVideoIE +from .bandaichannel import BandaiChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bbc import ( BBCCoUkIE, From 38fe5e239ad602b32c111f40ad7c51b3e029be3c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 28 Feb 2021 12:31:18 +0100 Subject: [PATCH 067/242] [urplay] fix episode data extraction(closes #28292) --- youtube_dl/extractor/urplay.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py index 5452c7ca1..d6c79147e 100644 --- a/youtube_dl/extractor/urplay.py +++ b/youtube_dl/extractor/urplay.py @@ -21,6 +21,11 @@ class URPlayIE(InfoExtractor): 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', 'timestamp': 1513292400, 'upload_date': '20171214', + 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', + 'duration': 2269, + 'categories': ['Kultur & historia'], + 'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'], + 'episode': 'Om vetenskap, kritiskt tänkande och motstånd', }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -31,6 +36,10 @@ class URPlayIE(InfoExtractor): 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', 'timestamp': 1440086400, 'upload_date': '20150820', + 'series': 'Tripp, Trapp, Träd', + 'duration': 865, + 'tags': ['Sova'], + 'episode': 'Sovkudde', }, }, { 'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden', @@ -41,9 +50,11 @@ class URPlayIE(InfoExtractor): video_id = self._match_id(url) url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - urplayer_data = self._parse_json(self._html_search_regex( + vid = int(video_id) + accessible_episodes = self._parse_json(self._html_search_regex( r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['accessibleEpisodes'][0] + webpage, 'urplayer data'), video_id)['accessibleEpisodes'] + urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid) episode = urplayer_data['title'] raw_streaming_info = urplayer_data['streamingInfo']['raw'] host = self._download_json( From bee618268014480bb3dd7887986b456c8e9c0236 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 1 Mar 2021 14:00:03 +0100 Subject: [PATCH 068/242] [stretchinternet] Fix extraction(closes #28297) --- youtube_dl/extractor/stretchinternet.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py index 4dbead2ba..ec08eae55 100644 --- a/youtube_dl/extractor/stretchinternet.py +++ b/youtube_dl/extractor/stretchinternet.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none class StretchInternetIE(InfoExtractor): @@ -11,22 +10,28 @@ class StretchInternetIE(InfoExtractor): 'info_dict': { 'id': '573272', 'ext': 'mp4', - 'title': 'University of Mary Wrestling vs. Upper Iowa', - 'timestamp': 1575668361, - 'upload_date': '20191206', + 'title': 'UNIVERSITY OF MARY WRESTLING VS UPPER IOWA', + # 'timestamp': 1575668361, + # 'upload_date': '20191206', + 'uploader_id': '99997', } } def _real_extract(self, url): video_id = self._match_id(url) + media_url = self._download_json( + 'https://core.stretchlive.com/trinity/event/tcg/' + video_id, + video_id)[0]['media'][0]['url'] event = self._download_json( - 'https://api.stretchinternet.com/trinity/event/tcg/' + video_id, - video_id)[0] + 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json', + video_id, query={'eventID': video_id, 'token': 'asdf'})['event'] return { 'id': video_id, 'title': event['title'], - 'timestamp': int_or_none(event.get('dateCreated'), 1000), - 'url': 'https://' + event['media'][0]['url'], + # TODO: parse US timezone abbreviations + # 'timestamp': event.get('dateTimeString'), + 'url': 'https://' + media_url, + 'uploader_id': event.get('ownerID'), } From 3fb14cd214fdadfae195745b26498e012f78be8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 2 Mar 2021 06:03:17 +0700 Subject: [PATCH 069/242] [zdf] Rework extractors (closes #11606, closes #13473, closes #17354, closes #21185, closes #26711, closes #27068, closes #27930, closes #28198, closes #28199, closes #28274) * Generalize unique video ids for zdf based extractors * Improve extraction * Fix 3sat and phoenix --- youtube_dl/extractor/dreisat.py | 220 +++++--------------------------- youtube_dl/extractor/phoenix.py | 149 ++++++++++++++++----- youtube_dl/extractor/zdf.py | 192 ++++++++++++++++++---------- 3 files changed, 276 insertions(+), 285 deletions(-) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 848d387d1..5a07c18f4 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -1,193 +1,43 @@ from __future__ import unicode_literals -import re +from .zdf import ZDFIE -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - xpath_text, - determine_ext, - float_or_none, - ExtractorError, -) - -class DreiSatIE(InfoExtractor): +class DreiSatIE(ZDFIE): IE_NAME = '3sat' - _GEO_COUNTRIES = ['DE'] - _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', - 'md5': 'be37228896d30a88f315b638900a026e', - 'info_dict': { - 'id': '45918', - 'ext': 'mp4', - 'title': 'Waidmannsheil', - 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'uploader': 'SCHWEIZWEIT', - 'uploader_id': '100000210', - 'upload_date': '20140913' - }, - 'params': { - 'skip_download': True, # m3u8 downloads - } + _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P[^/?#&]+)\.html' + _TESTS = [{ + # Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html + 'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html', + 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + 'info_dict': { + 'id': '141007_ab18_10wochensommer_film', + 'ext': 'mp4', + 'title': 'Ab 18! - 10 Wochen Sommer', + 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', + 'duration': 2660, + 'timestamp': 1608604200, + 'upload_date': '20201222', }, - { - 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', - 'only_matching': True, + }, { + 'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html', + 'info_dict': { + 'id': '140913_sendung_schweizweit', + 'ext': 'mp4', + 'title': 'Waidmannsheil', + 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', + 'timestamp': 1410623100, + 'upload_date': '20140913' }, - ] - - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - param_groups = {} - for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.get(self._xpath_ns( - 'id', 'http://www.w3.org/XML/1998/namespace')) - params = {} - for param in param_group: - params[param.get('name')] = param.get('value') - param_groups[group_id] = params - - formats = [] - for video in smil.findall(self._xpath_ns('.//video', namespace)): - src = video.get('src') - if not src: - continue - bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - group_id = video.get('paramGroup') - param_group = param_groups[group_id] - for proto in param_group['protocols'].split(','): - formats.append({ - 'url': '%s://%s' % (proto, param_group['host']), - 'app': param_group['app'], - 'play_path': src, - 'ext': 'flv', - 'format_id': '%s-%d' % (proto, bitrate), - 'tbr': bitrate, - }) - self._sort_formats(formats) - return formats - - def extract_from_xml_url(self, video_id, xml_url): - doc = self._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - status_code = xpath_text(doc, './status/statuscode') - if status_code and status_code != 'ok': - if status_code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, status_code) - raise ExtractorError(message, expected=True) - - title = xpath_text(doc, './/information/title', 'title', True) - - urls = [] - formats = [] - for fnode in doc.findall('.//formitaeten/formitaet'): - video_url = xpath_text(fnode, 'url') - if not video_url or video_url in urls: - continue - urls.append(video_url) - - is_available = 'http://www.metafilegenerator' not in video_url - geoloced = 'static_geoloced_online' in video_url - if not is_available or geoloced: - continue - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ - (?P[^_]+)_(?P[^_]+)_(?P[^_]+) - ''', format_id) - - ext = determine_ext(video_url, None) or format_m.group('container') - - if ext == 'meta': - continue - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif ext == 'm3u8': - # the certificates are misconfigured (see - # https://github.com/ytdl-org/youtube-dl/issues/8665) - if video_url.startswith('https://'): - continue - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - else: - quality = xpath_text(fnode, './quality') - if quality: - format_id += '-' + quality - - abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - - tbr = int_or_none(self._search_regex( - r'_(\d+)k', video_url, 'bitrate', None)) - if tbr and vbr and not abr: - abr = tbr - vbr - - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(xpath_text(fnode, './width')), - 'height': int_or_none(xpath_text(fnode, './height')), - 'filesize': int_or_none(xpath_text(fnode, './filesize')), - 'protocol': format_m.group('proto').lower(), - }) - - geolocation = xpath_text(doc, './/details/geolocation') - if not formats and geolocation and geolocation != 'none': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - thumbnails = [] - for node in doc.findall('.//teaserimages/teaserimage'): - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - thumbnail_key = node.get('key') - if thumbnail_key: - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - - upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) - - return { - 'id': video_id, - 'title': title, - 'description': xpath_text(doc, './/information/detail'), - 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), - 'thumbnails': thumbnails, - 'uploader': xpath_text(doc, './/details/originChannelTitle'), - 'uploader_id': xpath_text(doc, './/details/originChannelId'), - 'upload_date': upload_date, - 'formats': formats, + 'params': { + 'skip_download': True, } - - def _real_extract(self, url): - video_id = self._match_id(url) - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id - return self.extract_from_xml_url(video_id, details_url) + }, { + # Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html + 'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html', + 'only_matching': True, + }, { + # Same as https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids + 'url': 'https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index e435c28e1..dbbfce983 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -1,45 +1,128 @@ +# coding: utf-8 from __future__ import unicode_literals -from .dreisat import DreiSatIE +import re +from .youtube import YoutubeIE +from .zdf import ZDFBaseIE +from ..compat import compat_str +from ..utils import ( + int_or_none, + merge_dicts, + unified_timestamp, + xpath_text, +) -class PhoenixIE(DreiSatIE): + +class PhoenixIE(ZDFBaseIE): IE_NAME = 'phoenix.de' - _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ - (?: - phoenix/die_sendungen/(?:[^/]+/)? - )? - (?P[0-9]+)''' - _TESTS = [ - { - 'url': 'http://www.phoenix.de/content/884301', - 'md5': 'ed249f045256150c92e72dbb70eadec6', - 'info_dict': { - 'id': '884301', - 'ext': 'mp4', - 'title': 'Michael Krons mit Hans-Werner Sinn', - 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', - 'upload_date': '20141025', - 'uploader': 'Im Dialog', - } + _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P\d+)\.html' + _TESTS = [{ + # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html + 'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext': 'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613906100, + 'upload_date': '20210221', + 'uploader': 'Phoenix', + 'channel': 'corona nachgehakt', }, - { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', - 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html', + 'info_dict': { + 'id': 'hMQtqFYjomk', + 'ext': 'mp4', + 'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?', + 'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd', + 'duration': 3509, + 'upload_date': '20201219', + 'uploader': 'phoenix', + 'uploader_id': 'phoenix', }, - { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', - 'only_matching': True, + 'params': { + 'skip_download': True, }, - ] + }, { + 'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html', + 'only_matching': True, + }, { + # no media + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/mit-dem-jumbo-durch-die-nacht-a-89625.html', + 'only_matching': True, + }, { + # Same as https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + article_id = self._match_id(url) + + article = self._download_json( + 'https://www.phoenix.de/response/id/%s' % article_id, article_id, + 'Downloading article JSON') + + video = article['absaetze'][0] + title = video.get('titel') or article.get('subtitel') + + if video.get('typ') == 'video-youtube': + video_id = video['id'] + return self.url_result( + video_id, ie=YoutubeIE.ie_key(), video_id=video_id, + video_title=title) + + video_id = compat_str(video.get('basename') or video.get('content')) - internal_id = self._search_regex( - r'
{.+?})\1', webpage, - 'player JSON', default='{}' if not fatal else NO_DEFAULT, - group='json'), - video_id) - - -class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') _GEO_COUNTRIES = ['DE'] + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') - _TESTS = [{ - 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', - 'info_dict': { - 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', - 'ext': 'mp4', - 'title': 'Die Magie der Farben (2/2)', - 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', - 'duration': 2615, - 'timestamp': 1465021200, - 'upload_date': '20160604', - }, - }, { - 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', - 'only_matching': True, - }] + def _call_api(self, url, video_id, item, api_token=None, referrer=None): + headers = {} + if api_token: + headers['Api-Auth'] = 'Bearer %s' % api_token + if referrer: + headers['Referer'] = referrer + return self._download_json( + url, video_id, 'Downloading JSON %s' % item, headers=headers) @staticmethod def _extract_subtitles(src): @@ -109,20 +79,11 @@ class ZDFIE(ZDFBaseIE): }) formats.append(f) - def _extract_entry(self, url, player, content, video_id): - title = content.get('title') or content['teaserHeadline'] - - t = content['mainVideoContent']['http://zdf.de/rels/target'] - - ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') - - if not ptmd_path: - ptmd_path = t[ - 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'ngplayer_2_4') - + def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): ptmd = self._call_api( - urljoin(url, ptmd_path), player, url, video_id, 'metadata') + ptmd_url, video_id, 'metadata', api_token, referrer) + + content_id = ptmd.get('basename') or ptmd_url.split('/')[-1] formats = [] track_uris = set() @@ -140,7 +101,7 @@ class ZDFIE(ZDFBaseIE): continue for track in tracks: self._extract_format( - video_id, formats, track_uris, { + content_id, formats, track_uris, { 'url': track.get('uri'), 'type': f.get('type'), 'mimeType': f.get('mimeType'), @@ -149,6 +110,103 @@ class ZDFIE(ZDFBaseIE): }) self._sort_formats(formats) + duration = float_or_none(try_get( + ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) + + return { + 'extractor_key': ZDFIE.ie_key(), + 'id': content_id, + 'duration': duration, + 'formats': formats, + 'subtitles': self._extract_subtitles(ptmd), + } + + def _extract_player(self, webpage, video_id, fatal=True): + return self._parse_json( + self._search_regex( + r'(?s)data-zdfplayer-jsb=(["\'])(?P{.+?})\1', webpage, + 'player JSON', default='{}' if not fatal else NO_DEFAULT, + group='json'), + video_id) + + +class ZDFIE(ZDFBaseIE): + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?#&]+)\.html' + _TESTS = [{ + # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', + 'md5': '34ec321e7eb34231fd88616c65c92db0', + 'info_dict': { + 'id': '210222_phx_nachgehakt_corona_protest', + 'ext': 'mp4', + 'title': 'Wohin führt der Protest in der Pandemie?', + 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', + 'duration': 1691, + 'timestamp': 1613948400, + 'upload_date': '20210221', + }, + }, { + # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html + 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', + 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', + 'info_dict': { + 'id': '141007_ab18_10wochensommer_film', + 'ext': 'mp4', + 'title': 'Ab 18! - 10 Wochen Sommer', + 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26', + 'duration': 2660, + 'timestamp': 1608604200, + 'upload_date': '20201222', + }, + }, { + 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', + 'info_dict': { + 'id': '151025_magie_farben2_tex', + 'ext': 'mp4', + 'title': 'Die Magie der Farben (2/2)', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615, + 'timestamp': 1465021200, + 'upload_date': '20160604', + }, + }, { + # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/film/spielfilm/der-hauptmann-100.html + 'url': 'https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html', + 'only_matching': True, + }, { + # Same as https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids + 'url': 'https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', + 'only_matching': True, + }] + + def _extract_entry(self, url, player, content, video_id): + title = content.get('title') or content['teaserHeadline'] + + t = content['mainVideoContent']['http://zdf.de/rels/target'] + + ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') + + if not ptmd_path: + ptmd_path = t[ + 'http://zdf.de/rels/streams/ptmd-template'].replace( + '{playerId}', 'ngplayer_2_4') + + info = self._extract_ptmd( + urljoin(url, ptmd_path), video_id, player['apiToken'], url) + thumbnails = [] layouts = try_get( content, lambda x: x['teaserImageRef']['layouts'], dict) @@ -169,33 +227,33 @@ class ZDFIE(ZDFBaseIE): }) thumbnails.append(thumbnail) - return { - 'id': video_id, + return merge_dicts(info, { 'title': title, 'description': content.get('leadParagraph') or content.get('teasertext'), 'duration': int_or_none(t.get('duration')), 'timestamp': unified_timestamp(content.get('editorialDate')), 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(ptmd), - 'formats': formats, - } + }) def _extract_regular(self, url, player, video_id): content = self._call_api( - player['content'], player, url, video_id, 'content') + player['content'], video_id, 'content', player['apiToken'], url) return self._extract_entry(player['content'], player, content, video_id) def _extract_mobile(self, video_id): - document = self._download_json( + video = self._download_json( 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, - video_id)['document'] + video_id) + + document = video['document'] title = document['titel'] + content_id = document['basename'] formats = [] format_urls = set() for f in document['formitaeten']: - self._extract_format(video_id, formats, format_urls, f) + self._extract_format(content_id, formats, format_urls, f) self._sort_formats(formats) thumbnails = [] @@ -213,12 +271,12 @@ class ZDFIE(ZDFBaseIE): }) return { - 'id': video_id, + 'id': content_id, 'title': title, 'description': document.get('beschreibung'), 'duration': int_or_none(document.get('length')), - 'timestamp': unified_timestamp(try_get( - document, lambda x: x['meta']['editorialDate'], compat_str)), + 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp( + try_get(video, lambda x: x['meta']['editorialDate'], compat_str)), 'thumbnails': thumbnails, 'subtitles': self._extract_subtitles(document), 'formats': formats, From 0002888627b40264994e8a37fc3a17cbd3551af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 2 Mar 2021 06:16:41 +0700 Subject: [PATCH 070/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 2912d776c..07f26f2cf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Extractors +* [zdf] Rework extractors (#11606, #13473, #17354, #21185, #26711, #27068, + #27930, #28198, #28199, #28274) + * Generalize cross-extractor video ids for zdf based extractors + * Improve extraction + * Fix 3sat and phoenix +* [stretchinternet] Fix extraction (#28297) +* [urplay] Fix episode data extraction (#28292) ++ [bandaichannel] Add support for b-ch.com (#21404) +* [srgssr] Improve extraction (#14717, #14725, #27231, #28238) + + Extract subtitle + * Fix extraction for new videos + * Update srf download domains +* [vvvvid] Reduce season request payload size ++ [vvvvid] Extract series sublists playlist title (#27601, #27618) ++ [dplay] Extract Ad-Free uplynk URLs (#28160) ++ [wat] Detect DRM protected videos (#27958) +* [tf1] Improve extraction (#27980, #28040) +* [tmz] Fix and improve extraction (#24603, #24687, 28211) ++ [gedidigital] Add support for Gedi group sites (#7347, #26946) +* [youtube] Fix get_video_info request + + version 2021.02.22 Core From 7c06216abff092d43e47b584699d435c40a8115e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 2 Mar 2021 06:19:42 +0700 Subject: [PATCH 071/242] release 2021.03.02 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 8 files changed, 16 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 60879f0ac..9544eaa6c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.02.22** +- [ ] I've verified that I'm running youtube-dl version **2021.03.02** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.22 + [debug] youtube-dl version 2021.03.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index b38d39ab4..c32ebdf56 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.02.22** +- [ ] I've verified that I'm running youtube-dl version **2021.03.02** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 3235de44b..2b5e0f08f 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.22** +- [ ] I've verified that I'm running youtube-dl version **2021.03.02** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index a3255623a..13a54982e 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.02.22** +- [ ] I've verified that I'm running youtube-dl version **2021.03.02** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.02.22 + [debug] youtube-dl version 2021.03.02 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 124b020c3..dbca582ee 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.02.22** +- [ ] I've verified that I'm running youtube-dl version **2021.03.02** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 07f26f2cf..fbf97a582 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.02 Extractors * [zdf] Rework extractors (#11606, #13473, #17354, #21185, #26711, #27068, diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2452c1f7f..2c00ec406 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -82,6 +82,7 @@ - **awaan:video** - **AZMedien**: AZ Medien videos - **BaiduVideo**: 百度视频 + - **bandaichannel** - **Bandcamp** - **Bandcamp:album** - **Bandcamp:weekly** @@ -330,6 +331,7 @@ - **Gaskrank** - **Gazeta** - **GDCVault** + - **GediDigital** - **generic**: Generic downloader that works on some sites - **Gfycat** - **GiantBomb** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f89530293..bfe98aa9f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.02.22' +__version__ = '2021.03.02' From e465b25c1fb0e72b97a032220399d4a959662095 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 24 Feb 2021 11:52:30 +0000 Subject: [PATCH 072/242] [bbc] add support for BBC Reel videos(closes #21870, closes #23660, closes #28268) --- youtube_dl/extractor/bbc.py | 59 ++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index b4daee54e..a0c557929 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -5,10 +5,15 @@ import itertools import re from .common import InfoExtractor +from ..compat import ( + compat_etree_Element, + compat_HTTPError, + compat_urlparse, +) from ..utils import ( + ExtractorError, clean_html, dict_get, - ExtractorError, float_or_none, get_element_by_class, int_or_none, @@ -21,11 +26,6 @@ from ..utils import ( urlencode_postdata, urljoin, ) -from ..compat import ( - compat_etree_Element, - compat_HTTPError, - compat_urlparse, -) class BBCCoUkIE(InfoExtractor): @@ -793,6 +793,20 @@ class BBCIE(BBCCoUkIE): 'description': 'Learn English words and phrases from this story', }, 'add_ie': [BBCCoUkIE.ie_key()], + }, { + # BBC Reel + 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness', + 'info_dict': { + 'id': 'p07c6sb9', + 'ext': 'mp4', + 'title': 'How positive thinking is harming your happiness', + 'alt_title': 'The downsides of positive thinking', + 'description': 'md5:fad74b31da60d83b8265954ee42d85b4', + 'duration': 235, + 'thumbnail': r're:https?://.+/p07c9dsr.jpg', + 'upload_date': '20190604', + 'categories': ['Psychology'], + }, }] @classmethod @@ -980,6 +994,37 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness) + initial_data = self._parse_json(self._html_search_regex( + r']+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P(?:(?!\2).)+)', + webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False) + if initial_data: + init_data = try_get( + initial_data, lambda x: x['initData']['items'][0], dict) or {} + smp_data = init_data.get('smpData') or {} + clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {} + version_id = clip_data.get('versionID') + if version_id: + title = smp_data['title'] + formats, subtitles = self._download_media_selector(version_id) + self._sort_formats(formats) + image_url = smp_data.get('holdingImageURL') + display_date = init_data.get('displayDate') + topic_title = init_data.get('topicTitle') + + return { + 'id': version_id, + 'title': title, + 'formats': formats, + 'alt_title': init_data.get('shortTitle'), + 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None, + 'description': smp_data.get('summary') or init_data.get('shortSummary'), + 'upload_date': display_date.replace('-', '') if display_date else None, + 'subtitles': subtitles, + 'duration': int_or_none(clip_data.get('duration')), + 'categories': [topic_title] if topic_title else None, + } + # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) # There are several setPayload calls may be present but the video # seems to be always related to the first one @@ -1041,7 +1086,7 @@ class BBCIE(BBCCoUkIE): thumbnail = None image_url = current_programme.get('image_url') if image_url: - thumbnail = image_url.replace('{recipe}', '1920x1920') + thumbnail = image_url.replace('{recipe}', 'raw') return { 'id': programme_id, 'title': title, From e1adb3ed4fc911a8177280fe87109e7b54a52fa2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 2 Mar 2021 11:21:49 +0100 Subject: [PATCH 073/242] [bbc] correct catched exception type --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index a0c557929..92e6f1bea 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -943,7 +943,7 @@ class BBCIE(BBCCoUkIE): else: entry['title'] = info['title'] entry['formats'].extend(info['formats']) - except Exception as e: + except ExtractorError as e: # Some playlist URL may fail with 500, at the same time # the other one may work fine (e.g. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) From 8f56907afa693290a6b2e05fb7ffc2f15dca33e2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 2 Mar 2021 12:04:31 +0100 Subject: [PATCH 074/242] [9c9media] fix extraction for videos with multiple ContentPackages(closes #28309) --- youtube_dl/extractor/ninecninemedia.py | 4 +--- youtube_dl/extractor/rds.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index a569c889e..cfc220314 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -23,11 +23,9 @@ class NineCNineMediaIE(InfoExtractor): destination_code, content_id = re.match(self._VALID_URL, url).groups() api_base_url = self._API_BASE_TEMPLATE % (destination_code, content_id) content = self._download_json(api_base_url, content_id, query={ - '$include': '[Media,Season,ContentPackages]', + '$include': '[Media.Name,Season,ContentPackages.Duration,ContentPackages.Id]', }) title = content['Name'] - if len(content['ContentPackages']) > 1: - raise ExtractorError('multiple content packages') content_package = content['ContentPackages'][0] package_id = content_package['Id'] content_package_url = api_base_url + 'contentpackages/%s/' % package_id diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index 8c016a77d..0c497856e 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -15,17 +15,17 @@ class RDSIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P[^/]+)-\d+\.\d+' _TESTS = [{ - 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', + # has two 9c9media ContentPackages, the web player selects the first ContentPackage + 'url': 'https://www.rds.ca/videos/Hockey/NationalHockeyLeague/teams/9/forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande-3.1377606', 'info_dict': { - 'id': '604333', - 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', + 'id': '2083309', + 'display_id': 'forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande', 'ext': 'flv', - 'title': 'Fowler Jr. prend la direction de Jacksonville', - 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ', - 'timestamp': 1430397346, - 'upload_date': '20150430', - 'duration': 154.354, - 'age_limit': 0, + 'title': 'Forum du 5 à 7 : Kotkaniemi de retour de Finlande', + 'description': 'md5:83fa38ecc4a79b19e433433254077f25', + 'timestamp': 1606129030, + 'upload_date': '20201123', + 'duration': 773.039, } }, { 'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934', From 061c03013311eff75ac381cb4060204ce91b2510 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 3 Mar 2021 11:42:59 +0700 Subject: [PATCH 075/242] [youtube:tab] Switch continuation to browse API (closes #28289, closes #28327) Until further investigation. --- youtube_dl/extractor/youtube.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2496d27f1..eb5a58807 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2478,24 +2478,37 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): headers = { 'x-youtube-client-name': '1', 'x-youtube-client-version': '2.20201112.04.01', + 'content-type': 'application/json', } if identity_token: headers['x-youtube-identity-token'] = identity_token + data = { + 'context': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20201021.03.00', + } + }, + } + for page_num in itertools.count(1): if not continuation: break + data['continuation'] = continuation['continuation'] + data['clickTracking'] = { + 'clickTrackingParams': continuation['itct'] + } count = 0 retries = 3 while count <= retries: try: # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry - browse = self._download_json( - 'https://www.youtube.com/browse_ajax', None, - 'Downloading page %d%s' - % (page_num, ' (retry #%d)' % count if count else ''), - headers=headers, query=continuation) + response = self._download_json( + 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''), + headers=headers, data=json.dumps(data).encode('utf8')) break except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): @@ -2503,9 +2516,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if count <= retries: continue raise - if not browse: - break - response = try_get(browse, lambda x: x[1]['response'], dict) if not response: break From 8c9766f4bf78ca777e8de9d4809584d8e88098ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 3 Mar 2021 11:44:49 +0700 Subject: [PATCH 076/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ChangeLog b/ChangeLog index fbf97a582..366d322f5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +version + +Extractors +* [youtube:tab] Switch continuation to browse API (#28289, #28327) +* [9c9media] Fix extraction for videos with multiple ContentPackages (#28309) ++ [bbc] Add support for BBC Reel videos (#21870, #23660, #28268) + + version 2021.03.02 Extractors From f68692b004f1c65f08a9a7d9c2ee4ab2ec255ea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 3 Mar 2021 11:47:34 +0700 Subject: [PATCH 077/242] release 2021.03.03 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 9544eaa6c..a8eba3214 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.02** +- [ ] I've verified that I'm running youtube-dl version **2021.03.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.02 + [debug] youtube-dl version 2021.03.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index c32ebdf56..7d59a9f2d 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.02** +- [ ] I've verified that I'm running youtube-dl version **2021.03.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 2b5e0f08f..523408f03 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.02** +- [ ] I've verified that I'm running youtube-dl version **2021.03.03** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 13a54982e..6e9e094e4 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.02** +- [ ] I've verified that I'm running youtube-dl version **2021.03.03** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.02 + [debug] youtube-dl version 2021.03.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index dbca582ee..46af4e420 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.02** +- [ ] I've verified that I'm running youtube-dl version **2021.03.03** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 366d322f5..238ca3965 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.03 Extractors * [youtube:tab] Switch continuation to browse API (#28289, #28327) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index bfe98aa9f..a1c68e384 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.02' +__version__ = '2021.03.03' From ec64ec9651848e9173ec033a9a27809e4b5063bc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Mar 2021 12:41:49 +0100 Subject: [PATCH 078/242] [voxmedia] fix volume embed extraction(closes #28338) --- youtube_dl/extractor/voxmedia.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py index b318e15d4..661208125 100644 --- a/youtube_dl/extractor/voxmedia.py +++ b/youtube_dl/extractor/voxmedia.py @@ -7,6 +7,8 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, int_or_none, + try_get, + unified_timestamp, ) @@ -19,14 +21,17 @@ class VoxMediaVolumeIE(OnceIE): setup = self._parse_json(self._search_regex( r'setup\s*=\s*({.+});', webpage, 'setup'), video_id) - video_data = setup.get('video') or {} + player_setup = setup.get('player_setup') or setup + video_data = player_setup.get('video') or {} + formatted_metadata = video_data.get('formatted_metadata') or {} info = { 'id': video_id, - 'title': video_data.get('title_short'), + 'title': player_setup.get('title') or video_data.get('title_short'), 'description': video_data.get('description_long') or video_data.get('description_short'), - 'thumbnail': video_data.get('brightcove_thumbnail') + 'thumbnail': formatted_metadata.get('thumbnail') or video_data.get('brightcove_thumbnail'), + 'timestamp': unified_timestamp(formatted_metadata.get('video_publish_date')), } - asset = setup.get('asset') or setup.get('params') or {} + asset = try_get(setup, lambda x: x['embed_assets']['chorus'], dict) or {} formats = [] hls_url = asset.get('hls_url') @@ -47,6 +52,7 @@ class VoxMediaVolumeIE(OnceIE): if formats: self._sort_formats(formats) info['formats'] = formats + info['duration'] = int_or_none(asset.get('duration')) return info for provider_video_type in ('ooyala', 'youtube', 'brightcove'): @@ -84,7 +90,7 @@ class VoxMediaIE(InfoExtractor): }, { # Volume embed, Youtube 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', - 'md5': '4c8f4a0937752b437c3ebc0ed24802b5', + 'md5': 'fd19aa0cf3a0eea515d4fd5c8c0e9d68', 'info_dict': { 'id': 'Gy8Md3Eky38', 'ext': 'mp4', @@ -93,6 +99,7 @@ class VoxMediaIE(InfoExtractor): 'uploader_id': 'TheVerge', 'upload_date': '20141021', 'uploader': 'The Verge', + 'timestamp': 1413907200, }, 'add_ie': ['Youtube'], 'skip': 'similar to the previous test', @@ -100,13 +107,13 @@ class VoxMediaIE(InfoExtractor): # Volume embed, Youtube 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', 'info_dict': { - 'id': 'YCjDnX-Xzhg', + 'id': '22986359b', 'ext': 'mp4', 'title': "Mississippi's laws are so bad that its anti-LGBTQ law isn't needed to allow discrimination", 'description': 'md5:fc1317922057de31cd74bce91eb1c66c', - 'uploader_id': 'voxdotcom', 'upload_date': '20150915', - 'uploader': 'Vox', + 'timestamp': 1442332800, + 'duration': 285, }, 'add_ie': ['Youtube'], 'skip': 'similar to the previous test', @@ -160,6 +167,9 @@ class VoxMediaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Post-Post-PC CEO: The Full Code Conference Video of Microsoft\'s Satya Nadella', 'description': 'The longtime veteran was chosen earlier this year as the software giant\'s third leader in its history.', + 'timestamp': 1402938000, + 'upload_date': '20140616', + 'duration': 4114, }, 'add_ie': ['VoxMediaVolume'], }] From b8b622fbebb158db95edb05a8cc248668194b430 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 4 Mar 2021 17:57:16 +0100 Subject: [PATCH 079/242] [trovo] Add Origin header to VOD formats(closes #28346) --- youtube_dl/extractor/trovo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/trovo.py b/youtube_dl/extractor/trovo.py index 43745213d..de0107aa9 100644 --- a/youtube_dl/extractor/trovo.py +++ b/youtube_dl/extractor/trovo.py @@ -153,6 +153,7 @@ class TrovoVodIE(TrovoBaseIE): 'protocol': 'm3u8_native', 'tbr': int_or_none(play_info.get('bitrate')), 'url': play_url, + 'http_headers': {'Origin': 'https://trovo.live'}, }) self._sort_formats(formats) From 7f064d50db957d551dccde73ab73318f53ab3b17 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 7 Mar 2021 08:32:37 +0100 Subject: [PATCH 080/242] [cbs] add support for Paramount+ (closes #28342) --- youtube_dl/extractor/cbs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 4a19a73d2..c79e55a75 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -27,7 +27,7 @@ class CBSBaseIE(ThePlatformFeedIE): class CBSIE(CBSBaseIE): - _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' + _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs|paramountplus)\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -52,6 +52,9 @@ class CBSIE(CBSBaseIE): }, { 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, + }, { + 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/', + 'only_matching': True, }] def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): From c6a14755bb9629967fb12536ee8660ca67ff4345 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 8 Mar 2021 16:53:50 +0100 Subject: [PATCH 081/242] [bilibili] fix video info extraction(closes #28341) --- youtube_dl/extractor/bilibili.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 4dc597e16..589fdc1ce 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -156,6 +156,7 @@ class BiliBiliIE(InfoExtractor): cid = js['result']['cid'] headers = { + 'Accept': 'application/json', 'Referer': url } headers.update(self.geo_verification_headers()) From 7dc513487fb0babb5257fa72df87c3f24967f2a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Mar 2021 02:54:10 +0700 Subject: [PATCH 082/242] [pornhub] Extract formats from get_media end point (#28395) --- youtube_dl/extractor/pornhub.py | 40 +++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b7631e4e1..fdf8b1b0d 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -394,6 +394,21 @@ class PornHubIE(PornHubBaseIE): upload_date = None formats = [] + + def add_format(format_url, height=None): + tbr = None + mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', format_url) + if mobj: + if not height: + height = int(mobj.group('height')) + tbr = int(mobj.group('tbr')) + formats.append({ + 'url': format_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + for video_url, height in video_urls: if not upload_date: upload_date = self._search_regex( @@ -410,18 +425,19 @@ class PornHubIE(PornHubBaseIE): video_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) continue - tbr = None - mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', video_url) - if mobj: - if not height: - height = int(mobj.group('height')) - tbr = int(mobj.group('tbr')) - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height if height else None, - 'height': height, - 'tbr': tbr, - }) + if '/video/get_media' in video_url: + medias = self._download_json(video_url, video_id, fatal=False) + if isinstance(medias, list): + for media in medias: + if not isinstance(media, dict): + continue + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + height = int_or_none(media.get('quality')) + add_format(video_url, height) + continue + add_format(video_url) self._sort_formats(formats) video_uploader = self._html_search_regex( From 1a1ccd9a6e8e9e90ab129e89c2524ab3eb9ed2ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Mar 2021 02:56:01 +0700 Subject: [PATCH 083/242] [pornhub] Detect flagged videos --- youtube_dl/extractor/pornhub.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index fdf8b1b0d..2a7818e41 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -167,6 +167,7 @@ class PornHubIE(PornHubBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy', }, { # subtitles 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', @@ -265,7 +266,8 @@ class PornHubIE(PornHubBaseIE): webpage = dl_webpage('pc') error_msg = self._html_search_regex( - r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)
', + (r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', + r'(?s)]+class=["\']noVideo["\'][^>]*>(?P.+?)'), webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) From 477bff69065872fff6bab5c3a1b0512018fbb6eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Mar 2021 03:36:31 +0700 Subject: [PATCH 084/242] Introduce release_timestamp meta field (refs #28386) --- youtube_dl/YoutubeDL.py | 20 ++++++++++++-------- youtube_dl/extractor/common.py | 4 +++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ecac31f7a..8f65c6499 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1511,14 +1511,18 @@ class YoutubeDL(object): if 'display_id' not in info_dict and 'id' in info_dict: info_dict['display_id'] = info_dict['id'] - if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around out-of-range timestamp values (e.g. negative ones on Windows, - # see http://bugs.python.org/issue1646728) - try: - upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') - except (ValueError, OverflowError, OSError): - pass + for ts_key, date_key in ( + ('timestamp', 'upload_date'), + ('release_timestamp', 'release_date'), + ): + if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: + # Working around out-of-range timestamp values (e.g. negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) + info_dict[date_key] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass # Auto generate title fields corresponding to the *_number fields when missing # in order to always have clean titles. This is very common for TV series. diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8eb110f4e..d3b6724df 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -230,8 +230,10 @@ class InfoExtractor(object): uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. + release_timestamp: UNIX timestamp of the moment the video was released. release_date: The date (YYYYMMDD) when the video was released. - timestamp: UNIX timestamp of the moment the video became available. + timestamp: UNIX timestamp of the moment the video became available + (uploaded). upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. From 15c24b0346e3951b43dbf29631bfe65292f53ac5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Mar 2021 03:40:56 +0700 Subject: [PATCH 085/242] [lbry] Extract release_timestamp (closes #28386) --- youtube_dl/extractor/lbry.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index 413215a99..95782366b 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -60,6 +60,7 @@ class LBRYBaseIE(InfoExtractor): 'description': stream_value.get('description'), 'license': stream_value.get('license'), 'timestamp': int_or_none(stream.get('timestamp')), + 'release_timestamp': int_or_none(stream_value.get('release_time')), 'tags': stream_value.get('tags'), 'duration': int_or_none(media.get('duration')), 'channel': try_get(signing_channel, lambda x: x['value']['title']), @@ -92,6 +93,8 @@ class LBRYIE(LBRYBaseIE): 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', 'timestamp': 1595694354, 'upload_date': '20200725', + 'release_timestamp': 1595340697, + 'release_date': '20200721', 'width': 1280, 'height': 720, } @@ -106,6 +109,8 @@ class LBRYIE(LBRYBaseIE): 'description': 'md5:661ac4f1db09f31728931d7b88807a61', 'timestamp': 1591312601, 'upload_date': '20200604', + 'release_timestamp': 1591312421, + 'release_date': '20200604', 'tags': list, 'duration': 2570, 'channel': 'The LBRY Foundation', From bae7dbf78be3a03d8454d1b17bfdbf1bfa0de715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Mar 2021 03:41:21 +0700 Subject: [PATCH 086/242] [bandcamp] Extract release_timestamp --- youtube_dl/extractor/bandcamp.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 69e673a26..006aab3b4 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -49,6 +49,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Ben Prunty', 'timestamp': 1396508491, 'upload_date': '20140403', + 'release_timestamp': 1396483200, 'release_date': '20140403', 'duration': 260.877, 'track': 'Lanius (Battle)', @@ -69,6 +70,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Mastodon', 'timestamp': 1322005399, 'upload_date': '20111122', + 'release_timestamp': 1076112000, 'release_date': '20040207', 'duration': 120.79, 'track': 'Hail to Fire', @@ -197,7 +199,7 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': unified_strdate(tralbum.get('album_release_date')), + 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, From 64ed3af328929f83b02ed57df5cb4f863fdd0389 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 10 Mar 2021 11:45:30 +0100 Subject: [PATCH 087/242] [lbry] add support for channel filters(closes #28385) --- youtube_dl/extractor/lbry.py | 46 ++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index 95782366b..ae43d56ea 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -6,8 +6,10 @@ import json from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_str, compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, @@ -186,17 +188,18 @@ class LBRYChannelIE(LBRYBaseIE): }] _PAGE_SIZE = 50 - def _fetch_page(self, claim_id, url, page): + def _fetch_page(self, claim_id, url, params, page): page += 1 + page_params = { + 'channel_ids': [claim_id], + 'claim_type': 'stream', + 'no_totals': True, + 'page': page, + 'page_size': self._PAGE_SIZE, + } + page_params.update(params) result = self._call_api_proxy( - 'claim_search', claim_id, { - 'channel_ids': [claim_id], - 'claim_type': 'stream', - 'no_totals': True, - 'page': page, - 'page_size': self._PAGE_SIZE, - 'stream_types': self._SUPPORTED_STREAM_TYPES, - }, 'page %d' % page) + 'claim_search', claim_id, page_params, 'page %d' % page) for item in (result.get('items') or []): stream_claim_name = item.get('name') stream_claim_id = item.get('claim_id') @@ -217,8 +220,31 @@ class LBRYChannelIE(LBRYBaseIE): result = self._resolve_url( 'lbry://' + display_id, display_id, 'channel') claim_id = result['claim_id'] + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + content = qs.get('content', [None])[0] + params = { + 'fee_amount': qs.get('fee_amount', ['>=0'])[0], + 'order_by': { + 'new': ['release_time'], + 'top': ['effective_amount'], + 'trending': ['trending_group', 'trending_mixed'], + }[qs.get('order', ['new'])[0]], + 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, + } + duration = qs.get('duration', [None])[0] + if duration: + params['duration'] = { + 'long': '>=1200', + 'short': '<=240', + }[duration] + language = qs.get('language', ['all'])[0] + if language != 'all': + languages = [language] + if language == 'en': + languages.append('none') + params['any_languages'] = languages entries = OnDemandPagedList( - functools.partial(self._fetch_page, claim_id, url), + functools.partial(self._fetch_page, claim_id, url, params), self._PAGE_SIZE) result_value = result.get('value') or {} return self.playlist_result( From fc2c6d53239d4b4a6bac5383441152117ccf3c6f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 10 Mar 2021 13:16:21 +0100 Subject: [PATCH 088/242] [shahid] fix format extraction(closes #28383) --- youtube_dl/extractor/shahid.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 5c2a6206b..b5e093bd2 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -51,13 +51,16 @@ class ShahidIE(ShahidBaseIE): _NETRC_MACHINE = 'shahid' _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?Pepisode|clip|movie)-(?P\d+)' _TESTS = [{ - 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286', + 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924', 'info_dict': { - 'id': '275286', + 'id': '816924', 'ext': 'mp4', - 'title': 'مجلس الشباب الموسم 1 كليب 1', - 'timestamp': 1506988800, - 'upload_date': '20171003', + 'title': 'متحف الدحيح الموسم 1 كليب 1', + 'timestamp': 1602806400, + 'upload_date': '20201016', + 'description': 'برومو', + 'duration': 22, + 'categories': ['كوميديا'], }, 'params': { # m3u8 download @@ -109,12 +112,15 @@ class ShahidIE(ShahidBaseIE): page_type = 'episode' playout = self._call_api( - 'playout/url/' + video_id, video_id)['playout'] + 'playout/new/url/' + video_id, video_id)['playout'] if playout.get('drm'): raise ExtractorError('This video is DRM protected.', expected=True) - formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4') + formats = self._extract_m3u8_formats(re.sub( + # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html + r'aws\.manifestfilter=[\w:;,-]+&?', + '', playout['url']), video_id, 'mp4') self._sort_formats(formats) # video = self._call_api( From 9c644a641922e5ac3b5b4a1c9386fa599973e885 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 09:51:01 +0100 Subject: [PATCH 089/242] [fujitv] fix HLS formats extension(closes #28416) --- youtube_dl/extractor/fujitv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/fujitv.py b/youtube_dl/extractor/fujitv.py index 39685e075..a02a94374 100644 --- a/youtube_dl/extractor/fujitv.py +++ b/youtube_dl/extractor/fujitv.py @@ -17,7 +17,7 @@ class FujiTVFODPlus7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) formats = self._extract_m3u8_formats( - self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) + self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id, 'mp4') for f in formats: wh = self._BITRATE_MAP.get(f.get('tbr')) if wh: From 43d986acd8bf7247725fc9de34648c0eda560daf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 10:14:28 +0100 Subject: [PATCH 090/242] [tver] improve title extraction(closes #28418) --- youtube_dl/extractor/tver.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index 931d4d650..a54f49319 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -9,6 +9,7 @@ from ..utils import ( int_or_none, remove_start, smuggle_url, + strip_or_none, try_get, ) @@ -25,6 +26,10 @@ class TVerIE(InfoExtractor): }, { 'url': 'https://tver.jp/episode/79622438', 'only_matching': True, + }, { + # subtitle = ' ' + 'url': 'https://tver.jp/corner/f0068870', + 'only_matching': True, }] _TOKEN = None BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' @@ -47,8 +52,12 @@ class TVerIE(InfoExtractor): } if service == 'cx': + title = main['title'] + subtitle = strip_or_none(main.get('subtitle')) + if subtitle: + title += ' - ' + subtitle info.update({ - 'title': main.get('subtitle') or main['title'], + 'title': title, 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), 'ie_key': 'FujiTVFODPlus7', }) From ef414343e5fa2bc4fddae3097ecde5a8e32c2d4c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 10:48:58 +0100 Subject: [PATCH 091/242] [peertube] improve thumbnail extraction(closes #28419) --- youtube_dl/extractor/peertube.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py index 32ff51653..d9b13adc2 100644 --- a/youtube_dl/extractor/peertube.py +++ b/youtube_dl/extractor/peertube.py @@ -599,11 +599,13 @@ class PeerTubeIE(InfoExtractor): else: age_limit = None + webpage_url = 'https://%s/videos/watch/%s' % (host, video_id) + return { 'id': video_id, 'title': title, 'description': description, - 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), 'uploader': account_data('displayName', compat_str), 'uploader_id': str_or_none(account_data('id', int)), @@ -621,5 +623,6 @@ class PeerTubeIE(InfoExtractor): 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, - 'subtitles': subtitles + 'subtitles': subtitles, + 'webpage_url': webpage_url, } From 1182f9567b86f2af747cdb8769ab87649c8ce4c2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Mar 2021 18:11:11 +0100 Subject: [PATCH 092/242] [pinterest] reduce the number of HLS format requests --- youtube_dl/extractor/pinterest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pinterest.py b/youtube_dl/extractor/pinterest.py index b249c9eda..42528d746 100644 --- a/youtube_dl/extractor/pinterest.py +++ b/youtube_dl/extractor/pinterest.py @@ -31,6 +31,7 @@ class PinterestBaseIE(InfoExtractor): title = (data.get('title') or data.get('grid_title') or video_id).strip() + urls = [] formats = [] duration = None if extract_formats: @@ -38,8 +39,9 @@ class PinterestBaseIE(InfoExtractor): if not isinstance(format_dict, dict): continue format_url = url_or_none(format_dict.get('url')) - if not format_url: + if not format_url or format_url in urls: continue + urls.append(format_url) duration = float_or_none(format_dict.get('duration'), scale=1000) ext = determine_ext(format_url) if 'hls' in format_id.lower() or ext == 'm3u8': From 60845121ca2f49172e7cd941c0cb43363cb86e46 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 13 Mar 2021 15:19:24 +0100 Subject: [PATCH 093/242] [sportdeutschland] fix extraction(closes #21856)(closes #28425) --- youtube_dl/extractor/sportdeutschland.py | 145 +++++++++++++---------- 1 file changed, 84 insertions(+), 61 deletions(-) diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 378fc7568..3e497a939 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -1,82 +1,105 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( + clean_html, + float_or_none, + int_or_none, parse_iso8601, - sanitized_Request, + strip_or_none, + try_get, ) class SportDeutschlandIE(InfoExtractor): - _VALID_URL = r'https?://sportdeutschland\.tv/(?P[^/?#]+)/(?P[^?#/]+)(?:$|[?#])' + _VALID_URL = r'https?://sportdeutschland\.tv/(?P(?:[^/]+/)?[^?#/&]+)' _TESTS = [{ 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', 'info_dict': { - 'id': 're-live-deutsche-meisterschaften-2020-halbfinals', + 'id': '5318cac0275701382770543d7edaf0a0', 'ext': 'mp4', - 'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals', - 'categories': ['Badminton-Deutschland'], - 'view_count': int, - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': int, - 'upload_date': '20200201', - 'description': 're:.*', # meaningless description for THIS video + 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1', + 'duration': 16106.36, }, + 'params': { + 'noplaylist': True, + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'info_dict': { + 'id': 'c6e2fdd01f63013854c47054d2ab776f', + 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals', + 'description': 'md5:5263ff4c31c04bb780c9f91130b48530', + 'duration': 31397, + }, + 'playlist_count': 2, + }, { + 'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - sport_id = mobj.group('sport') - - api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( - sport_id, video_id) - req = sanitized_Request(api_url, headers={ - 'Accept': 'application/vnd.vidibus.v2.html+json', - 'Referer': url, - }) - data = self._download_json(req, video_id) - + display_id = self._match_id(url) + data = self._download_json( + 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id, + display_id, query={'access_token': 'true'}) asset = data['asset'] - categories = [data['section']['title']] - - formats = [] - smil_url = asset['video'] - if '.smil' in smil_url: - m3u8_url = smil_url.replace('.smil', '.m3u8') - formats.extend( - self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')) + title = (asset.get('title') or asset['label']).strip() + asset_id = asset.get('id') or asset.get('uuid') + info = { + 'id': asset_id, + 'title': title, + 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'), + 'duration': int_or_none(asset.get('seconds')), + } + videos = asset.get('videos') or [] + if len(videos) > 1: + playlist_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('playlistId', [None])[0] + if playlist_id: + if self._downloader.params.get('noplaylist'): + videos = [videos[int(playlist_id)]] + self.to_screen('Downloading just a single video because of --no-playlist') + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id) - smil_doc = self._download_xml( - smil_url, video_id, note='Downloading SMIL metadata') - base_url_el = smil_doc.find('./head/meta') - if base_url_el: - base_url = base_url_el.attrib['base'] - formats.extend([{ - 'format_id': 'rmtp', - 'url': base_url if base_url_el else n.attrib['src'], - 'play_path': n.attrib['src'], - 'ext': 'flv', - 'preference': -100, - 'format_note': 'Seems to fail at example stream', - } for n in smil_doc.findall('./body/video')]) + def entries(): + for i, video in enumerate(videos, 1): + video_id = video.get('uuid') + video_url = video.get('url') + if not (video_id and video_url): + continue + formats = self._extract_m3u8_formats( + video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False) + if not formats: + continue + yield { + 'id': video_id, + 'formats': formats, + 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i), + 'duration': float_or_none(video.get('duration')), + } + info.update({ + '_type': 'multi_video', + 'entries': entries(), + }) else: - formats.append({'url': smil_url}) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': asset['title'], - 'thumbnail': asset.get('image'), - 'description': asset.get('teaser'), - 'duration': asset.get('duration'), - 'categories': categories, - 'view_count': asset.get('views'), - 'rtmp_live': asset.get('live'), - 'timestamp': parse_iso8601(asset.get('date')), - } + formats = self._extract_m3u8_formats( + videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4') + section_title = strip_or_none(try_get(data, lambda x: x['section']['title'])) + info.update({ + 'formats': formats, + 'display_id': asset.get('permalink'), + 'thumbnail': try_get(asset, lambda x: x['images'][0]), + 'categories': [section_title] if section_title else None, + 'view_count': int_or_none(asset.get('views')), + 'is_live': asset.get('is_live') is True, + 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')), + }) + return info From 1860d0f41cf50b1a0876174c4e1ee7adbbd4a0f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Mar 2021 09:26:54 +0700 Subject: [PATCH 094/242] [southpark] Fix extraction and add support for southparkstudios.com (closes #26763, closes #28413) --- youtube_dl/extractor/southpark.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index da75a43a7..0774da06e 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -6,9 +6,9 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?Psouthpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?Psouthpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P.+?)(\?|#|$))' - _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _TESTS = [{ 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', @@ -23,8 +23,20 @@ class SouthParkIE(MTVServicesInfoExtractor): }, { 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', 'only_matching': True, + }, { + 'url': 'https://www.southparkstudios.com/episodes/h4o269/south-park-stunning-and-brave-season-19-ep-1', + 'only_matching': True, }] + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'shared.southpark.global', + 'ep': '90877963', + 'imageEp': 'shared.southpark.global', + 'mgid': uri, + } + class SouthParkEsIE(SouthParkIE): IE_NAME = 'southpark.cc.com:español' From b509d24b2fef8d5994b7d925db51befcbbf996fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Mar 2021 09:36:11 +0700 Subject: [PATCH 095/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 238ca3965..924d202b0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Core ++ Introduce release_timestamp meta field (#28386) + +Extractors ++ [southpark] Add support for southparkstudios.com (#28413) +* [southpark] Fix extraction (#26763, #28413) +* [sportdeutschland] Fix extraction (#21856, #28425) +* [pinterest] Reduce the number of HLS format requests +* [peertube] Improve thumbnail extraction (#28419) +* [tver] Improve title extraction (#28418) +* [fujitv] Fix HLS formats extension (#28416) +* [shahid] Fix format extraction (#28383) ++ [lbry] Add support for channel filters (#28385) ++ [bandcamp] Extract release timestamp ++ [lbry] Extract release timestamp (#28386) +* [pornhub] Detect flagged videos ++ [pornhub] Extract formats from get_media end point (#28395) +* [bilibili] Fix video info extraction (#28341) ++ [cbs] Add support for Paramount+ (#28342) ++ [trovo] Add Origin header to VOD formats (#28346) +* [voxmedia] Fix volume embed extraction (#28338) + + version 2021.03.03 Extractors From ebfd66c4b1d6ffabd8a5bc52737f2bacac341d3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Mar 2021 09:38:16 +0700 Subject: [PATCH 096/242] release 2021.03.14 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index a8eba3214..9cc79eff4 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.03 + [debug] youtube-dl version 2021.03.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 7d59a9f2d..3296e44a8 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 523408f03..f74c29736 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 6e9e094e4..ae9e273ef 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.03 + [debug] youtube-dl version 2021.03.14 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 46af4e420..04fbea2f6 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.03** +- [ ] I've verified that I'm running youtube-dl version **2021.03.14** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 924d202b0..73fe316b9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.14 Core + Introduce release_timestamp meta field (#28386) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a1c68e384..5a540119c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.03' +__version__ = '2021.03.14' From 9955bb4a2704f98b74a448c82dfd690ec6775b8d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Mar 2021 14:49:23 +0100 Subject: [PATCH 097/242] [rtve] improve extraction - extract all formats - fix RTVE Infantil extraction(closes #24851) - extract is_live and series --- youtube_dl/extractor/rtve.py | 232 ++++++++++++++++------------------- 1 file changed, 104 insertions(+), 128 deletions(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index ce9db0629..d2fb754cf 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -2,8 +2,9 @@ from __future__ import unicode_literals import base64 +import io import re -import time +import sys from .common import InfoExtractor from ..compat import ( @@ -14,56 +15,13 @@ from ..utils import ( determine_ext, ExtractorError, float_or_none, + qualities, remove_end, remove_start, - sanitized_Request, std_headers, ) - -def _decrypt_url(png): - encrypted_data = compat_b64decode(png) - text_index = encrypted_data.find(b'tEXt') - text_chunk = encrypted_data[text_index - 4:] - length = compat_struct_unpack('!I', text_chunk[:4])[0] - # Use bytearray to get integers when iterating in both python 2.x and 3.x - data = bytearray(text_chunk[8:8 + length]) - data = [chr(b) for b in data if b != 0] - hash_index = data.index('#') - alphabet_data = data[:hash_index] - url_data = data[hash_index + 1:] - if url_data[0] == 'H' and url_data[3] == '%': - # remove useless HQ%% at the start - url_data = url_data[4:] - - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data: - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data: - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - return url +_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x)) class RTVEALaCartaIE(InfoExtractor): @@ -79,28 +37,31 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', 'duration': 5024.566, + 'series': 'Balonmano', }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'note': 'Live stream', 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', 'info_dict': { 'id': '1694255', - 'ext': 'flv', - 'title': 'TODO', + 'ext': 'mp4', + 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': 'live stream', }, - 'skip': 'The f4m manifest can\'t be used yet', }, { 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'e55e162379ad587e9640eda4f7353c0f', + 'md5': 'd850f3c8731ea53952ebab489cf81cbf', 'info_dict': { 'id': '4236788', 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104 ', + 'title': 'Servir y proteger - Capítulo 104', 'duration': 3222.0, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, @@ -111,58 +72,102 @@ class RTVEALaCartaIE(InfoExtractor): def _real_initialize(self): user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') - manager_info = self._download_json( + self._manager = self._download_json( 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info') - self._manager = manager_info['manager'] + None, 'Fetching manager info')['manager'] + + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) + while True: + length = compat_struct_unpack('!I', encrypted_data.read(4))[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + alphabet_data, text = data.split(b'\0') + quality, url_data = text.split(b'%%') + alphabet = [] + e = 0 + d = 0 + for l in _bytes_to_chr(alphabet_data): + if d == 0: + alphabet.append(l) + d = e = (e + 1) % 4 + else: + d -= 1 + url = '' + f = 0 + e = 3 + b = 1 + for letter in _bytes_to_chr(url_data): + if f == 0: + l = int(letter) * 10 + f = 1 + else: + if e == 0: + l += int(letter) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + + yield quality.decode(), url + encrypted_data.read(4) # CRC + + def _extract_png_formats(self, video_id): + png = self._download_webpage( + 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id), + video_id, 'Downloading url information', query={'q': 'v2'}) + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + formats = [] + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, 'dash', fatal=False)) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + self._sort_formats(formats) + return formats def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) info = self._download_json( 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, video_id)['page']['items'][0] if info['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) - title = info['title'] - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) - png_request = sanitized_Request(png_url) - png_request.add_header('Referer', url) - png = self._download_webpage(png_request, video_id, 'Downloading url information') - video_url = _decrypt_url(png) - ext = determine_ext(video_url) - - formats = [] - if not video_url.endswith('.f4m') and ext != 'm3u8': - if '?' not in video_url: - video_url = video_url.replace('resources/', 'auth/resources/') - video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') - - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id='hds', fatal=False)) - else: - formats.append({ - 'url': video_url, - }) - self._sort_formats(formats) + title = info['title'].strip() + formats = self._extract_png_formats(video_id) subtitles = None - if info.get('sbtFile') is not None: - subtitles = self.extract_subtitles(video_id, info['sbtFile']) + sbt_file = info.get('sbtFile') + if sbt_file: + subtitles = self.extract_subtitles(video_id, sbt_file) + + is_live = info.get('live') is True return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': formats, 'thumbnail': info.get('image'), - 'page_url': url, 'subtitles': subtitles, - 'duration': float_or_none(info.get('duration'), scale=1000), + 'duration': float_or_none(info.get('duration'), 1000), + 'is_live': is_live, + 'series': info.get('programTitle'), } def _get_subtitles(self, video_id, sub_file): @@ -174,48 +179,26 @@ class RTVEALaCartaIE(InfoExtractor): for s in subs) -class RTVEInfantilIE(InfoExtractor): +class RTVEInfantilIE(RTVEALaCartaIE): IE_NAME = 'rtve.es:infantil' IE_DESC = 'RTVE infantil' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P[^/]*)/video/(?P[^/]*)/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P[0-9]+)/' _TESTS = [{ 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', - 'md5': '915319587b33720b8e0357caaa6617e6', + 'md5': '5747454717aedf9f9fdf212d1bcfc48d', 'info_dict': { 'id': '3040283', 'ext': 'mp4', 'title': 'Maneras de vivir', - 'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG', + 'thumbnail': r're:https?://.+/1426182947956\.JPG', 'duration': 357.958, }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }] - def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json( - 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, - video_id)['page']['items'][0] - - webpage = self._download_webpage(url, video_id) - vidplayer_id = self._search_regex( - r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id - png = self._download_webpage(png_url, video_id, 'Downloading url information') - video_url = _decrypt_url(png) - - return { - 'id': video_id, - 'ext': 'mp4', - 'title': info['title'], - 'url': video_url, - 'thumbnail': info.get('image'), - 'duration': float_or_none(info.get('duration'), scale=1000), - } - - -class RTVELiveIE(InfoExtractor): +class RTVELiveIE(RTVEALaCartaIE): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)' @@ -225,7 +208,7 @@ class RTVELiveIE(InfoExtractor): 'info_dict': { 'id': 'la-1', 'ext': 'mp4', - 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', + 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', }, 'params': { 'skip_download': 'live stream', @@ -234,29 +217,22 @@ class RTVELiveIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - start_time = time.gmtime() video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') title = remove_start(title, 'Estoy viendo ') - title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) vidplayer_id = self._search_regex( (r'playerId=player([0-9]+)', r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', r'data-id=["\'](\d+)'), webpage, 'internal video ID') - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id - png = self._download_webpage(png_url, video_id, 'Downloading url information') - m3u8_url = _decrypt_url(png) - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') - self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'formats': formats, + 'title': self._live_title(title), + 'formats': self._extract_png_formats(vidplayer_id), 'is_live': True, } From 3be098010f667b14075e3dfad1e74e5e2becc8ea Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Mar 2021 20:08:46 +0100 Subject: [PATCH 098/242] [applepodcasts] fix extraction(closes #28445) --- youtube_dl/extractor/applepodcasts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py index 95758fece..6a74de758 100644 --- a/youtube_dl/extractor/applepodcasts.py +++ b/youtube_dl/extractor/applepodcasts.py @@ -42,6 +42,7 @@ class ApplePodcastsIE(InfoExtractor): ember_data = self._parse_json(self._search_regex( r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', webpage, 'ember data'), episode_id) + ember_data = ember_data.get(episode_id) or ember_data episode = ember_data['data']['attributes'] description = episode.get('description') or {} From 357bfe251d7f4f8bb9319bc6531a3813b5a355a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Mar 2021 20:42:20 +0700 Subject: [PATCH 099/242] [svtplay] Improve extraction (closes #28448) --- youtube_dl/extractor/svt.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 4acc29fce..aba9bb447 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -146,18 +146,19 @@ class SVTPlayIE(SVTPlayBaseIE): ) (?P[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P[^/?#&]+) + (?:.*?modalId=(?P[\da-zA-Z-]+))? ) ''' _TESTS = [{ - 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'url': 'https://www.svtplay.se/video/30479064', 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': 'jNwpV9P', + 'id': '8zVbDPA', 'ext': 'mp4', - 'title': 'Det här är himlen', - 'timestamp': 1586044800, - 'upload_date': '20200405', - 'duration': 3515, + 'title': 'Designdrömmar i Stenungsund', + 'timestamp': 1615770000, + 'upload_date': '20210315', + 'duration': 3519, 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { @@ -173,6 +174,9 @@ class SVTPlayIE(SVTPlayBaseIE): # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B 'skip_download': True, }, + }, { + 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', + 'only_matching': True, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -219,7 +223,8 @@ class SVTPlayIE(SVTPlayBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id, svt_id = mobj.group('id', 'svt_id') + video_id = mobj.group('id') + svt_id = mobj.group('svt_id') or mobj.group('modal_id') if svt_id: return self._extract_by_video_id(svt_id) @@ -254,6 +259,7 @@ class SVTPlayIE(SVTPlayBaseIE): if not svt_id: svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\bmodalId=([\da-zA-Z-]+)' % re.escape(video_id), r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', From f912d6c8cf5a68d576abe4426e12554c3404a7dd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Mar 2021 21:43:53 +0100 Subject: [PATCH 100/242] [mlb] fix video extracion(#21241) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/mlb.py | 189 +++++++++++++++++++++++++---- 2 files changed, 172 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 07a8af055..c2f67323b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -673,7 +673,10 @@ from .mixcloud import ( MixcloudUserIE, MixcloudPlaylistIE, ) -from .mlb import MLBIE +from .mlb import ( + MLBIE, + MLBVideoIE, +) from .mnet import MnetIE from .moevideo import MoeVideoIE from .mofosex import ( diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index b907f6b49..b69301d97 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -1,15 +1,91 @@ from __future__ import unicode_literals -from .nhl import NHLBaseIE +import re +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + try_get, +) -class MLBIE(NHLBaseIE): + +class MLBBaseIE(InfoExtractor): + def _real_extract(self, url): + display_id = self._match_id(url) + video = self._download_video_data(display_id) + video_id = video['id'] + title = video['title'] + feed = self._get_feed(video) + + formats = [] + for playback in (feed.get('playbacks') or []): + playback_url = playback.get('url') + if not playback_url: + continue + name = playback.get('name') + ext = determine_ext(playback_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + playback_url, video_id, 'mp4', + 'm3u8_native', m3u8_id=name, fatal=False)) + else: + f = { + 'format_id': name, + 'url': playback_url, + } + mobj = re.search(r'_(\d+)K_(\d+)X(\d+)', name) + if mobj: + f.update({ + 'height': int(mobj.group(3)), + 'tbr': int(mobj.group(1)), + 'width': int(mobj.group(2)), + }) + mobj = re.search(r'_(\d+)x(\d+)_(\d+)_(\d+)K\.mp4', playback_url) + if mobj: + f.update({ + 'fps': int(mobj.group(3)), + 'height': int(mobj.group(2)), + 'tbr': int(mobj.group(4)), + 'width': int(mobj.group(1)), + }) + formats.append(f) + self._sort_formats(formats) + + thumbnails = [] + for cut in (try_get(feed, lambda x: x['image']['cuts'], list) or []): + src = cut.get('src') + if not src: + continue + thumbnails.append({ + 'height': int_or_none(cut.get('height')), + 'url': src, + 'width': int_or_none(cut.get('width')), + }) + + language = (video.get('language') or 'EN').lower() + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': video.get('description'), + 'duration': parse_duration(feed.get('duration')), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(video.get(self._TIMESTAMP_KEY)), + 'subtitles': self._extract_mlb_subtitles(feed, language), + } + + +class MLBIE(MLBBaseIE): _VALID_URL = r'''(?x) https?:// - (?:[\da-z_-]+\.)*(?Pmlb)\.com/ + (?:[\da-z_-]+\.)*mlb\.com/ (?: (?: - (?:[^/]+/)*c-| + (?:[^/]+/)*video/[^/]+/c-| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| @@ -18,7 +94,6 @@ class MLBIE(NHLBaseIE): (?P\d+) ) ''' - _CONTENT_DOMAIN = 'content.mlb.com' _TESTS = [ { 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', @@ -76,18 +151,6 @@ class MLBIE(NHLBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', }, }, - { - 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098', - 'md5': 'e09e37b552351fddbf4d9e699c924d68', - 'info_dict': { - 'id': '75609783', - 'ext': 'mp4', - 'title': 'Must C: Pillar climbs for catch', - 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', - 'timestamp': 1429139220, - 'upload_date': '20150415', - } - }, { 'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694', 'only_matching': True, @@ -113,8 +176,92 @@ class MLBIE(NHLBaseIE): 'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb', 'only_matching': True, }, - { - 'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842', - 'only_matching': True, - } ] + _TIMESTAMP_KEY = 'date' + + @staticmethod + def _get_feed(video): + return video + + @staticmethod + def _extract_mlb_subtitles(feed, language): + subtitles = {} + for keyword in (feed.get('keywordsAll') or []): + keyword_type = keyword.get('type') + if keyword_type and keyword_type.startswith('closed_captions_location_'): + cc_location = keyword.get('value') + if cc_location: + subtitles.setdefault(language, []).append({ + 'url': cc_location, + }) + return subtitles + + def _download_video_data(self, display_id): + return self._download_json( + 'http://content.mlb.com/mlb/item/id/v1/%s/details/web-v1.json' % display_id, + display_id) + + +class MLBVideoIE(MLBBaseIE): + _VALID_URL = r'https?://(?:www\.)?mlb\.com/(?:[^/]+/)*video/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.mlb.com/mariners/video/ackley-s-spectacular-catch-c34698933', + 'md5': '632358dacfceec06bad823b83d21df2d', + 'info_dict': { + 'id': 'c04a8863-f569-42e6-9f87-992393657614', + 'ext': 'mp4', + 'title': "Ackley's spectacular catch", + 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', + 'duration': 66, + 'timestamp': 1405995000, + 'upload_date': '20140722', + 'thumbnail': r're:^https?://.+', + }, + } + _TIMESTAMP_KEY = 'timestamp' + + @classmethod + def suitable(cls, url): + return False if MLBIE.suitable(url) else super(MLBVideoIE, cls).suitable(url) + + @staticmethod + def _get_feed(video): + return video['feeds'][0] + + @staticmethod + def _extract_mlb_subtitles(feed, language): + subtitles = {} + for cc_location in (feed.get('closedCaptions') or []): + subtitles.setdefault(language, []).append({ + 'url': cc_location, + }) + + def _download_video_data(self, display_id): + # https://www.mlb.com/data-service/en/videos/[SLUG] + return self._download_json( + 'https://fastball-gateway.mlb.com/graphql', + display_id, query={ + 'query': '''{ + mediaPlayback(ids: "%s") { + description + feeds(types: CMS) { + closedCaptions + duration + image { + cuts { + width + height + src + } + } + playbacks { + name + url + } + } + id + timestamp + title + } +}''' % display_id, + })['data']['mediaPlayback'][0] From fa6bf0a7112e83d36567072985d56440bb34de72 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 19 Mar 2021 12:37:22 +0100 Subject: [PATCH 101/242] [vvvvid] fix kenc format extraction(closes #28473) --- youtube_dl/extractor/vvvvid.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py index 7c94c4ee2..bc196f8a0 100644 --- a/youtube_dl/extractor/vvvvid.py +++ b/youtube_dl/extractor/vvvvid.py @@ -182,17 +182,20 @@ class VVVVIDIE(InfoExtractor): if not embed_code: continue embed_code = ds(embed_code) - if video_type in ('video/rcs', 'video/kenc'): - if video_type == 'video/kenc': - kenc = self._download_json( - 'https://www.vvvvid.it/kenc', video_id, query={ - 'action': 'kt', - 'conn_id': self._conn_id, - 'url': embed_code, - }, fatal=False) or {} - kenc_message = kenc.get('message') - if kenc_message: - embed_code += '?' + ds(kenc_message) + if video_type == 'video/kenc': + embed_code = re.sub(r'https?(://[^/]+)/z/', r'https\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') + kenc = self._download_json( + 'https://www.vvvvid.it/kenc', video_id, query={ + 'action': 'kt', + 'conn_id': self._conn_id, + 'url': embed_code, + }, fatal=False) or {} + kenc_message = kenc.get('message') + if kenc_message: + embed_code += '?' + ds(kenc_message) + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif video_type == 'video/rcs': formats.extend(self._extract_akamai_formats(embed_code, video_id)) elif video_type == 'video/youtube': info.update({ From 7e79ba7dd6e6649dd2ce3a74004b2044f2182881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 20 Mar 2021 05:45:36 +0700 Subject: [PATCH 102/242] [vimeo:album] Fix extraction for albums with number of videos multiple to page size (closes #28486) --- youtube_dl/extractor/vimeo.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bd2663fe0..955651bec 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -939,11 +939,15 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): } if hashed_pass: query['_hashed_pass'] = hashed_pass - videos = self._download_json( - 'https://api.vimeo.com/albums/%s/videos' % album_id, - album_id, 'Downloading page %d' % api_page, query=query, headers={ - 'Authorization': 'jwt ' + authorization, - })['data'] + try: + videos = self._download_json( + 'https://api.vimeo.com/albums/%s/videos' % album_id, + album_id, 'Downloading page %d' % api_page, query=query, headers={ + 'Authorization': 'jwt ' + authorization, + })['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + return for video in videos: link = video.get('link') if not link: From 21ccd0d7f46002acc61eb21bd0d4e492064c7fe1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 21 Mar 2021 09:10:38 +0100 Subject: [PATCH 103/242] [tiktok] detect private videos(closes #28453) --- youtube_dl/extractor/tiktok.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index ea1beb8af..4faa6de54 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -107,9 +107,12 @@ class TikTokIE(TikTokBaseIE): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - data = self._parse_json(self._search_regex( + page_props = self._parse_json(self._search_regex( r']+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s* Date: Mon, 22 Mar 2021 14:56:58 +0100 Subject: [PATCH 104/242] [vgtv] Add support for new tv.aftonbladet.se URL schema (#28514) Co-authored-by: Sergey M --- youtube_dl/extractor/vgtv.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index fe7a26b62..22e99e8f0 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -23,6 +23,8 @@ class VGTVIE(XstreamIE): 'fvn.no/fvntv': 'fvntv', 'aftenposten.no/webtv': 'aptv', 'ap.vgtv.no/webtv': 'aptv', + 'tv.aftonbladet.se': 'abtv', + # obsolete URL schemas, kept in order to save one HTTP redirect 'tv.aftonbladet.se/abtv': 'abtv', 'www.aftonbladet.se/tv': 'abtv', } @@ -140,6 +142,10 @@ class VGTVIE(XstreamIE): 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', 'only_matching': True, }, + { + 'url': 'https://tv.aftonbladet.se/video/36015/vulkanutbrott-i-rymden-nu-slapper-nasa-bilderna', + 'only_matching': True, + }, { 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'only_matching': True, From 8117d613acdd0a2874e52bfa52c3574f46e3a4fb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 22 Mar 2021 15:58:44 +0100 Subject: [PATCH 105/242] [zingmp3] fix extraction(closes #11589, closes #16409, closes #16968, closes #27205) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/zingmp3.py | 208 ++++++++++++++++------------- 2 files changed, 117 insertions(+), 96 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c2f67323b..8b55947f6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1624,5 +1624,8 @@ from .zattoo import ( ) from .zdf import ZDFIE, ZDFChannelIE from .zhihu import ZhihuIE -from .zingmp3 import ZingMp3IE +from .zingmp3 import ( + ZingMp3IE, + ZingMp3AlbumIE, +) from .zype import ZypeIE diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index adfdcaabf..207c04f5e 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -1,93 +1,94 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - update_url_query, ) -class ZingMp3BaseInfoExtractor(InfoExtractor): +class ZingMp3BaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?:%s)/[^/]+/(?P\w+)\.html' + _GEO_COUNTRIES = ['VN'] - def _extract_item(self, item, page_type, fatal=True): - error_message = item.get('msg') - if error_message: - if not fatal: - return - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), - expected=True) + def _extract_item(self, item, fatal): + item_id = item['id'] + title = item.get('name') or item['title'] formats = [] - for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])): - if not source_url or source_url == 'require vip': + for k, v in (item.get('source') or {}).items(): + if not v: continue - if not re.match(r'https?://', source_url): - source_url = '//' + source_url - source_url = self._proto_relative_url(source_url, 'http:') - quality_num = int_or_none(quality) - f = { - 'format_id': quality, - 'url': source_url, - } - if page_type == 'video': - f.update({ - 'height': quality_num, - 'ext': 'mp4', - }) + if k in ('mp4', 'hls'): + for res, video_url in v.items(): + if not video_url: + continue + if k == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, item_id, 'mp4', + 'm3u8_native', m3u8_id=k, fatal=False)) + elif k == 'mp4': + formats.append({ + 'format_id': 'mp4-' + res, + 'url': video_url, + 'height': int_or_none(self._search_regex( + r'^(\d+)p', res, 'resolution', default=None)), + }) else: - f.update({ - 'abr': quality_num, + formats.append({ 'ext': 'mp3', + 'format_id': k, + 'tbr': int_or_none(k), + 'url': self._proto_relative_url(v), + 'vcodec': 'none', }) - formats.append(f) + if not formats: + if not fatal: + return + msg = item['msg'] + if msg == 'Sorry, this content is not available in your country.': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(msg, expected=True) + self._sort_formats(formats) + + subtitles = None + lyric = item.get('lyric') + if lyric: + subtitles = { + 'origin': [{ + 'url': lyric, + }], + } - cover = item.get('cover') + album = item.get('album') or {} return { - 'title': (item.get('name') or item.get('title')).strip(), + 'id': item_id, + 'title': title, 'formats': formats, - 'thumbnail': 'http:/' + cover if cover else None, - 'artist': item.get('artist'), + 'thumbnail': item.get('thumbnail'), + 'subtitles': subtitles, + 'duration': int_or_none(item.get('duration')), + 'track': title, + 'artist': item.get('artists_names'), + 'album': album.get('name') or album.get('title'), + 'album_artist': album.get('artists_names'), } - def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None): - player_json = self._download_json(player_json_url, id, 'Downloading Player JSON') - items = player_json['data'] - if 'item' in items: - items = items['item'] - - if len(items) == 1: - # one single song - data = self._extract_item(items[0], page_type) - data['id'] = id - - return data - else: - # playlist of songs - entries = [] - - for i, item in enumerate(items, 1): - entry = self._extract_item(item, page_type, fatal=False) - if not entry: - continue - entry['id'] = '%s-%d' % (id, i) - entries.append(entry) - - return { - '_type': 'playlist', - 'id': id, - 'title': playlist_title, - 'entries': entries, - } + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage( + url.replace('://zingmp3.vn/', '://mp3.zing.vn/'), + page_id, query={'play_song': 1}) + data_path = self._search_regex( + r'data-xml="([^"]+)', webpage, 'data path') + return self._process_data(self._download_json( + 'https://mp3.zing.vn/xhr' + data_path, page_id)['data']) -class ZingMp3IE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P\w+)\.html' +class ZingMp3IE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip' _TESTS = [{ 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'md5': 'ead7ae13693b3205cbc89536a077daed', @@ -95,49 +96,66 @@ class ZingMp3IE(ZingMp3BaseInfoExtractor): 'id': 'ZWZB9WAB', 'title': 'Xa Mãi Xa', 'ext': 'mp3', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.+\.jpg', + 'subtitles': { + 'origin': [{ + 'ext': 'lrc', + }] + }, + 'duration': 255, + 'track': 'Xa Mãi Xa', + 'artist': 'Bảo Thy', + 'album': 'Special Album', + 'album_artist': 'Bảo Thy', }, }, { - 'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html', - 'md5': '870295a9cd8045c0e15663565902618d', + 'url': 'https://mp3.zing.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html', + 'md5': 'e9c972b693aa88301ef981c8151c4343', 'info_dict': { - 'id': 'ZW6BAEA0', - 'title': 'Let It Go (Frozen OST)', + 'id': 'ZO8ZF7C7', + 'title': 'Sương Hoa Đưa Lối', 'ext': 'mp4', + 'thumbnail': r're:^https?://.+\.jpg', + 'duration': 207, + 'track': 'Sương Hoa Đưa Lối', + 'artist': 'K-ICM, RYO', }, }, { + 'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', + 'only_matching': True, + }] + IE_NAME = 'zingmp3' + IE_DESC = 'mp3.zing.vn' + + def _process_data(self, data): + return self._extract_item(data, True) + + +class ZingMp3AlbumIE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'album|playlist' + _TESTS = [{ 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', 'info_dict': { '_type': 'playlist', 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless', + 'title': 'Lâu Đài Tình Ái', }, 'playlist_count': 10, - 'skip': 'removed at the request of the owner', }, { 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', 'only_matching': True, + }, { + 'url': 'https://zingmp3.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', + 'only_matching': True, }] - IE_NAME = 'zingmp3' - IE_DESC = 'mp3.zing.vn' - - def _real_extract(self, url): - page_id = self._match_id(url) - - webpage = self._download_webpage(url, page_id) - - player_json_url = self._search_regex([ - r'data-xml="([^"]+)', - r'&xmlURL=([^&]+)&' - ], webpage, 'player xml url') - - playlist_title = None - page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type') - if page_type == 'video': - player_json_url = update_url_query(player_json_url, {'format': 'json'}) - else: - player_json_url = player_json_url.replace('/xml/', '/html5xml/') - if page_type == 'album': - playlist_title = self._og_search_title(webpage) - - return self._extract_player_json(player_json_url, page_id, page_type, playlist_title) + IE_NAME = 'zingmp3:album' + + def _process_data(self, data): + def entries(): + for item in (data.get('items') or []): + entry = self._extract_item(item, False) + if entry: + yield entry + info = data.get('info') or {} + return self.playlist_result( + entries(), info.get('id'), info.get('name') or info.get('title')) From 5208ae92fc3e2916cdccae45c6b9a516be3d5796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 24 Mar 2021 02:57:35 +0700 Subject: [PATCH 106/242] [youtube] Fix default value for youtube_include_dash_manifest (closes #28523) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eb5a58807..badca3977 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1617,7 +1617,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f['format_id'] = itag formats.append(f) - if self._downloader.params.get('youtube_include_dash_manifest'): + if self._downloader.params.get('youtube_include_dash_manifest', True): dash_manifest_url = streaming_data.get('dashManifestUrl') if dash_manifest_url: for f in self._extract_mpd_formats( From a40002444e64957594a1305bb2740fddb477beeb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 24 Mar 2021 15:10:19 +0100 Subject: [PATCH 107/242] [bbc] fix BBC IPlayer Episodes/Group extraction(closes #28360) --- youtube_dl/extractor/bbc.py | 205 ++++++++++++++++++++++++++--- youtube_dl/extractor/extractors.py | 3 +- 2 files changed, 192 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 92e6f1bea..e8d000bbb 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,17 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import itertools +import json import re from .common import InfoExtractor from ..compat import ( compat_etree_Element, compat_HTTPError, + compat_parse_qs, + compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( ExtractorError, + OnDemandPagedList, clean_html, dict_get, float_or_none, @@ -811,7 +816,7 @@ class BBCIE(BBCCoUkIE): @classmethod def suitable(cls, url): - EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE) + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE) return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) else super(BBCIE, cls).suitable(url)) @@ -1338,21 +1343,149 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor): playlist_id, title, description) -class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): - IE_NAME = 'bbc.co.uk:iplayer:playlist' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P%s)' % BBCCoUkIE._ID_REGEX - _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' - _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' +class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P%s)' % BBCCoUkIE._ID_REGEX + + @staticmethod + def _get_default(episode, key, default_key='default'): + return try_get(episode, lambda x: x[key][default_key]) + + def _get_description(self, data): + synopsis = data.get(self._DESCRIPTION_KEY) or {} + return dict_get(synopsis, ('large', 'medium', 'small')) + + def _fetch_page(self, programme_id, per_page, series_id, page): + elements = self._get_elements(self._call_api( + programme_id, per_page, page + 1, series_id)) + for element in elements: + episode = self._get_episode(element) + episode_id = episode.get('id') + if not episode_id: + continue + thumbnail = None + image = self._get_episode_image(episode) + if image: + thumbnail = image.replace('{recipe}', 'raw') + category = self._get_default(episode, 'labels', 'category') + yield { + '_type': 'url', + 'id': episode_id, + 'title': self._get_episode_field(episode, 'subtitle'), + 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id, + 'thumbnail': thumbnail, + 'description': self._get_description(episode), + 'categories': [category] if category else None, + 'series': self._get_episode_field(episode, 'title'), + 'ie_key': BBCCoUkIE.ie_key(), + } + + def _real_extract(self, url): + pid = self._match_id(url) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + series_id = qs.get('seriesId', [None])[0] + page = qs.get('page', [None])[0] + per_page = 36 if page else self._PAGE_SIZE + fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id) + entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE) + playlist_data = self._get_playlist_data(self._call_api(pid, 1)) + return self.playlist_result( + entries, pid, self._get_playlist_title(playlist_data), + self._get_description(playlist_data)) + + +class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:episodes' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes' _TESTS = [{ 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', 'info_dict': { 'id': 'b05rcz9v', 'title': 'The Disappearance', - 'description': 'French thriller serial about a missing teenager.', + 'description': 'md5:58eb101aee3116bad4da05f91179c0cb', }, - 'playlist_mincount': 6, - 'skip': 'This programme is not currently available on BBC iPlayer', + 'playlist_mincount': 8, }, { + # all seasons + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 10, + }, { + # explicit season + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 5, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 37, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 1, + }] + _PAGE_SIZE = 100 + _DESCRIPTION_KEY = 'synopsis' + + def _get_episode_image(self, episode): + return self._get_default(episode, 'image') + + def _get_episode_field(self, episode, field): + return self._get_default(episode, field) + + @staticmethod + def _get_elements(data): + return data['entities']['results'] + + @staticmethod + def _get_episode(element): + return element.get('episode') or {} + + def _call_api(self, pid, per_page, page=1, series_id=None): + variables = { + 'id': pid, + 'page': page, + 'perPage': per_page, + } + if series_id: + variables['sliceId'] = series_id + return self._download_json( + 'https://graph.ibl.api.bbc.co.uk/', pid, headers={ + 'Content-Type': 'application/json' + }, data=json.dumps({ + 'id': '5692d93d5aac8d796a0305e895e61551', + 'variables': variables, + }).encode('utf-8'))['data']['programme'] + + @staticmethod + def _get_playlist_data(data): + return data + + def _get_playlist_title(self, data): + return self._get_default(data, 'title') + + +class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:group' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group' + _TESTS = [{ # Available for over a year unlike 30 days for most other programmes 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', 'info_dict': { @@ -1361,14 +1494,56 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', }, 'playlist_mincount': 10, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 47, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 11, }] + _PAGE_SIZE = 200 + _DESCRIPTION_KEY = 'synopses' - def _extract_title_and_description(self, webpage): - title = self._search_regex(r'

([^<]+)

', webpage, 'title', fatal=False) - description = self._search_regex( - r']+class=(["\'])subtitle\1[^>]*>(?P[^<]+)

', - webpage, 'description', fatal=False, group='value') - return title, description + def _get_episode_image(self, episode): + return self._get_default(episode, 'images', 'standard') + + def _get_episode_field(self, episode, field): + return episode.get(field) + + @staticmethod + def _get_elements(data): + return data['elements'] + + @staticmethod + def _get_episode(element): + return element + + def _call_api(self, pid, per_page, page=1, series_id=None): + return self._download_json( + 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid, + pid, query={ + 'page': page, + 'per_page': per_page, + })['group_episodes'] + + @staticmethod + def _get_playlist_data(data): + return data['group'] + + def _get_playlist_title(self, data): + return data.get('title') class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8b55947f6..e0fd0b648 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -95,7 +95,8 @@ from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE from .bbc import ( BBCCoUkIE, BBCCoUkArticleIE, - BBCCoUkIPlayerPlaylistIE, + BBCCoUkIPlayerEpisodesIE, + BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE, BBCIE, ) From eafcadea261dba64c44c5c17ea8a47ac17256617 Mon Sep 17 00:00:00 2001 From: The Hatsune Daishi Date: Wed, 24 Mar 2021 23:33:19 +0900 Subject: [PATCH 108/242] [extractor] escape forgotten dot for hostnames in regular expression (#28530) --- youtube_dl/extractor/generic.py | 2 +- youtube_dl/extractor/mtv.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c2b1b3bdf..f99d887ca 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2953,7 +2953,7 @@ class GenericIE(InfoExtractor): webpage) if not mobj: mobj = re.search( - r'data-video-link=["\'](?Phttp://m.mlb.com/video/[^"\']+)', + r'data-video-link=["\'](?Phttp://m\.mlb\.com/video/[^"\']+)', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'MLB') diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index f5e30d22d..600cf2d89 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -320,7 +320,7 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r']+?src=(["\'])(?P(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage) + r']+?src=(["\'])(?P(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage) if mobj: return mobj.group('url') From d1069d33b4ad3987acc2452756459065ce635d68 Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Sun, 12 Apr 2020 23:27:58 +0200 Subject: [PATCH 109/242] [zoom] Add new extractor(closes #16597, closes #27002, closes #28531) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/zoom.py | 68 ++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 youtube_dl/extractor/zoom.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e0fd0b648..b2b39e4dd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1629,4 +1629,5 @@ from .zingmp3 import ( ZingMp3IE, ZingMp3AlbumIE, ) +from .zoom import ZoomIE from .zype import ZypeIE diff --git a/youtube_dl/extractor/zoom.py b/youtube_dl/extractor/zoom.py new file mode 100644 index 000000000..db073d91d --- /dev/null +++ b/youtube_dl/extractor/zoom.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + parse_filesize, + urlencode_postdata, +) + + +class ZoomIE(InfoExtractor): + IE_NAME = 'zoom' + _VALID_URL = r'(?Phttps?://(?:[^.]+\.)?zoom.us/)rec(?:ording)?/(?:play|share)/(?P[A-Za-z0-9_.-]+)' + _TEST = { + 'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'md5': 'ab445e8c911fddc4f9adc842c2c5d434', + 'info_dict': { + 'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', + 'ext': 'mp4', + 'title': 'China\'s "two sessions" and the new five-year plan', + } + } + + def _real_extract(self, url): + base_url, play_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, play_id) + + try: + form = self._form_hidden_inputs('password_form', webpage) + except ExtractorError: + form = None + if form: + password = self._downloader.params.get('videopassword') + if not password: + raise ExtractorError( + 'This video is protected by a passcode, use the --video-password option', expected=True) + is_meeting = form.get('useWhichPasswd') == 'meeting' + validation = self._download_json( + base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''), + play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({ + 'id': form[('meet' if is_meeting else 'file') + 'Id'], + 'passwd': password, + 'action': form.get('action'), + })) + if not validation.get('status'): + raise ExtractorError(validation['errorMessage'], expected=True) + webpage = self._download_webpage(url, play_id) + + data = self._parse_json(self._search_regex( + r'(?s)window\.__data__\s*=\s*({.+?});', + webpage, 'data'), play_id, js_to_json) + + return { + 'id': play_id, + 'title': data['topic'], + 'url': data['viewMp4Url'], + 'width': int_or_none(data.get('viewResolvtionsWidth')), + 'height': int_or_none(data.get('viewResolvtionsHeight')), + 'http_headers': { + 'Referer': base_url, + }, + 'filesize_approx': parse_filesize(data.get('fileSize')), + } From c2fbfb49da2657002fafcf4c609f8d91030a6ef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 25 Mar 2021 00:03:00 +0700 Subject: [PATCH 110/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 73fe316b9..1c3313280 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version + +Extractors ++ [zoom] Add support for zoom.us (#16597, #27002, #28531) +* [bbc] Fix BBC IPlayer Episodes/Group extraction (#28360) +* [youtube] Fix default value for youtube_include_dash_manifest (#28523) +* [zingmp3] Fix extraction (#11589, #16409, #16968, #27205) ++ [vgtv] Add support for new tv.aftonbladet.se URL schema (#28514) ++ [tiktok] Detect private videos (#28453) +* [vimeo:album] Fix extraction for albums with number of videos multiple + to page size (#28486) +* [vvvvid] Fix kenc format extraction (#28473) +* [mlb] Fix video extraction (#21241) +* [svtplay] Improve extraction (#28448) +* [applepodcasts] Fix extraction (#28445) +* [rtve] Improve extraction + + Extract all formats + * Fix RTVE Infantil extraction (#24851) + + Extract is_live and series + + version 2021.03.14 Core From 76da1c954aebba4af8def73dd2319fab2e27e50a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 25 Mar 2021 00:04:10 +0700 Subject: [PATCH 111/242] release 2021.03.25 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 6 +++++- youtube_dl/version.py | 2 +- 8 files changed, 19 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 9cc79eff4..7feb0298c 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.14 + [debug] youtube-dl version 2021.03.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 3296e44a8..49e18173d 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index f74c29736..a1486b133 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index ae9e273ef..7eaf5a202 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.14 + [debug] youtube-dl version 2021.03.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 04fbea2f6..20042d98c 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.14** +- [ ] I've verified that I'm running youtube-dl version **2021.03.25** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 1c3313280..1b49e411a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.25 Extractors + [zoom] Add support for zoom.us (#16597, #27002, #28531) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2c00ec406..d2ad937a4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -90,7 +90,8 @@ - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **bbc.co.uk:article**: BBC articles - - **bbc.co.uk:iplayer:playlist** + - **bbc.co.uk:iplayer:episodes** + - **bbc.co.uk:iplayer:group** - **bbc.co.uk:playlist** - **BBVTV** - **Beatport** @@ -522,6 +523,7 @@ - **mixcloud:playlist** - **mixcloud:user** - **MLB** + - **MLBVideo** - **Mnet** - **MNetTV** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net @@ -1212,4 +1214,6 @@ - **ZDFChannel** - **Zhihu** - **zingmp3**: mp3.zing.vn + - **zingmp3:album** + - **zoom** - **Zype** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5a540119c..e87b820fa 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.14' +__version__ = '2021.03.25' From 8562218350a79d4709da8593bb0c538aa0824acf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 24 Mar 2021 19:28:51 +0100 Subject: [PATCH 112/242] [ard] improve clip id extraction(#22724)(closes #28528) --- youtube_dl/extractor/ard.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 143fc51e9..d57c5ba0f 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -335,7 +335,7 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?:[^/]+/)?(?:player|live|video)/(?:[^/]+/)*(?PY3JpZDovL[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', @@ -365,22 +365,22 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): }, { 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id + video_id = self._match_id(url) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', - display_id, data=json.dumps({ + video_id, data=json.dumps({ 'query': '''{ - playerPage(client:"%s", clipId: "%s") { + playerPage(client: "ard", clipId: "%s") { blockedByFsk broadcastedOn maturityContentRating @@ -410,7 +410,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): } } } -}''' % (mobj.group('client'), video_id), +}''' % video_id, }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] @@ -435,7 +435,6 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) info.update({ 'age_limit': age_limit, - 'display_id': display_id, 'title': title, 'description': description, 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), From 605e7b5e47c60c3ed7c2ca71df4d6bbd49fa8a77 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 25 Mar 2021 12:53:18 +0100 Subject: [PATCH 113/242] [youtube:tab] fix playlist/comunity continuation items extraction(closes #28266) --- youtube_dl/extractor/youtube.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index badca3977..faf3a344e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -24,6 +24,7 @@ from ..jsinterp import JSInterpreter from ..utils import ( ExtractorError, clean_html, + dict_get, float_or_none, int_or_none, mimetype2ext, @@ -2541,13 +2542,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): continuation = self._extract_continuation(continuation_renderer) continue + on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints')) continuation_items = try_get( - response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list) + on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list) if continuation_items: continuation_item = continuation_items[0] if not isinstance(continuation_item, dict): continue - renderer = continuation_item.get('gridVideoRenderer') + renderer = self._extract_grid_item_renderer(continuation_item) if renderer: grid_renderer = {'items': continuation_items} for entry in self._grid_entries(grid_renderer): @@ -2561,6 +2563,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry continuation = self._extract_continuation(video_list_renderer) continue + renderer = continuation_item.get('backstagePostThreadRenderer') + if renderer: + continuation_renderer = {'contents': continuation_items} + for entry in self._post_thread_continuation_entries(continuation_renderer): + yield entry + continuation = self._extract_continuation(continuation_renderer) + continue break From c78591187080e7316c0042309fe956bfd0d38d30 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 25 Mar 2021 17:06:57 +0100 Subject: [PATCH 114/242] [vimeo] fix unlisted video extraction(closes #28414) --- youtube_dl/extractor/vimeo.py | 39 ++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 955651bec..5800962ea 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -24,6 +24,7 @@ from ..utils import ( merge_dicts, OnDemandPagedList, parse_filesize, + parse_iso8601, RegexNotFoundError, sanitized_Request, smuggle_url, @@ -278,7 +279,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? (?:videos?/)? (?P[0-9]+) - (?:/[\da-f]+)? + (?:/(?P[\da-f]{10}))? /?(?:[?&].*)?(?:[#].*)?$ ''' IE_NAME = 'vimeo' @@ -577,11 +578,37 @@ class VimeoIE(VimeoBaseInfoExtractor): if 'Referer' not in headers: headers['Referer'] = url - channel_id = self._search_regex( - r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) - # Extract ID from URL - video_id = self._match_id(url) + video_id, unlisted_hash = re.match(self._VALID_URL, url).groups() + if unlisted_hash: + token = self._download_json( + 'https://vimeo.com/_rv/jwt', video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest' + })['token'] + video = self._download_json( + 'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash), + video_id, headers={ + 'Authorization': 'jwt ' + token, + }, query={ + 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', + }) + info = self._parse_config(self._download_json( + video['config_url'], video_id), video_id) + self._vimeo_sort_formats(info['formats']) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) + info.update({ + 'description': video.get('description'), + 'license': video.get('license'), + 'release_timestamp': get_timestamp('release'), + 'timestamp': get_timestamp('created'), + 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), + }) + connections = try_get( + video, lambda x: x['metadata']['connections'], dict) or {} + for k in ('comment', 'like'): + info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) + return info + orig_url = url is_pro = 'vimeopro.com/' in url is_player = '://player.vimeo.com/video/' in url @@ -756,6 +783,8 @@ class VimeoIE(VimeoBaseInfoExtractor): r']+rel=["\']license["\'][^>]+href=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None info_dict = { From cc777dcaa0f3331626f33a7e4c61d804c43f4b5c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 27 Mar 2021 17:37:45 +0100 Subject: [PATCH 115/242] [picarto] fix live stream extraction(closes #28532) --- youtube_dl/extractor/picarto.py | 100 ++++++++++++-------------------- 1 file changed, 37 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py index 8099ef1d6..e6c51e16b 100644 --- a/youtube_dl/extractor/picarto.py +++ b/youtube_dl/extractor/picarto.py @@ -1,22 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import time - from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, js_to_json, - try_get, - update_url_query, - urlencode_postdata, ) class PicartoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[a-zA-Z0-9]+)(?:/(?P[a-zA-Z0-9]+))?' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P[a-zA-Z0-9]+)' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -34,65 +27,46 @@ class PicartoIE(InfoExtractor): return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') - - metadata = self._download_json( - 'https://api.picarto.tv/v1/channel/name/' + channel_id, - channel_id) - - if metadata.get('online') is False: + channel_id = self._match_id(url) + + data = self._download_json( + 'https://ptvintern.picarto.tv/ptvapi', channel_id, query={ + 'query': '''{ + channel(name: "%s") { + adult + id + online + stream_name + title + } + getLoadBalancerUrl(channel_name: "%s") { + url + } +}''' % (channel_id, channel_id), + })['data'] + metadata = data['channel'] + + if metadata.get('online') == 0: raise ExtractorError('Stream is offline', expected=True) + title = metadata['title'] cdn_data = self._download_json( - 'https://picarto.tv/process/channel', channel_id, - data=urlencode_postdata({'loadbalancinginfo': channel_id}), - note='Downloading load balancing info') - - token = mobj.group('token') or 'public' - params = { - 'con': int(time.time() * 1000), - 'token': token, - } + data['getLoadBalancerUrl']['url'] + '/stream/json_' + metadata['stream_name'] + '.js', + channel_id, 'Downloading load balancing info') - prefered_edge = cdn_data.get('preferedEdge') formats = [] - - for edge in cdn_data['edges']: - edge_ep = edge.get('ep') - if not edge_ep or not isinstance(edge_ep, compat_str): + for source in (cdn_data.get('source') or []): + source_url = source.get('url') + if not source_url: continue - edge_id = edge.get('id') - for tech in cdn_data['techs']: - tech_label = tech.get('label') - tech_type = tech.get('type') - preference = 0 - if edge_id == prefered_edge: - preference += 1 - format_id = [] - if edge_id: - format_id.append(edge_id) - if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': - format_id.append('hls') - formats.extend(self._extract_m3u8_formats( - update_url_query( - 'https://%s/hls/%s/index.m3u8' - % (edge_ep, channel_id), params), - channel_id, 'mp4', preference=preference, - m3u8_id='-'.join(format_id), fatal=False)) - continue - elif tech_type == 'video/mp4' or tech_label == 'MP4': - format_id.append('mp4') - formats.append({ - 'url': update_url_query( - 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), - params), - 'format_id': '-'.join(format_id), - 'preference': preference, - }) - else: - # rtmp format does not seem to work - continue + source_type = source.get('type') + if source_type == 'html5/application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + source_url, channel_id, 'mp4', m3u8_id='hls', fatal=False)) + elif source_type == 'html5/video/mp4': + formats.append({ + 'url': source_url, + }) self._sort_formats(formats) mature = metadata.get('adult') @@ -103,10 +77,10 @@ class PicartoIE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(metadata.get('title') or channel_id), + 'title': self._live_title(title.strip()), 'is_live': True, - 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']), 'channel': channel_id, + 'channel_id': metadata.get('id'), 'channel_url': 'https://picarto.tv/%s' % channel_id, 'age_limit': age_limit, 'formats': formats, From 49fc0a567febda65709cc5154ff046684a3b8427 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 27 Mar 2021 19:11:41 +0100 Subject: [PATCH 116/242] [youtube] fix video's channel extraction(closes #28562) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index faf3a344e..e48c5a7d2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1896,7 +1896,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info['channel'] = get_text(try_get( vsir, lambda x: x['owner']['videoOwnerRenderer']['title'], - compat_str)) + dict)) rows = try_get( vsir, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'], From 87a8bde7775ebc31175ebb111015b4052b50b7db Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 28 Mar 2021 08:46:33 +0100 Subject: [PATCH 117/242] [sbs] add support for ondemand watch URLs(closes #28566) --- youtube_dl/extractor/sbs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index f722528cd..0a806ee4e 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -10,7 +10,7 @@ from ..utils import ( class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=)|news/(?:embeds/)?video/)(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P[0-9]+)' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -43,6 +43,9 @@ class SBSIE(InfoExtractor): }, { 'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971', + 'only_matching': True, }] def _real_extract(self, url): From da762c4e329f6158956ddd51ac294e9183e5ce89 Mon Sep 17 00:00:00 2001 From: Chris Hranj Date: Mon, 29 Mar 2021 15:05:19 -0400 Subject: [PATCH 118/242] [instagram] Improve title extraction and extract duration (#28469) Co-authored-by: Sergey M. --- youtube_dl/extractor/instagram.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 1eeddc3b6..12e10143c 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -12,6 +12,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, get_element_by_attribute, int_or_none, lowercase_escape, @@ -32,6 +33,7 @@ class InstagramIE(InfoExtractor): 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': 'naomipq', @@ -48,6 +50,7 @@ class InstagramIE(InfoExtractor): 'ext': 'mp4', 'title': 'Video by britneyspears', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, 'timestamp': 1453760977, 'upload_date': '20160125', 'uploader_id': 'britneyspears', @@ -86,6 +89,24 @@ class InstagramIE(InfoExtractor): 'title': 'Post by instagram', 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', }, + }, { + # IGTV + 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', + 'info_dict': { + 'id': 'BkfuX9UB-eK', + 'ext': 'mp4', + 'title': 'Fingerboarding Tricks with @cass.fb', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 53.83, + 'timestamp': 1530032919, + 'upload_date': '20180626', + 'uploader_id': 'instagram', + 'uploader': 'Instagram', + 'like_count': int, + 'comment_count': int, + 'comments': list, + 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', + } }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, @@ -159,7 +180,9 @@ class InstagramIE(InfoExtractor): description = try_get( media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], compat_str) or media.get('caption') + title = media.get('title') thumbnail = media.get('display_src') or media.get('display_url') + duration = float_or_none(media.get('video_duration')) timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') @@ -200,9 +223,10 @@ class InstagramIE(InfoExtractor): continue entries.append({ 'id': node.get('shortcode') or node['id'], - 'title': 'Video %d' % edge_num, + 'title': node.get('title') or 'Video %d' % edge_num, 'url': node_video_url, 'thumbnail': node.get('display_url'), + 'duration': float_or_none(node.get('video_duration')), 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), 'view_count': int_or_none(node.get('video_view_count')), @@ -239,8 +263,9 @@ class InstagramIE(InfoExtractor): 'id': video_id, 'formats': formats, 'ext': 'mp4', - 'title': 'Video by %s' % uploader_id, + 'title': title or 'Video by %s' % uploader_id, 'description': description, + 'duration': duration, 'thumbnail': thumbnail, 'timestamp': timestamp, 'uploader_id': uploader_id, From 287e50b56b4c71da8fd0c3ffdeca9bff5ab0b005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 30 Mar 2021 03:37:43 +0700 Subject: [PATCH 119/242] [francetvinfo] Improve video id extraction (closes #28584) --- youtube_dl/extractor/francetv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 3ca415077..7cc88bf18 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -399,7 +399,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): video_id = self._search_regex( (r'player\.load[^;]+src:\s*["\']([^"\']+)', r'id-video=([^@]+@[^"]+)', - r']+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + r']+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'data-id=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), webpage, 'video id') return self._make_url_result(video_id) From 955894e72fd8d4fdce5d85fc006d548278e6d9eb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 30 Mar 2021 10:00:40 +0100 Subject: [PATCH 120/242] [vlive] fix inkey request(closes #28589) --- youtube_dl/extractor/vlive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index e2f5d81b8..42da34d44 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -106,7 +106,7 @@ class VLiveIE(VLiveBaseIE): raise ExtractorError('Unable to log in', expected=True) def _call_api(self, path_template, video_id, fields=None): - query = {'appId': self._APP_ID, 'gcc': 'KR'} + query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'} if fields: query['fields'] = fields try: From 207bc35d348efdfdfe2bd7119e004a1acf0ab3d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 31 Mar 2021 02:58:01 +0700 Subject: [PATCH 121/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index 1b49e411a..1297c19f7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +version + +Extractors +* [vlive] Fix inkey request (#28589) +* [francetvinfo] Improve video id extraction (#28584) ++ [instagram] Extract duration (#28469) +* [instagram] Improve title extraction (#28469) ++ [sbs] Add support for ondemand watch URLs (#28566) +* [youtube] Fix video's channel extraction (#28562) +* [picarto] Fix live stream extraction (#28532) +* [vimeo] Fix unlisted video extraction (#28414) +* [youtube:tab] Fix playlist/community continuation items extraction (#28266) +* [ard] Improve clip id extraction (#22724, #28528) + + version 2021.03.25 Extractors From 8f493de9fb3a7f123bdf887163efa06ce9d6b051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 31 Mar 2021 02:59:07 +0700 Subject: [PATCH 122/242] release 2021.03.31 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 7feb0298c..2ac4df8db 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.25 + [debug] youtube-dl version 2021.03.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 49e18173d..5ad5590bf 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index a1486b133..ea96c4c20 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 7eaf5a202..ed3abd45c 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.25 + [debug] youtube-dl version 2021.03.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 20042d98c..c1067ee1f 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.25** +- [ ] I've verified that I'm running youtube-dl version **2021.03.31** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index 1297c19f7..4c094b771 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.03.31 Extractors * [vlive] Fix inkey request (#28589) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e87b820fa..bcfdae23d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.25' +__version__ = '2021.03.31' From 28bab774a0df2c80b689c277390da8617131db35 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 30 Mar 2021 21:44:41 +0100 Subject: [PATCH 123/242] [youtube] imporve age-restricted video extraction(#28578) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e48c5a7d2..6a92938a5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1432,7 +1432,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): base_url = self.http_scheme() + '//www.youtube.com/' webpage_url = base_url + 'watch?v=' + video_id webpage = self._download_webpage( - webpage_url + '&bpctr=9999999999', video_id, fatal=False) + webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) player_response = None if webpage: From b97fb2edac25182ff3dcf4cb8537517a1ec9e4de Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 31 Mar 2021 20:07:13 +0100 Subject: [PATCH 124/242] [vimeo] fix password protected review extraction(closes #27591) --- youtube_dl/extractor/vimeo.py | 64 ++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 5800962ea..a90cf0630 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -75,25 +75,28 @@ class VimeoBaseInfoExtractor(InfoExtractor): expected=True) raise ExtractorError('Unable to log in') - def _verify_video_password(self, url, video_id, webpage): + def _get_video_password(self): password = self._downloader.params.get('videopassword') if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ - 'password': password, - 'token': token, - }) + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', + expected=True) + return password + + def _verify_video_password(self, url, video_id, password, token, vuid): if url.startswith('http://'): # vimeo only supports https now, but the user can give an http url url = url.replace('http://', 'https://') - password_request = sanitized_Request(url + '/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) self._set_vimeo_cookie('vuid', vuid) return self._download_webpage( - password_request, video_id, - 'Verifying the password', 'Wrong password') + url + '/password', video_id, 'Verifying the password', + 'Wrong password', data=urlencode_postdata({ + 'password': password, + 'token': token, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': url, + }) def _extract_xsrft_and_vuid(self, webpage): xsrft = self._search_regex( @@ -332,9 +335,9 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '54469442', 'ext': 'mp4', 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', - 'uploader': 'The BLN & Business of Software', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware', - 'uploader_id': 'theblnbusinessofsoftware', + 'uploader': 'Business of Software', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/businessofsoftware', + 'uploader_id': 'businessofsoftware', 'duration': 3610, 'description': None, }, @@ -469,6 +472,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip_download': True, }, 'expected_warnings': ['Unable to download JSON metadata'], + 'skip': 'this page is no longer available.', }, { 'url': 'http://player.vimeo.com/video/68375962', @@ -551,9 +555,7 @@ class VimeoIE(VimeoBaseInfoExtractor): return urls[0] if urls else None def _verify_player_video_password(self, url, video_id, headers): - password = self._downloader.params.get('videopassword') - if password is None: - raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + password = self._get_video_password() data = urlencode_postdata({ 'password': base64.b64encode(password.encode()), }) @@ -697,7 +699,10 @@ class VimeoIE(VimeoBaseInfoExtractor): if re.search(r']+?id="pw_form"', webpage) is not None: if '_video_password_verified' in data: raise ExtractorError('video password verification failed!') - self._verify_video_password(redirect_url, video_id, webpage) + video_password = self._get_video_password() + token, vuid = self._extract_xsrft_and_vuid(webpage) + self._verify_video_password( + redirect_url, video_id, video_password, token, vuid) return self._real_extract( smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) else: @@ -1091,10 +1096,23 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): def _real_extract(self, url): page_url, video_id = re.match(self._VALID_URL, url).groups() - clip_data = self._download_json( - page_url.replace('/review/', '/review/data/'), - video_id)['clipData'] - config_url = clip_data['configUrl'] + data = self._download_json( + page_url.replace('/review/', '/review/data/'), video_id) + if data.get('isLocked') is True: + video_password = self._get_video_password() + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', video_id) + webpage = self._verify_video_password( + 'https://vimeo.com/' + video_id, video_id, + video_password, viewer['xsrft'], viewer['vuid']) + clip_page_config = self._parse_json(self._search_regex( + r'window\.vimeo\.clip_page_config\s*=\s*({.+?});', + webpage, 'clip page config'), video_id) + config_url = clip_page_config['player']['config_url'] + clip_data = clip_page_config.get('clip') or {} + else: + clip_data = data['clipData'] + config_url = clip_data['configUrl'] config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( From 14f29f087e6097feb46bdb84878924bc410a57eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Apr 2021 04:05:10 +0700 Subject: [PATCH 125/242] [youtube] Setup CONSENT cookie when needed (closes #28604) --- youtube_dl/extractor/youtube.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6a92938a5..b940c0bad 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -249,7 +249,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True + def _initialize_consent(self): + cookies = self._get_cookies('https://www.youtube.com/') + if cookies.get('__Secure-3PSID'): + return + consent_id = None + consent = cookies.get('CONSENT') + if consent: + if 'YES' in consent.value: + return + consent_id = self._search_regex( + r'PENDING\+(\d+)', consent.value, 'consent', default=None) + if not consent_id: + consent_id = random.randint(100, 999) + self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _real_initialize(self): + self._initialize_consent() if self._downloader is None: return if not self._login(): From e789bb1aa4cb627d3d7ca79a5e5daa8d2f58cda6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Apr 2021 04:43:08 +0700 Subject: [PATCH 126/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4c094b771..ee2dc88bc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +version + +Extractors +* [youtube] Setup CONSENT cookie when needed (#28604) +* [vimeo] Fix password protected review extraction (#27591) +* [youtube] Improve age-restricted video extraction (#28578) + + version 2021.03.31 Extractors From ca304beb1538e54c5a18fdd50846ed2259d63b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Apr 2021 04:47:11 +0700 Subject: [PATCH 127/242] release 2021.04.01 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 2ac4df8db..98ec799e8 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.31 + [debug] youtube-dl version 2021.04.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 5ad5590bf..5387a6cd1 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index ea96c4c20..945c80366 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index ed3abd45c..0acc8b679 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.03.31 + [debug] youtube-dl version 2021.04.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index c1067ee1f..42c3126a3 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.03.31** +- [ ] I've verified that I'm running youtube-dl version **2021.04.01** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index ee2dc88bc..4304ecd9e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.01 Extractors * [youtube] Setup CONSENT cookie when needed (#28604) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index bcfdae23d..0457d1a15 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.03.31' +__version__ = '2021.04.01' From 37488630703944b4f2bda84a26391ae61d29e15b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 1 Apr 2021 11:50:30 +0100 Subject: [PATCH 128/242] [youtube:tab] Add support for hashtag videos extraction(closes #28308) --- youtube_dl/extractor/youtube.py | 139 +++++++++++++++++++------------- 1 file changed, 85 insertions(+), 54 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b940c0bad..1f5497e24 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1959,7 +1959,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): invidio\.us )/ (?: - (?:channel|c|user|feed)/| + (?:channel|c|user|feed|hashtag)/| (?:playlist|watch)\?.*?\blist=| (?!(?:watch|embed|v|e)\b) ) @@ -2245,6 +2245,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/TheYoungTurks/live', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/hashtag/cctv9', + 'info_dict': { + 'id': 'cctv9', + 'title': '#cctv9', + }, + 'playlist_mincount': 350, }] @classmethod @@ -2392,6 +2399,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): for entry in self._post_thread_entries(renderer): yield entry + def _rich_grid_entries(self, contents): + for content in contents: + video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict) + if video_renderer: + entry = self._video_entry(video_renderer) + if entry: + yield entry + @staticmethod def _build_continuation_query(continuation, ctp=None): query = { @@ -2442,55 +2457,60 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not tab_content: return slr_renderer = try_get(tab_content, lambda x: x['sectionListRenderer'], dict) - if not slr_renderer: - return - is_channels_tab = tab.get('title') == 'Channels' - continuation = None - slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] - for slr_content in slr_contents: - if not isinstance(slr_content, dict): - continue - is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) - if not is_renderer: - continue - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): - continue - renderer = isr_content.get('playlistVideoListRenderer') - if renderer: - for entry in self._playlist_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('gridRenderer') - if renderer: - for entry in self._grid_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) - continue - renderer = isr_content.get('shelfRenderer') - if renderer: - for entry in self._shelf_entries(renderer, not is_channels_tab): - yield entry + if slr_renderer: + is_channels_tab = tab.get('title') == 'Channels' + continuation = None + slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or [] + for slr_content in slr_contents: + if not isinstance(slr_content, dict): continue - renderer = isr_content.get('backstagePostThreadRenderer') - if renderer: - for entry in self._post_thread_entries(renderer): - yield entry - continuation = self._extract_continuation(renderer) + is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: continue - renderer = isr_content.get('videoRenderer') - if renderer: - entry = self._video_entry(renderer) - if entry: - yield entry - + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): + continue + renderer = isr_content.get('playlistVideoListRenderer') + if renderer: + for entry in self._playlist_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('gridRenderer') + if renderer: + for entry in self._grid_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('shelfRenderer') + if renderer: + for entry in self._shelf_entries(renderer, not is_channels_tab): + yield entry + continue + renderer = isr_content.get('backstagePostThreadRenderer') + if renderer: + for entry in self._post_thread_entries(renderer): + yield entry + continuation = self._extract_continuation(renderer) + continue + renderer = isr_content.get('videoRenderer') + if renderer: + entry = self._video_entry(renderer) + if entry: + yield entry + + if not continuation: + continuation = self._extract_continuation(is_renderer) if not continuation: - continuation = self._extract_continuation(is_renderer) - - if not continuation: - continuation = self._extract_continuation(slr_renderer) + continuation = self._extract_continuation(slr_renderer) + else: + rich_grid_renderer = tab_content.get('richGridRenderer') + if not rich_grid_renderer: + return + for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []): + yield entry + continuation = self._extract_continuation(rich_grid_renderer) headers = { 'x-youtube-client-name': '1', @@ -2586,6 +2606,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry continuation = self._extract_continuation(continuation_renderer) continue + renderer = continuation_item.get('richItemRenderer') + if renderer: + for entry in self._rich_grid_entries(continuation_items): + yield entry + continuation = self._extract_continuation({'contents': continuation_items}) + continue break @@ -2642,7 +2668,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) - playlist_id = title = description = None + playlist_id = item_id + title = description = None if renderer: channel_title = renderer.get('title') or item_id tab_title = selected_tab.get('title') @@ -2651,12 +2678,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): title += ' - %s' % tab_title description = renderer.get('description') playlist_id = renderer.get('externalId') - renderer = try_get( - data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) - if renderer: - title = renderer.get('title') - description = None - playlist_id = item_id + else: + renderer = try_get( + data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + if renderer: + title = renderer.get('title') + else: + renderer = try_get( + data, lambda x: x['header']['hashtagHeaderRenderer'], dict) + if renderer: + title = try_get(renderer, lambda x: x['hashtag']['simpleText']) playlist = self.playlist_result( self._entries(selected_tab, identity_token), playlist_id=playlist_id, playlist_title=title, From c5aa8f36bf636c3db81afd556d0e95d91b72b9c7 Mon Sep 17 00:00:00 2001 From: Vid Date: Thu, 18 Mar 2021 18:53:06 +0100 Subject: [PATCH 129/242] [arnes] Add new extractor(closes #28483) --- youtube_dl/extractor/arnes.py | 101 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 102 insertions(+) create mode 100644 youtube_dl/extractor/arnes.py diff --git a/youtube_dl/extractor/arnes.py b/youtube_dl/extractor/arnes.py new file mode 100644 index 000000000..c0032fcab --- /dev/null +++ b/youtube_dl/extractor/arnes.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + float_or_none, + int_or_none, + parse_iso8601, + remove_start, +) + + +class ArnesIE(InfoExtractor): + IE_NAME = 'video.arnes.si' + IE_DESC = 'Arnes Video' + _VALID_URL = r'https?://video\.arnes\.si/(?:[a-z]{2}/)?(?:watch|embed|api/(?:asset|public/video))/(?P[0-9a-zA-Z]{12})' + _TESTS = [{ + 'url': 'https://video.arnes.si/watch/a1qrWTOQfVoU?t=10', + 'md5': '4d0f4d0a03571b33e1efac25fd4a065d', + 'info_dict': { + 'id': 'a1qrWTOQfVoU', + 'ext': 'mp4', + 'title': 'Linearna neodvisnost, definicija', + 'description': 'Linearna neodvisnost, definicija', + 'license': 'PRIVATE', + 'creator': 'Polona Oblak', + 'timestamp': 1585063725, + 'upload_date': '20200324', + 'channel': 'Polona Oblak', + 'channel_id': 'q6pc04hw24cj', + 'channel_url': 'https://video.arnes.si/?channel=q6pc04hw24cj', + 'duration': 596.75, + 'view_count': int, + 'tags': ['linearna_algebra'], + 'start_time': 10, + } + }, { + 'url': 'https://video.arnes.si/api/asset/s1YjnV7hadlC/play.mp4', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/en/watch/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC?t=123&hideRelated=1', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/api/public/video/s1YjnV7hadlC', + 'only_matching': True, + }] + _BASE_URL = 'https://video.arnes.si' + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + self._BASE_URL + '/api/public/video/' + video_id, video_id)['data'] + title = video['title'] + + formats = [] + for media in (video.get('media') or []): + media_url = media.get('url') + if not media_url: + continue + formats.append({ + 'url': self._BASE_URL + media_url, + 'format_id': remove_start(media.get('format'), 'FORMAT_'), + 'format_note': media.get('formatTranslation'), + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + }) + self._sort_formats(formats) + + channel = video.get('channel') or {} + channel_id = channel.get('url') + thumbnail = video.get('thumbnailUrl') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': self._BASE_URL + thumbnail, + 'description': video.get('description'), + 'license': video.get('license'), + 'creator': video.get('author'), + 'timestamp': parse_iso8601(video.get('creationTime')), + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None, + 'duration': float_or_none(video.get('duration'), 1000), + 'view_count': int_or_none(video.get('views')), + 'tags': video.get('hashtags'), + 'start_time': int_or_none(compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('t', [None])[0]), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b2b39e4dd..8cf348772 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -72,6 +72,7 @@ from .arte import ( ArteTVEmbedIE, ArteTVPlaylistIE, ) +from .arnes import ArnesIE from .asiancrush import ( AsianCrushIE, AsianCrushPlaylistIE, From 392c467f95cbf89114235038e1938c72d97144d9 Mon Sep 17 00:00:00 2001 From: Allan Daemon Date: Mon, 15 May 2017 00:04:39 -0300 Subject: [PATCH 130/242] [palcomp3] Add new extractor(closes #13120) --- youtube_dl/extractor/extractors.py | 5 + youtube_dl/extractor/palcomp3.py | 148 +++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 youtube_dl/extractor/palcomp3.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8cf348772..65fefabe8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -879,6 +879,11 @@ from .packtpub import ( PacktPubIE, PacktPubCourseIE, ) +from .palcomp3 import ( + PalcoMP3IE, + PalcoMP3ArtistIE, + PalcoMP3VideoIE, +) from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE diff --git a/youtube_dl/extractor/palcomp3.py b/youtube_dl/extractor/palcomp3.py new file mode 100644 index 000000000..fb29d83f9 --- /dev/null +++ b/youtube_dl/extractor/palcomp3.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, +) + + +class PalcoMP3BaseIE(InfoExtractor): + _GQL_QUERY_TMPL = '''{ + artist(slug: "%s") { + %s + } +}''' + _ARTIST_FIELDS_TMPL = '''music(slug: "%%s") { + %s + }''' + _MUSIC_FIELDS = '''duration + hls + mp3File + musicID + plays + title''' + + def _call_api(self, artist_slug, artist_fields): + return self._download_json( + 'https://www.palcomp3.com.br/graphql/', artist_slug, query={ + 'query': self._GQL_QUERY_TMPL % (artist_slug, artist_fields), + })['data'] + + def _parse_music(self, music): + music_id = compat_str(music['musicID']) + title = music['title'] + + formats = [] + hls_url = music.get('hls') + if hls_url: + formats.append({ + 'url': hls_url, + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + mp3_file = music.get('mp3File') + if mp3_file: + formats.append({ + 'url': mp3_file, + }) + + return { + 'id': music_id, + 'title': title, + 'formats': formats, + 'duration': int_or_none(music.get('duration')), + 'view_count': int_or_none(music.get('plays')), + } + + def _real_initialize(self): + self._ARTIST_FIELDS_TMPL = self._ARTIST_FIELDS_TMPL % self._MUSIC_FIELDS + + def _real_extract(self, url): + artist_slug, music_slug = re.match(self._VALID_URL, url).groups() + artist_fields = self._ARTIST_FIELDS_TMPL % music_slug + music = self._call_api(artist_slug, artist_fields)['artist']['music'] + return self._parse_music(music) + + +class PalcoMP3IE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:song' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/]+)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/nossas-composicoes-cuida-bem-dela/', + 'md5': '99fd6405b2d8fd589670f6db1ba3b358', + 'info_dict': { + 'id': '3162927', + 'ext': 'mp3', + 'title': 'Nossas Composições - CUIDA BEM DELA', + 'duration': 210, + 'view_count': int, + } + }] + + @classmethod + def suitable(cls, url): + return False if PalcoMP3VideoIE.suitable(url) else super(PalcoMP3IE, cls).suitable(url) + + +class PalcoMP3ArtistIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:artist' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.palcomp3.com.br/condedoforro/', + 'info_dict': { + 'id': '358396', + 'title': 'Conde do Forró', + }, + 'playlist_mincount': 188, + }] + _ARTIST_FIELDS_TMPL = '''artistID + musics { + nodes { + %s + } + } + name''' + + @ classmethod + def suitable(cls, url): + return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url) + + def _real_extract(self, url): + artist_slug = self._match_id(url) + artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist'] + + def entries(): + for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []): + yield self._parse_music(music) + + return self.playlist_result( + entries(), str_or_none(artist.get('artistID')), artist.get('name')) + + +class PalcoMP3VideoIE(PalcoMP3BaseIE): + IE_NAME = 'PalcoMP3:video' + _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P[^/]+)/(?P[^/?&#]+)/?#clipe' + _TESTS = [{ + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/maiara-e-maraisa-voce-faz-falta-aqui-ao-vivo-em-vicosa-mg/#clipe', + 'add_ie': ['Youtube'], + 'info_dict': { + 'id': '_pD1nR2qqPg', + 'ext': 'mp4', + 'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande', + 'description': 'md5:7043342c09a224598e93546e98e49282', + 'upload_date': '20161107', + 'uploader_id': 'maiaramaraisaoficial', + 'uploader': 'Maiara e Maraisa', + } + }] + _MUSIC_FIELDS = 'youtubeID' + + def _parse_music(self, music): + youtube_id = music['youtubeID'] + return self.url_result(youtube_id, 'Youtube', youtube_id) From 04d4a3b136060158438c3f2c1b31c884c6961712 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 1 Apr 2021 19:03:45 +0100 Subject: [PATCH 131/242] [screencastomatic] fix extraction(closes #11976, closes #24489) --- youtube_dl/extractor/screencastomatic.py | 48 +++++++++++++++--------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index b5e76c9af..0afdc1715 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -2,12 +2,18 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + get_element_by_class, + int_or_none, + remove_start, + strip_or_none, + unified_strdate, +) class ScreencastOMaticIE(InfoExtractor): - _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P[0-9a-zA-Z]+)' - _TEST = { + _VALID_URL = r'https?://screencast-o-matic\.com/(?:(?:watch|player)/|embed\?.*?\bsc=)(?P[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', 'md5': '483583cb80d92588f15ccbedd90f0c18', 'info_dict': { @@ -16,22 +22,30 @@ class ScreencastOMaticIE(InfoExtractor): 'title': 'Welcome to 3-4 Philosophy @ DECV!', 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.', - 'duration': 369.163, + 'duration': 369, + 'upload_date': '20141216', } - } + }, { + 'url': 'http://screencast-o-matic.com/player/c2lD3BeOPl', + 'only_matching': True, + }, { + 'url': 'http://screencast-o-matic.com/embed?ff=true&sc=cbV2r4Q5TL&fromPH=true&a=1', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - jwplayer_data = self._parse_json( - self._search_regex( - r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'), - video_id, transform_source=js_to_json) - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict.update({ - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), + webpage = self._download_webpage( + 'https://screencast-o-matic.com/player/' + video_id, video_id) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info.update({ + 'id': video_id, + 'title': get_element_by_class('overlayTitle', webpage), + 'description': strip_or_none(get_element_by_class('overlayDescription', webpage)) or None, + 'duration': int_or_none(self._search_regex( + r'player\.duration\s*=\s*function\(\)\s*{\s*return\s+(\d+);\s*};', + webpage, 'duration', default=None)), + 'upload_date': unified_strdate(remove_start( + get_element_by_class('overlayPublished', webpage), 'Published: ')), }) - return info_dict + return info From 1df2596f81695bf452ffbfd89596d115d9b2daf5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 3 Apr 2021 07:54:02 +0100 Subject: [PATCH 132/242] [extractor/common] fix _get_cookies method for python 2(#20673, #23256, #20326, closes #28640) --- youtube_dl/extractor/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d3b6724df..fcbf18ee6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2899,7 +2899,10 @@ class InfoExtractor(object): """ Return a compat_cookies.SimpleCookie with the cookies for the url """ req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies.SimpleCookie(req.get_header('Cookie')) + cookie = req.get_header('Cookie') + if cookie and sys.version_info[0] == 2: + cookie = str(cookie) + return compat_cookies.SimpleCookie(cookie) def _apply_first_set_cookie_header(self, url_handle, cookie): """ From 654b4f4ff2718f38b3182c1188c5d569c14cc70a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 3 Apr 2021 08:23:35 +0100 Subject: [PATCH 133/242] [youtube] prioritize information from YoutubeIE for playlist entries(closes #28619, closes #28636) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1f5497e24..2e027528d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -329,7 +329,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): (lambda x: x['ownerText']['runs'][0]['text'], lambda x: x['shortBylineText']['runs'][0]['text']), compat_str) return { - '_type': 'url_transparent', + '_type': 'url', 'ie_key': YoutubeIE.ie_key(), 'id': video_id, 'url': video_id, From aee6feb02adaa316455ea9497e92cc82b720f231 Mon Sep 17 00:00:00 2001 From: RomanEmelyanov Date: Sun, 4 Apr 2021 11:14:37 +0300 Subject: [PATCH 134/242] [youku] Update ccode(closes #17852, closes #28447, closes #28460) (#28648) --- youtube_dl/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 61d1ab209..880c89687 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0590', + 'ccode': '0532', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, From e165f5641fdf62975d3b6a40132a475c9cbaea2a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 15:52:14 +0100 Subject: [PATCH 135/242] [extractor/common] fix JSON-LD VideoObject author extraction --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fcbf18ee6..8ef22779a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -70,6 +70,7 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, + try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -1282,7 +1283,7 @@ class InfoExtractor(object): 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), - 'uploader': str_or_none(e.get('author')), + 'uploader': try_get(e, lambda x: x['author']['name'], compat_str), 'filesize': float_or_none(e.get('contentSize')), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), From 3ae9c0f410b1d4f63e8bada67dd62a8d2852be32 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 16:26:56 +0100 Subject: [PATCH 136/242] [vimeo] improve extraction(closes #28591) --- youtube_dl/extractor/vimeo.py | 239 ++++++++++++++-------------------- 1 file changed, 100 insertions(+), 139 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index a90cf0630..102687b82 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import base64 import functools -import json import re import itertools @@ -17,15 +16,14 @@ from ..compat import ( from ..utils import ( clean_html, determine_ext, - dict_get, ExtractorError, + get_element_by_class, js_to_json, int_or_none, merge_dicts, OnDemandPagedList, parse_filesize, parse_iso8601, - RegexNotFoundError, sanitized_Request, smuggle_url, std_headers, @@ -127,10 +125,11 @@ class VimeoBaseInfoExtractor(InfoExtractor): video_title = video_data['title'] live_event = video_data.get('live_event') or {} is_live = live_event.get('status') == 'started' + request = config.get('request') or {} formats = [] - config_files = video_data.get('files') or config['request'].get('files', {}) - for f in config_files.get('progressive', []): + config_files = video_data.get('files') or request.get('files') or {} + for f in (config_files.get('progressive') or []): video_url = f.get('url') if not video_url: continue @@ -146,7 +145,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): # TODO: fix handling of 308 status code returned for live archive manifest requests sep_pattern = r'/sep/video/' for files_type in ('hls', 'dash'): - for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items(): + for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items(): manifest_url = cdn_data.get('url') if not manifest_url: continue @@ -192,17 +191,15 @@ class VimeoBaseInfoExtractor(InfoExtractor): f['preference'] = -40 subtitles = {} - text_tracks = config['request'].get('text_tracks') - if text_tracks: - for tt in text_tracks: - subtitles[tt['lang']] = [{ - 'ext': 'vtt', - 'url': urljoin('https://vimeo.com', tt['url']), - }] + for tt in (request.get('text_tracks') or []): + subtitles[tt['lang']] = [{ + 'ext': 'vtt', + 'url': urljoin('https://vimeo.com', tt['url']), + }] thumbnails = [] if not is_live: - for key, thumb in video_data.get('thumbs', {}).items(): + for key, thumb in (video_data.get('thumbs') or {}).items(): thumbnails.append({ 'id': key, 'width': int_or_none(key), @@ -322,6 +319,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'duration': 1595, 'upload_date': '20130610', 'timestamp': 1370893156, + 'license': 'by', }, 'params': { 'format': 'best[protocol=https]', @@ -400,6 +398,12 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'staff', 'uploader': 'Vimeo Staff', 'duration': 62, + 'subtitles': { + 'de': [{'ext': 'vtt'}], + 'en': [{'ext': 'vtt'}], + 'es': [{'ext': 'vtt'}], + 'fr': [{'ext': 'vtt'}], + }, } }, { @@ -572,6 +576,37 @@ class VimeoIE(VimeoBaseInfoExtractor): def _real_initialize(self): self._login() + def _extract_from_api(self, video_id, unlisted_hash=None): + token = self._download_json( + 'https://vimeo.com/_rv/jwt', video_id, headers={ + 'X-Requested-With': 'XMLHttpRequest' + })['token'] + api_url = 'https://api.vimeo.com/videos/' + video_id + if unlisted_hash: + api_url += ':' + unlisted_hash + video = self._download_json( + api_url, video_id, headers={ + 'Authorization': 'jwt ' + token, + }, query={ + 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', + }) + info = self._parse_config(self._download_json( + video['config_url'], video_id), video_id) + self._vimeo_sort_formats(info['formats']) + get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) + info.update({ + 'description': video.get('description'), + 'license': video.get('license'), + 'release_timestamp': get_timestamp('release'), + 'timestamp': get_timestamp('created'), + 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), + }) + connections = try_get( + video, lambda x: x['metadata']['connections'], dict) or {} + for k in ('comment', 'like'): + info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) + return info + def _real_extract(self, url): url, data = unsmuggle_url(url, {}) headers = std_headers.copy() @@ -580,48 +615,19 @@ class VimeoIE(VimeoBaseInfoExtractor): if 'Referer' not in headers: headers['Referer'] = url - # Extract ID from URL - video_id, unlisted_hash = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url).groupdict() + video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash') if unlisted_hash: - token = self._download_json( - 'https://vimeo.com/_rv/jwt', video_id, headers={ - 'X-Requested-With': 'XMLHttpRequest' - })['token'] - video = self._download_json( - 'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash), - video_id, headers={ - 'Authorization': 'jwt ' + token, - }, query={ - 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', - }) - info = self._parse_config(self._download_json( - video['config_url'], video_id), video_id) - self._vimeo_sort_formats(info['formats']) - get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) - info.update({ - 'description': video.get('description'), - 'license': video.get('license'), - 'release_timestamp': get_timestamp('release'), - 'timestamp': get_timestamp('created'), - 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])), - }) - connections = try_get( - video, lambda x: x['metadata']['connections'], dict) or {} - for k in ('comment', 'like'): - info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total'])) - return info + return self._extract_from_api(video_id, unlisted_hash) orig_url = url is_pro = 'vimeopro.com/' in url - is_player = '://player.vimeo.com/video/' in url if is_pro: # some videos require portfolio_id to be present in player url # https://github.com/ytdl-org/youtube-dl/issues/20070 url = self._extract_url(url, self._download_webpage(url, video_id)) if not url: url = 'https://vimeo.com/' + video_id - elif is_player: - url = 'https://player.vimeo.com/video/' + video_id elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id @@ -641,14 +647,25 @@ class VimeoIE(VimeoBaseInfoExtractor): expected=True) raise - # Now we begin extracting as much information as we can from what we - # retrieved. First we extract the information common to all extractors, - # and latter we extract those that are Vimeo specific. - self.report_extraction(video_id) + if '://player.vimeo.com/video/' in url: + config = self._parse_json(self._search_regex( + r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) + if config.get('view') == 4: + config = self._verify_player_video_password( + redirect_url, video_id, headers) + info = self._parse_config(config, video_id) + self._vimeo_sort_formats(info['formats']) + return info + + if re.search(r']+?id="pw_form"', webpage): + video_password = self._get_video_password() + token, vuid = self._extract_xsrft_and_vuid(webpage) + webpage = self._verify_video_password( + redirect_url, video_id, video_password, token, vuid) vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: - seed_status = vimeo_config.get('seed_status', {}) + seed_status = vimeo_config.get('seed_status') or {} if seed_status.get('state') == 'failed': raise ExtractorError( '%s said: %s' % (self.IE_NAME, seed_status['title']), @@ -657,70 +674,40 @@ class VimeoIE(VimeoBaseInfoExtractor): cc_license = None timestamp = None video_description = None + info_dict = {} - # Extract the config JSON - try: - try: - config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, - 'config URL', default=None) - if not config_url: - # Sometimes new react-based page is served instead of old one that require - # different config URL extraction approach (see - # https://github.com/ytdl-org/youtube-dl/pull/7209) - page_config = self._parse_json(self._search_regex( - r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', - webpage, 'page config'), video_id) - config_url = page_config['player']['config_url'] - cc_license = page_config.get('cc_license') - timestamp = try_get( - page_config, lambda x: x['clip']['uploaded_on'], - compat_str) - video_description = clean_html(dict_get( - page_config, ('description', 'description_html_escaped'))) - config = self._download_json(config_url, video_id) - except RegexNotFoundError: - # For pro videos or player.vimeo.com urls - # We try to find out to which variable is assigned the config dic - m_variable_name = re.search(r'(\w)\.video\.id', webpage) - if m_variable_name is not None: - config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))] - else: - config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] - config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;') - config_re.append(r'\bconfig\s*=\s*({.+?})\s*;') - config = self._search_regex(config_re, webpage, 'info section', - flags=re.DOTALL) - config = json.loads(config) - except Exception as e: - if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): - raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option') - - if re.search(r']+?id="pw_form"', webpage) is not None: - if '_video_password_verified' in data: - raise ExtractorError('video password verification failed!') - video_password = self._get_video_password() - token, vuid = self._extract_xsrft_and_vuid(webpage) - self._verify_video_password( - redirect_url, video_id, video_password, token, vuid) - return self._real_extract( - smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) - else: - raise ExtractorError('Unable to extract info section', - cause=e) + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + if channel_id: + config_url = self._html_search_regex( + r'\bdata-config-url="([^"]+)"', webpage, 'config URL') + video_description = clean_html(get_element_by_class('description', webpage)) + info_dict.update({ + 'channel_id': channel_id, + 'channel_url': 'https://vimeo.com/channels/' + channel_id, + }) else: - if config.get('view') == 4: - config = self._verify_player_video_password(redirect_url, video_id, headers) - + page_config = self._parse_json(self._search_regex( + r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', + webpage, 'page config', default='{}'), video_id, fatal=False) + if not page_config: + return self._extract_from_api(video_id) + config_url = page_config['player']['config_url'] + cc_license = page_config.get('cc_license') + clip = page_config.get('clip') or {} + timestamp = clip.get('uploaded_on') + video_description = clean_html( + clip.get('description') or page_config.get('description_html_escaped')) + config = self._download_json(config_url, video_id) video = config.get('video') or {} vod = video.get('vod') or {} def is_rented(): if '>You rented this title.<' in webpage: return True - if config.get('user', {}).get('purchased'): + if try_get(config, lambda x: x['user']['purchased']): return True - for purchase_option in vod.get('purchase_options', []): + for purchase_option in (vod.get('purchase_options') or []): if purchase_option.get('purchased'): return True label = purchase_option.get('label_string') @@ -735,14 +722,10 @@ class VimeoIE(VimeoBaseInfoExtractor): 'https://player.vimeo.com/player/%s' % feature_id, {'force_feature_id': True}), 'Vimeo') - # Extract video description - if not video_description: - video_description = self._html_search_regex( - r'(?s)]*>(.*?)', - webpage, 'description', default=None) if not video_description: video_description = self._html_search_meta( - 'description', webpage, default=None) + ['description', 'og:description', 'twitter:description'], + webpage, default=None) if not video_description and is_pro: orig_webpage = self._download_webpage( orig_url, video_id, @@ -751,25 +734,14 @@ class VimeoIE(VimeoBaseInfoExtractor): if orig_webpage: video_description = self._html_search_meta( 'description', orig_webpage, default=None) - if not video_description and not is_player: + if not video_description: self._downloader.report_warning('Cannot find video description') - # Extract upload date if not timestamp: timestamp = self._search_regex( r']+datetime="([^"]+)"', webpage, 'timestamp', default=None) - try: - view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) - like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count')) - comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count')) - except RegexNotFoundError: - # This info is only available in vimeo.com/{id} urls - view_count = None - like_count = None - comment_count = None - formats = [] source_format = self._extract_original_format( @@ -788,31 +760,20 @@ class VimeoIE(VimeoBaseInfoExtractor): r']+rel=["\']license["\'][^>]+href=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') - channel_id = self._search_regex( - r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) - channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None - - info_dict = { + info_dict.update({ 'formats': formats, 'timestamp': unified_timestamp(timestamp), 'description': video_description, 'webpage_url': url, - 'view_count': view_count, - 'like_count': like_count, - 'comment_count': comment_count, 'license': cc_license, - 'channel_id': channel_id, - 'channel_url': channel_url, - } - - info_dict = merge_dicts(info_dict, info_dict_config, json_ld) + }) - return info_dict + return merge_dicts(info_dict, info_dict_config, json_ld) class VimeoOndemandIE(VimeoIE): IE_NAME = 'vimeo:ondemand' - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P[^/?#&]+)' _TESTS = [{ # ondemand video not available via https://vimeo.com/id 'url': 'https://vimeo.com/ondemand/20704', From 6beb1ac65b03415764c487fd139298f22e1e0313 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 19:16:17 +0100 Subject: [PATCH 137/242] [extractor/common] keep support for non standard JSON-LD VideoObject author values --- youtube_dl/extractor/common.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8ef22779a..78ff5b6d0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -70,7 +70,6 @@ from ..utils import ( str_or_none, str_to_int, strip_or_none, - try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -1276,6 +1275,7 @@ class InfoExtractor(object): def extract_video_object(e): assert e['@type'] == 'VideoObject' + author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), @@ -1283,7 +1283,11 @@ class InfoExtractor(object): 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), - 'uploader': try_get(e, lambda x: x['author']['name'], compat_str), + # author can be an instance of 'Organization' or 'Person' types. + # both types can have 'name' property(inherited from 'Thing' type). [1] + # however some websites are using 'Text' type instead. + # 1. https://schema.org/VideoObject + 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, 'filesize': float_or_none(e.get('contentSize')), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), From 162bf9e10a4e6a08f5ed156a68054ef9b4d2b60e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Apr 2021 19:49:24 +0100 Subject: [PATCH 138/242] [compat] add compat_SimpleCookie --- youtube_dl/compat.py | 9 +++++++++ youtube_dl/extractor/common.py | 9 +++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 6c3d49d45..8bbebebcf 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -73,6 +73,15 @@ try: except ImportError: # Python 2 import Cookie as compat_cookies +if sys.version_info[0] == 2: + class compat_SimpleCookie(compat_cookies.SimpleCookie): + def load(self, rawdata): + if isinstance(rawdata, unicode): + rawdata = str(rawdata) + return super(compat_SimpleCookie, self).load(rawdata) +else: + compat_SimpleCookie = compat_cookies.SimpleCookie + try: import html.entities as compat_html_entities except ImportError: # Python 2 diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78ff5b6d0..af289d705 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -17,13 +17,13 @@ import math from ..compat import ( compat_cookiejar_Cookie, - compat_cookies, compat_etree_Element, compat_etree_fromstring, compat_getpass, compat_integer_types, compat_http_client, compat_os_name, + compat_SimpleCookie, compat_str, compat_urllib_error, compat_urllib_parse_unquote, @@ -2901,13 +2901,10 @@ class InfoExtractor(object): self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_cookies.SimpleCookie with the cookies for the url """ + """ Return a compat_SimpleCookie with the cookies for the url """ req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) - cookie = req.get_header('Cookie') - if cookie and sys.version_info[0] == 2: - cookie = str(cookie) - return compat_cookies.SimpleCookie(cookie) + return compat_SimpleCookie(req.get_header('Cookie')) def _apply_first_set_cookie_header(self, url_handle, cookie): """ From 760c911299aa607ca967d6d4be2985528bacf29f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 5 Apr 2021 07:16:50 +0100 Subject: [PATCH 139/242] [compat] add compat_SimpleCookie to __all__ array --- youtube_dl/compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 8bbebebcf..8a5262dc8 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -3002,6 +3002,7 @@ __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', + 'compat_SimpleCookie', 'compat_Struct', 'compat_b64decode', 'compat_basestring', From 25b1287323f5836c9416a8183096adc63809d5ce Mon Sep 17 00:00:00 2001 From: guredora Date: Sun, 4 Apr 2021 22:12:07 +0900 Subject: [PATCH 140/242] [line] add support live.line.me (closes #17205)(closes #28658) --- youtube_dl/extractor/extractors.py | 6 +- youtube_dl/extractor/line.py | 142 ++++++++++++++++++++++++++++- 2 files changed, 146 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 65fefabe8..d5cd364e8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -595,7 +595,11 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) -from .line import LineTVIE +from .line import ( + LineTVIE, + LineLiveIE, + LineLiveChannelIE, +) from .linkedin import ( LinkedInLearningIE, LinkedInLearningCourseIE, diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py index 7f5fa446e..2526daa77 100644 --- a/youtube_dl/extractor/line.py +++ b/youtube_dl/extractor/line.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import js_to_json +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + js_to_json, + str_or_none, +) class LineTVIE(InfoExtractor): @@ -88,3 +94,137 @@ class LineTVIE(InfoExtractor): for thumbnail in video_info.get('thumbnails', {}).get('list', [])], 'view_count': video_info.get('meta', {}).get('count'), } + + +class LineLiveBaseIE(InfoExtractor): + _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/' + + def _parse_broadcast_item(self, item): + broadcast_id = compat_str(item['id']) + title = item['title'] + is_live = item.get('isBroadcastingNow') + + thumbnails = [] + for thumbnail_id, thumbnail_url in (item.get('thumbnailURLs') or {}).items(): + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail_url, + }) + + channel = item.get('channel') or {} + channel_id = str_or_none(channel.get('id')) + + return { + 'id': broadcast_id, + 'title': self._live_title(title) if is_live else title, + 'thumbnails': thumbnails, + 'timestamp': int_or_none(item.get('createdAt')), + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None, + 'duration': int_or_none(item.get('archiveDuration')), + 'view_count': int_or_none(item.get('viewerCount')), + 'comment_count': int_or_none(item.get('chatCount')), + 'is_live': is_live, + } + + +class LineLiveIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)/broadcast/(?P\d+)' + _TESTS = [{ + 'url': 'https://live.line.me/channels/4867368/broadcast/16331360', + 'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3', + 'info_dict': { + 'id': '16331360', + 'title': '振りコピ講座😙😙😙', + 'ext': 'mp4', + 'timestamp': 1617095132, + 'upload_date': '20210330', + 'channel': '白川ゆめか', + 'channel_id': '4867368', + 'view_count': int, + 'comment_count': int, + 'is_live': False, + } + }, { + # archiveStatus == 'DELETED' + 'url': 'https://live.line.me/channels/4778159/broadcast/16378488', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id, broadcast_id = re.match(self._VALID_URL, url).groups() + broadcast = self._download_json( + self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id), + broadcast_id) + item = broadcast['item'] + info = self._parse_broadcast_item(item) + protocol = 'm3u8' if info['is_live'] else 'm3u8_native' + formats = [] + for k, v in (broadcast.get(('live' if info['is_live'] else 'archived') + 'HLSURLs') or {}).items(): + if not v: + continue + if k == 'abr': + formats.extend(self._extract_m3u8_formats( + v, broadcast_id, 'mp4', protocol, + m3u8_id='hls', fatal=False)) + continue + f = { + 'ext': 'mp4', + 'format_id': 'hls-' + k, + 'protocol': protocol, + 'url': v, + } + if not k.isdigit(): + f['vcodec'] = 'none' + formats.append(f) + if not formats: + archive_status = item.get('archiveStatus') + if archive_status != 'ARCHIVED': + raise ExtractorError('this video has been ' + archive_status.lower(), expected=True) + self._sort_formats(formats) + info['formats'] = formats + return info + + +class LineLiveChannelIE(LineLiveBaseIE): + _VALID_URL = r'https?://live\.line\.me/channels/(?P\d+)(?!/broadcast/\d+)(?:[/?&#]|$)' + _TEST = { + 'url': 'https://live.line.me/channels/5893542', + 'info_dict': { + 'id': '5893542', + 'title': 'いくらちゃん', + 'description': 'md5:c3a4af801f43b2fac0b02294976580be', + }, + 'playlist_mincount': 29 + } + + def _archived_broadcasts_entries(self, archived_broadcasts, channel_id): + while True: + for row in (archived_broadcasts.get('rows') or []): + share_url = str_or_none(row.get('shareURL')) + if not share_url: + continue + info = self._parse_broadcast_item(row) + info.update({ + '_type': 'url', + 'url': share_url, + 'ie_key': LineLiveIE.ie_key(), + }) + yield info + if not archived_broadcasts.get('hasNextPage'): + return + archived_broadcasts = self._download_json( + self._API_BASE_URL + channel_id + '/archived_broadcasts', + channel_id, query={ + 'lastId': info['id'], + }) + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel = self._download_json(self._API_BASE_URL + channel_id, channel_id) + return self.playlist_result( + self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id), + channel_id, channel.get('title'), channel.get('information')) From 6b315d96bc0b07ddc3abaa7318583775828cce30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 6 Apr 2021 14:15:13 +0700 Subject: [PATCH 141/242] [compat] flake8 --- youtube_dl/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 8a5262dc8..566e9d5ec 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -76,7 +76,7 @@ except ImportError: # Python 2 if sys.version_info[0] == 2: class compat_SimpleCookie(compat_cookies.SimpleCookie): def load(self, rawdata): - if isinstance(rawdata, unicode): + if isinstance(rawdata, compat_str): rawdata = str(rawdata) return super(compat_SimpleCookie, self).load(rawdata) else: From 70d0d4f9beba0e5b6d95ee50ad62ae7ab5be9be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 6 Apr 2021 14:22:28 +0700 Subject: [PATCH 142/242] [compat] Use more conventional name for compat SimpleCookie --- youtube_dl/compat.py | 8 ++++---- youtube_dl/extractor/common.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 566e9d5ec..9e45c454b 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -74,13 +74,13 @@ except ImportError: # Python 2 import Cookie as compat_cookies if sys.version_info[0] == 2: - class compat_SimpleCookie(compat_cookies.SimpleCookie): + class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): def load(self, rawdata): if isinstance(rawdata, compat_str): rawdata = str(rawdata) - return super(compat_SimpleCookie, self).load(rawdata) + return super(compat_cookies_SimpleCookie, self).load(rawdata) else: - compat_SimpleCookie = compat_cookies.SimpleCookie + compat_cookies_SimpleCookie = compat_cookies.SimpleCookie try: import html.entities as compat_html_entities @@ -3002,7 +3002,6 @@ __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', - 'compat_SimpleCookie', 'compat_Struct', 'compat_b64decode', 'compat_basestring', @@ -3010,6 +3009,7 @@ __all__ = [ 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', + 'compat_cookies_SimpleCookie', 'compat_ctypes_WINFUNCTYPE', 'compat_etree_Element', 'compat_etree_fromstring', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index af289d705..797c35fd5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -17,13 +17,13 @@ import math from ..compat import ( compat_cookiejar_Cookie, + compat_cookies_SimpleCookie, compat_etree_Element, compat_etree_fromstring, compat_getpass, compat_integer_types, compat_http_client, compat_os_name, - compat_SimpleCookie, compat_str, compat_urllib_error, compat_urllib_parse_unquote, @@ -2901,10 +2901,10 @@ class InfoExtractor(object): self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_SimpleCookie with the cookies for the url """ + """ Return a compat_cookies_SimpleCookie with the cookies for the url """ req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) - return compat_SimpleCookie(req.get_header('Cookie')) + return compat_cookies_SimpleCookie(req.get_header('Cookie')) def _apply_first_set_cookie_header(self, url_handle, cookie): """ From 6b116f0c03ac0b1aff01cd08bbe1d5cb87dff853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 7 Apr 2021 03:34:43 +0700 Subject: [PATCH 143/242] [youtube] Fix videos with restricted location (closes #28685) --- youtube_dl/extractor/youtube.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2e027528d..6b4c7912c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1084,6 +1084,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg', 'only_matching': True, }, + { + # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685 + 'url': 'cBvYw8_A0vQ', + 'info_dict': { + 'id': 'cBvYw8_A0vQ', + 'ext': 'mp4', + 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き', + 'description': 'md5:ea770e474b7cd6722b4c95b833c03630', + 'upload_date': '20201120', + 'uploader': 'Walk around Japan', + 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw', + }, + 'params': { + 'skip_download': True, + }, + }, ] _formats = { '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -1485,7 +1502,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def get_text(x): if not x: return - return x.get('simpleText') or ''.join([r['text'] for r in x['runs']]) + text = x.get('simpleText') + if text and isinstance(text, compat_str): + return text + runs = x.get('runs') + if not isinstance(runs, list): + return + return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)]) search_meta = ( lambda x: self._html_search_meta(x, webpage, default=None)) \ From 445db582a27c44cb02d57ac9171d58651cafbd76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 7 Apr 2021 03:35:25 +0700 Subject: [PATCH 144/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4304ecd9e..e5e546744 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +version + +Core +* [extractor/common] Use compat_cookies_SimpleCookie for _get_cookies ++ [compat] Introduce compat_cookies_SimpleCookie +* [extractor/common] Improve JSON-LD author extraction +* [extractor/common] Fix _get_cookies on python 2 (#20673, #23256, #20326, + #28640) + +Extractors +* [youtube] Fix extraction of videos with restricted location (#28685) ++ [line] Add support for live.line.me (#17205, #28658) +* [vimeo] Improve extraction (#28591) +* [youku] Update ccode (#17852, #28447, #28460, #28648) +* [youtube] Prefer direct entry metadata over entry metadata from playlist + (#28619, #28636) +* [screencastomatic] Fix extraction (#11976, #24489) ++ [palcomp3] Add support for palcomp3.com (#13120) ++ [arnes] Add support for video.arnes.si (#28483) ++ [youtube:tab] Add support for hashtags (#28308) + + version 2021.04.01 Extractors From 72a2c0a9ede04c6b82235e453b1a933faf072a76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 7 Apr 2021 03:42:24 +0700 Subject: [PATCH 145/242] release 2021.04.07 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 6 ++++++ youtube_dl/version.py | 2 +- 8 files changed, 20 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index 98ec799e8..febbd2344 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.01 + [debug] youtube-dl version 2021.04.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 5387a6cd1..d7296d0a9 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 945c80366..92e616a1a 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 0acc8b679..b55739f6c 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.01 + [debug] youtube-dl version 2021.04.07 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 42c3126a3..dbdb8356a 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.01** +- [ ] I've verified that I'm running youtube-dl version **2021.04.07** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index e5e546744..22b4fa67d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.07 Core * [extractor/common] Use compat_cookies_SimpleCookie for _get_cookies diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d2ad937a4..ff9177a2c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -463,6 +463,8 @@ - **limelight** - **limelight:channel** - **limelight:channel_list** + - **LineLive** + - **LineLiveChannel** - **LineTV** - **linkedin:learning** - **linkedin:learning:course** @@ -679,6 +681,9 @@ - **OutsideTV** - **PacktPub** - **PacktPubCourse** + - **PalcoMP3:artist** + - **PalcoMP3:song** + - **PalcoMP3:video** - **pandora.tv**: 판도라TV - **ParamountNetwork** - **parliamentlive.tv**: UK parliament videos @@ -1059,6 +1064,7 @@ - **Vidbit** - **Viddler** - **Videa** + - **video.arnes.si**: Arnes Video - **video.google:search**: Google Video search - **video.sky.it** - **video.sky.it:live** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0457d1a15..a6b1b8dce 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.04.01' +__version__ = '2021.04.07' From c0c5134c5771dd2a1caeeaee62dcd207d169e981 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 7 Apr 2021 09:27:05 +0100 Subject: [PATCH 146/242] [curiositystream] fix format extraction(closes #26845, closes #28668) --- youtube_dl/extractor/curiositystream.py | 103 +++++++++++++----------- 1 file changed, 58 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index e4a7fca6c..ae64a07d7 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -25,12 +25,12 @@ class CuriosityStreamBaseIE(InfoExtractor): raise ExtractorError( '%s said: %s' % (self.IE_NAME, error), expected=True) - def _call_api(self, path, video_id): + def _call_api(self, path, video_id, query=None): headers = {} if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( - self._API_BASE_URL + path, video_id, headers=headers) + self._API_BASE_URL + path, video_id, headers=headers, query=query) self._handle_errors(result) return result['data'] @@ -52,62 +52,75 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P\d+)' _TEST = { 'url': 'https://app.curiositystream.com/video/2', - 'md5': '262bb2f257ff301115f1973540de8983', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', - } + }, + 'params': { + 'format': 'bestvideo', + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url) - media = self._call_api('media/' + video_id, video_id) - title = media['title'] formats = [] - for encoding in media.get('encodings', []): - m3u8_url = encoding.get('master_playlist_url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - encoding_url = encoding.get('url') - file_url = encoding.get('file_url') - if not encoding_url and not file_url: - continue - f = { - 'width': int_or_none(encoding.get('width')), - 'height': int_or_none(encoding.get('height')), - 'vbr': int_or_none(encoding.get('video_bitrate')), - 'abr': int_or_none(encoding.get('audio_bitrate')), - 'filesize': int_or_none(encoding.get('size_in_bytes')), - 'vcodec': encoding.get('video_codec'), - 'acodec': encoding.get('audio_codec'), - 'container': encoding.get('container_type'), - } - for f_url in (encoding_url, file_url): - if not f_url: + for encoding_format in ('m3u8', 'mpd'): + media = self._call_api('media/' + video_id, video_id, query={ + 'encodingsNew': 'true', + 'encodingsFormat': encoding_format, + }) + for encoding in media.get('encodings', []): + playlist_url = encoding.get('master_playlist_url') + if encoding_format == 'm3u8': + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol + formats.extend(self._extract_m3u8_formats( + playlist_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + elif encoding_format == 'mpd': + formats.extend(self._extract_mpd_formats( + playlist_url, video_id, mpd_id='dash', fatal=False)) + encoding_url = encoding.get('url') + file_url = encoding.get('file_url') + if not encoding_url and not file_url: continue - fmt = f.copy() - rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp[34]:.+)$', f_url) - if rtmp: - fmt.update({ - 'url': rtmp.group('url'), - 'play_path': rtmp.group('playpath'), - 'app': rtmp.group('app'), - 'ext': 'flv', - 'format_id': 'rtmp', - }) - else: - fmt.update({ - 'url': f_url, - 'format_id': 'http', - }) - formats.append(fmt) + f = { + 'width': int_or_none(encoding.get('width')), + 'height': int_or_none(encoding.get('height')), + 'vbr': int_or_none(encoding.get('video_bitrate')), + 'abr': int_or_none(encoding.get('audio_bitrate')), + 'filesize': int_or_none(encoding.get('size_in_bytes')), + 'vcodec': encoding.get('video_codec'), + 'acodec': encoding.get('audio_codec'), + 'container': encoding.get('container_type'), + } + for f_url in (encoding_url, file_url): + if not f_url: + continue + fmt = f.copy() + rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp[34]:.+)$', f_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': 'rtmp', + }) + else: + fmt.update({ + 'url': f_url, + 'format_id': 'http', + }) + formats.append(fmt) self._sort_formats(formats) + title = media['title'] + subtitles = {} for closed_caption in media.get('closed_captions', []): sub_url = closed_caption.get('file') @@ -140,7 +153,7 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): 'title': 'Curious Minds: The Internet', 'description': 'How is the internet shaping our lives in the 21st Century?', }, - 'playlist_mincount': 17, + 'playlist_mincount': 16, }, { 'url': 'https://curiositystream.com/series/2', 'only_matching': True, From 281b8e34432d8dba9902be2c1eb77d3e6371cd73 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 7 Apr 2021 10:41:06 +0100 Subject: [PATCH 147/242] [jamendo] fix track extraction(closes #28686) --- youtube_dl/extractor/jamendo.py | 74 ++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index 490efa8fb..1db7c64af 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -29,34 +29,51 @@ class JamendoIE(InfoExtractor): 'id': '196219', 'display_id': 'stories-from-emona-i', 'ext': 'flac', - 'title': 'Maya Filipič - Stories from Emona I', - 'artist': 'Maya Filipič', + # 'title': 'Maya Filipič - Stories from Emona I', + 'title': 'Stories from Emona I', + # 'artist': 'Maya Filipič', 'track': 'Stories from Emona I', 'duration': 210, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1217438117, 'upload_date': '20080730', + 'license': 'by-nc-nd', + 'view_count': int, + 'like_count': int, + 'average_rating': int, + 'tags': ['piano', 'peaceful', 'newage', 'strings', 'upbeat'], } }, { 'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', 'only_matching': True, }] + def _call_api(self, resource, resource_id): + path = '/api/%ss' % resource + rand = compat_str(random.random()) + return self._download_json( + 'https://www.jamendo.com' + path, resource_id, query={ + 'id[]': resource_id, + }, headers={ + 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) + })[0] + def _real_extract(self, url): track_id, display_id = self._VALID_URL_RE.match(url).groups() - webpage = self._download_webpage( - 'https://www.jamendo.com/track/' + track_id, track_id) - models = self._parse_json(self._html_search_regex( - r"data-bundled-models='([^']+)", - webpage, 'bundled models'), track_id) - track = models['track']['models'][0] + # webpage = self._download_webpage( + # 'https://www.jamendo.com/track/' + track_id, track_id) + # models = self._parse_json(self._html_search_regex( + # r"data-bundled-models='([^']+)", + # webpage, 'bundled models'), track_id) + # track = models['track']['models'][0] + track = self._call_api('track', track_id) title = track_name = track['name'] - get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} - artist = get_model('artist') - artist_name = artist.get('name') - if artist_name: - title = '%s - %s' % (artist_name, title) - album = get_model('album') + # get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {} + # artist = get_model('artist') + # artist_name = artist.get('name') + # if artist_name: + # title = '%s - %s' % (artist_name, title) + # album = get_model('album') formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -74,7 +91,7 @@ class JamendoIE(InfoExtractor): urls = [] thumbnails = [] - for _, covers in track.get('cover', {}).items(): + for covers in (track.get('cover') or {}).values(): for cover_id, cover_url in covers.items(): if not cover_url or cover_url in urls: continue @@ -88,13 +105,14 @@ class JamendoIE(InfoExtractor): }) tags = [] - for tag in track.get('tags', []): + for tag in (track.get('tags') or []): tag_name = tag.get('name') if not tag_name: continue tags.append(tag_name) stats = track.get('stats') or {} + license = track.get('licenseCC') or [] return { 'id': track_id, @@ -103,11 +121,11 @@ class JamendoIE(InfoExtractor): 'title': title, 'description': track.get('description'), 'duration': int_or_none(track.get('duration')), - 'artist': artist_name, + # 'artist': artist_name, 'track': track_name, - 'album': album.get('name'), + # 'album': album.get('name'), 'formats': formats, - 'license': '-'.join(track.get('licenseCC', [])) or None, + 'license': '-'.join(license) if license else None, 'timestamp': int_or_none(track.get('dateCreated')), 'view_count': int_or_none(stats.get('listenedAll')), 'like_count': int_or_none(stats.get('favorited')), @@ -116,9 +134,9 @@ class JamendoIE(InfoExtractor): } -class JamendoAlbumIE(InfoExtractor): +class JamendoAlbumIE(JamendoIE): _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', 'info_dict': { 'id': '121486', @@ -151,17 +169,7 @@ class JamendoAlbumIE(InfoExtractor): 'params': { 'playlistend': 2 } - } - - def _call_api(self, resource, resource_id): - path = '/api/%ss' % resource - rand = compat_str(random.random()) - return self._download_json( - 'https://www.jamendo.com' + path, resource_id, query={ - 'id[]': resource_id, - }, headers={ - 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) - })[0] + }] def _real_extract(self, url): album_id = self._match_id(url) @@ -169,7 +177,7 @@ class JamendoAlbumIE(InfoExtractor): album_name = album.get('name') entries = [] - for track in album.get('tracks', []): + for track in (album.get('tracks') or []): track_id = track.get('id') if not track_id: continue From 006eea564d55130bb2e2ea7feb3a0e286d75d91f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 7 Apr 2021 14:01:48 +0100 Subject: [PATCH 148/242] [cbssports] fix extraction(closes #28682) --- youtube_dl/extractor/cbssports.py | 123 +++++++++++++++++++++++------ youtube_dl/extractor/extractors.py | 6 +- 2 files changed, 104 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 83b764762..a891c9a55 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -1,38 +1,113 @@ from __future__ import unicode_literals -from .cbs import CBSBaseIE +import re +# from .cbs import CBSBaseIE +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, +) -class CBSSportsIE(CBSBaseIE): - _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P[^/?#&]+)' +# class CBSSportsEmbedIE(CBSBaseIE): +class CBSSportsEmbedIE(InfoExtractor): + IE_NAME = 'cbssports:embed' + _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+? + (?: + ids%3D(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})| + pcid%3D(?P\d+) + )''' _TESTS = [{ - 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/', - 'info_dict': { - 'id': '1214315075735', - 'ext': 'mp4', - 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder', - 'description': 'md5:df6f48622612c2d6bd2e295ddef58def', - 'timestamp': 1524111457, - 'upload_date': '20180419', - 'uploader': 'CBSI-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } + 'url': 'https://www.cbssports.com/player/embed/?args=player_id%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26ids%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26resizable%3D1%26autoplay%3Dtrue%26domain%3Dcbssports.com%26comp_ads_enabled%3Dfalse%26watchAndRead%3D0%26startTime%3D0%26env%3Dprod', + 'only_matching': True, }, { - 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/', + 'url': 'https://embed.247sports.com/player/embed/?args=%3fplayer_id%3d1827823171591%26channel%3dcollege-football-recruiting%26pcid%3d1827823171591%26width%3d640%26height%3d360%26autoplay%3dTrue%26comp_ads_enabled%3dFalse%26uvpc%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_v4%2526partner%253d247%26uvpc_m%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_m_v4%2526partner_m%253d247_mobile%26utag%3d247sportssite%26resizable%3dTrue', 'only_matching': True, }] - def _extract_video_info(self, filter_query, video_id): - return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + # def _extract_video_info(self, filter_query, video_id): + # return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + def _real_extract(self, url): + uuid, pcid = re.match(self._VALID_URL, url).groups() + query = {'id': uuid} if uuid else {'pcid': pcid} + video = self._download_json( + 'https://www.cbssports.com/api/content/video/', + uuid or pcid, query=query)[0] + video_id = video['id'] + title = video['title'] + metadata = video.get('metaData') or {} + # return self._extract_video_info('byId=%d' % metadata['mpxOutletId'], video_id) + # return self._extract_video_info('byGuid=' + metadata['mpxRefId'], video_id) + + formats = self._extract_m3u8_formats( + metadata['files'][0]['url'], video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + + image = video.get('image') + thumbnails = None + if image: + image_path = image.get('path') + if image_path: + thumbnails = [{ + 'url': image_path, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + 'filesize': int_or_none(image.get('size')), + }] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video.get('description'), + 'timestamp': int_or_none(try_get(video, lambda x: x['dateCreated']['epoch'])), + 'duration': int_or_none(metadata.get('duration')), + } + + +class CBSSportsBaseIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], - webpage, 'video id') - return self._extract_video_info('byId=%s' % video_id, video_id) + iframe_url = self._search_regex( + r']+(?:data-)?src="(https?://[^/]+/player/embed[^"]+)"', + webpage, 'embed url') + return self.url_result(iframe_url, CBSSportsEmbedIE.ie_key()) + + +class CBSSportsIE(CBSSportsBaseIE): + IE_NAME = 'cbssports' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cbssports.com/college-football/video/cover-3-stanford-spring-gleaning/', + 'info_dict': { + 'id': 'b56c03a6-231a-4bbe-9c55-af3c8a8e9636', + 'ext': 'mp4', + 'title': 'Cover 3: Stanford Spring Gleaning', + 'description': 'The Cover 3 crew break down everything you need to know about the Stanford Cardinal this spring.', + 'timestamp': 1617218398, + 'upload_date': '20210331', + 'duration': 502, + }, + }] + + +class TwentyFourSevenSportsIE(CBSSportsBaseIE): + IE_NAME = '247sports' + _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P\d+)' + _TESTS = [{ + 'url': 'https://247sports.com/Video/2021-QB-Jake-Garcia-senior-highlights-through-five-games-10084854/', + 'info_dict': { + 'id': '4f1265cb-c3b5-44a8-bb1d-1914119a0ccc', + 'ext': 'mp4', + 'title': '2021 QB Jake Garcia senior highlights through five games', + 'description': 'md5:8cb67ebed48e2e6adac1701e0ff6e45b', + 'timestamp': 1607114223, + 'upload_date': '20201204', + 'duration': 208, + }, + }] diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d5cd364e8..5ff9110b4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -191,7 +191,11 @@ from .cbsnews import ( CBSNewsIE, CBSNewsLiveVideoIE, ) -from .cbssports import CBSSportsIE +from .cbssports import ( + CBSSportsEmbedIE, + CBSSportsIE, + TwentyFourSevenSportsIE, +) from .ccc import ( CCCIE, CCCPlaylistIE, From 545d6cb9d06a8bf32bcd24463c0fd25e650bb2c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 8 Apr 2021 15:32:59 +0700 Subject: [PATCH 149/242] [pornhub] Extract DASH and HLS formats from get_media end point (closes #28698) --- youtube_dl/extractor/pornhub.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 2a7818e41..031454600 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -398,6 +398,16 @@ class PornHubIE(PornHubBaseIE): formats = [] def add_format(format_url, height=None): + ext = determine_ext(format_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + return + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + return tbr = None mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', format_url) if mobj: @@ -417,16 +427,6 @@ class PornHubIE(PornHubBaseIE): r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) if upload_date: upload_date = upload_date.replace('/', '') - ext = determine_ext(video_url) - if ext == 'mpd': - formats.extend(self._extract_mpd_formats( - video_url, video_id, mpd_id='dash', fatal=False)) - continue - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue if '/video/get_media' in video_url: medias = self._download_json(video_url, video_id, fatal=False) if isinstance(medias, list): From 27e5a4464d1d4c418d4937492e18a9d47d30fc50 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 8 Apr 2021 18:53:36 +0100 Subject: [PATCH 150/242] [mtv] Fix Viacom A/B Testing Video Player extraction(closes #28703) --- youtube_dl/extractor/mtv.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 600cf2d89..5a5205c0e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -255,7 +255,9 @@ class MTVServicesInfoExtractor(InfoExtractor): @staticmethod def _extract_child_with_type(parent, t): - return next(c for c in parent['children'] if c.get('type') == t) + for c in parent['children']: + if c.get('type') == t: + return c def _extract_mgid(self, webpage): try: @@ -286,7 +288,8 @@ class MTVServicesInfoExtractor(InfoExtractor): data = self._parse_json(self._search_regex( r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) main_container = self._extract_child_with_type(data, 'MainContainer') - video_player = self._extract_child_with_type(main_container, 'VideoPlayer') + ab_testing = self._extract_child_with_type(main_container, 'ABTesting') + video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer') mgid = video_player['props']['media']['video']['config']['uri'] return mgid From 1b0a13f33cfb3644cc718d35951ea85bb1905459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 9 Apr 2021 02:09:52 +0700 Subject: [PATCH 151/242] [youtube:tab] Pass innertube context and x-goog-visitor-id header along with continuation requests (closes #28702) --- youtube_dl/extractor/youtube.py | 42 +++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6b4c7912c..79e47c919 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -306,7 +306,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return self._parse_json( self._search_regex( r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', - default='{}'), video_id, fatal=False) + default='{}'), video_id, fatal=False) or {} def _extract_video(self, renderer): video_id = renderer['videoId'] @@ -2475,7 +2475,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): ctp = continuation_ep.get('clickTrackingParams') return YoutubeTabIE._build_continuation_query(continuation, ctp) - def _entries(self, tab, identity_token): + def _entries(self, tab, item_id, webpage): tab_content = try_get(tab, lambda x: x['content'], dict) if not tab_content: return @@ -2535,26 +2535,37 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield entry continuation = self._extract_continuation(rich_grid_renderer) + ytcfg = self._extract_ytcfg(item_id, webpage) + client_version = try_get( + ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str) or '2.20210407.08.00' + headers = { 'x-youtube-client-name': '1', - 'x-youtube-client-version': '2.20201112.04.01', + 'x-youtube-client-version': client_version, 'content-type': 'application/json', } + + context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'], dict) or { + 'client': { + 'clientName': 'WEB', + 'clientVersion': client_version, + } + } + visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str) + + identity_token = self._extract_identity_token(ytcfg, webpage) if identity_token: headers['x-youtube-identity-token'] = identity_token data = { - 'context': { - 'client': { - 'clientName': 'WEB', - 'clientVersion': '2.20201021.03.00', - } - }, + 'context': context, } for page_num in itertools.count(1): if not continuation: break + if visitor_data: + headers['x-goog-visitor-id'] = visitor_data data['continuation'] = continuation['continuation'] data['clickTracking'] = { 'clickTrackingParams': continuation['itct'] @@ -2579,6 +2590,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not response: break + visitor_data = try_get( + response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data + continuation_contents = try_get( response, lambda x: x['continuationContents'], dict) if continuation_contents: @@ -2687,7 +2701,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): alerts.append(text) return '\n'.join(alerts) - def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): + def _extract_from_tabs(self, item_id, webpage, data, tabs): selected_tab = self._extract_selected_tab(tabs) renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) @@ -2712,7 +2726,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if renderer: title = try_get(renderer, lambda x: x['hashtag']['simpleText']) playlist = self.playlist_result( - self._entries(selected_tab, identity_token), + self._entries(selected_tab, item_id, webpage), playlist_id=playlist_id, playlist_title=title, playlist_description=description) playlist.update(self._extract_uploader(data)) @@ -2736,8 +2750,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): self._playlist_entries(playlist), playlist_id=playlist_id, playlist_title=title) - def _extract_identity_token(self, webpage, item_id): - ytcfg = self._extract_ytcfg(item_id, webpage) + def _extract_identity_token(self, ytcfg, webpage): if ytcfg: token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) if token: @@ -2760,12 +2773,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) webpage = self._download_webpage(url, item_id) - identity_token = self._extract_identity_token(webpage, item_id) data = self._extract_yt_initial_data(item_id, webpage) tabs = try_get( data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) if tabs: - return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) + return self._extract_from_tabs(item_id, webpage, data, tabs) playlist = try_get( data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) if playlist: From 4fb25ff5a3be5206bb72e5c4046715b1529fb2c7 Mon Sep 17 00:00:00 2001 From: Aaron Lipinski Date: Thu, 8 Apr 2021 19:59:36 +1200 Subject: [PATCH 152/242] [maoritv] Add new extractor(closes #24552) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/maoritv.py | 31 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 youtube_dl/extractor/maoritv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5ff9110b4..ac33cd996 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -640,6 +640,7 @@ from .mangomolo import ( MangomoloLiveIE, ) from .manyvids import ManyVidsIE +from .maoritv import MaoriTVIE from .markiza import ( MarkizaIE, MarkizaPageIE, diff --git a/youtube_dl/extractor/maoritv.py b/youtube_dl/extractor/maoritv.py new file mode 100644 index 000000000..0d23fec75 --- /dev/null +++ b/youtube_dl/extractor/maoritv.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MaoriTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?maoritelevision\.com/shows/(?:[^/]+/)+(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.maoritelevision.com/shows/korero-mai/S01E054/korero-mai-series-1-episode-54', + 'md5': '5ade8ef53851b6a132c051b1cd858899', + 'info_dict': { + 'id': '4774724855001', + 'ext': 'mp4', + 'title': 'Kōrero Mai, Series 1 Episode 54', + 'upload_date': '20160226', + 'timestamp': 1456455018, + 'description': 'md5:59bde32fd066d637a1a55794c56d8dcb', + 'uploader_id': '1614493167001', + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1614493167001/HJlhIQhQf_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_id = self._search_regex( + r'data-main-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'BrightcoveNew', brightcove_id) From 06159135ef148a6ddc632d0c89b90c937d5bb021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 00:07:32 +0700 Subject: [PATCH 153/242] [youtube] Improve URL to extractor routing (closes #27572, closes #28335, closes #28742) --- youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 79e47c919..4d7f3f837 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -46,6 +46,10 @@ from ..utils import ( ) +def parse_qs(url): + return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + + class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' @@ -413,16 +417,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID - (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?\blist= - (?: - %(playlist_id)s| # combined list/video URLs are handled by the playlist IE - WL # WL are handled by the watch later IE - ) - ) + (?P[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?(1).+)? # if we found the ID, everything can follow $""" % { - 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, 'invidious': '|'.join(_INVIDIOUS_SITES), } _PLAYER_INFO_RE = ( @@ -1208,6 +1205,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, } + @classmethod + def suitable(cls, url): + qs = parse_qs(url) + if qs.get('list', [None])[0]: + return False + return super(YoutubeIE, cls).suitable(url) + def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) self._code_cache = {} @@ -2275,6 +2279,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': '#cctv9', }, 'playlist_mincount': 350, + }, { + 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', + 'only_matching': True, }] @classmethod @@ -2764,7 +2771,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): url = compat_urlparse.urlunparse( compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) # Handle both video/playlist URLs - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + qs = parse_qs(url) video_id = qs.get('v', [None])[0] playlist_id = qs.get('list', [None])[0] if video_id and playlist_id: @@ -2860,12 +2867,16 @@ class YoutubePlaylistIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if YoutubeTabIE.suitable(url) else super( - YoutubePlaylistIE, cls).suitable(url) + if YoutubeTabIE.suitable(url): + return False + qs = parse_qs(url) + if qs.get('v', [None])[0]: + return False + return super(YoutubePlaylistIE, cls).suitable(url) def _real_extract(self, url): playlist_id = self._match_id(url) - qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + qs = parse_qs(url) if not qs: qs = {'list': playlist_id} return self.url_result( From 79e4ccfc4b395127bb3e5e957b20b04e75cba355 Mon Sep 17 00:00:00 2001 From: quyleanh Date: Sat, 17 Apr 2021 00:30:10 +0700 Subject: [PATCH 154/242] [pluralsight] Extend anti-throttling timeout (#28712) --- youtube_dl/extractor/pluralsight.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index abd08bc28..2d63855df 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -393,7 +393,7 @@ query viewClip { # To somewhat reduce the probability of these consequences # we will sleep random amount of time before each call to ViewClip. self._sleep( - random.randint(2, 5), display_id, + random.randint(5, 10), display_id, '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling') if not viewclip: From d01e261a15abd779decae6e0858d8586f7a71621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20=C3=81vila?= Date: Fri, 16 Apr 2021 14:31:34 -0300 Subject: [PATCH 155/242] [youtube] Add more invidious instances (#28706) --- youtube_dl/extractor/youtube.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4d7f3f837..7fa9b473a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -359,21 +359,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'(?:www\.)?invidious\.mastodon\.host', r'(?:www\.)?invidious\.zapashcanon\.fr', r'(?:www\.)?invidious\.kavin\.rocks', + r'(?:www\.)?invidious\.tinfoil-hat\.net', + r'(?:www\.)?invidious\.himiko\.cloud', + r'(?:www\.)?invidious\.reallyancient\.tech', r'(?:www\.)?invidious\.tube', r'(?:www\.)?invidiou\.site', r'(?:www\.)?invidious\.site', r'(?:www\.)?invidious\.xyz', r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.048596\.xyz', r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?inv\.skyn3t\.in', r'(?:www\.)?tube\.poal\.co', r'(?:www\.)?tube\.connect\.cafe', r'(?:www\.)?vid\.wxzm\.sx', r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?vid\.puffyan\.us', r'(?:www\.)?yewtu\.be', r'(?:www\.)?yt\.elukerio\.org', r'(?:www\.)?yt\.lelux\.fi', r'(?:www\.)?invidious\.ggc-project\.de', r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?ytprivate\.com', r'(?:www\.)?invidious\.13ad\.de', r'(?:www\.)?invidious\.toot\.koeln', r'(?:www\.)?invidious\.fdn\.fr', From ea87ed8394127c4bf824688b8780eaf5a804e7a3 Mon Sep 17 00:00:00 2001 From: zraktvor <=> Date: Sat, 10 Apr 2021 15:11:35 +0200 Subject: [PATCH 156/242] [youtube:tab] Detect series playlist on playlists page (closes #28723) --- youtube_dl/extractor/youtube.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7fa9b473a..581687d96 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2019,6 +2019,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'title': 'Игорь Клейнер - Playlists', 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', }, + }, { + # playlists, series + 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Playlists', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + }, }, { # playlists, singlepage 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', @@ -2311,7 +2320,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_grid_item_renderer(item): - for item_kind in ('Playlist', 'Video', 'Channel'): + for item_kind in ('Playlist', 'Video', 'Channel', 'Show'): renderer = item.get('grid%sRenderer' % item_kind) if renderer: return renderer @@ -2344,6 +2353,19 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) + # show + if playlist_id is None: # needs to check for playlist_id, or non-series playlists are recognized twice + show_playlist_url = try_get( + renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str) + if show_playlist_url: + playlist_id = self._search_regex(r'/playlist\?list=([0-9a-zA-Z-_]+)', show_playlist_url, + 'playlist id', default=None) + if playlist_id: + title = try_get(renderer, lambda x: x['title']['simpleText'], compat_str) + yield self.url_result( + "https://www.youtube.com/playlist?list=%s" % playlist_id, + ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) def _shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') From 7c5239547928079513b65f62e4c84aea21ce76e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 01:05:44 +0700 Subject: [PATCH 157/242] [youtube:tab] Improve grid extraction (closes #28725) --- youtube_dl/extractor/youtube.py | 38 ++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 581687d96..b6945570f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2320,10 +2320,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): @staticmethod def _extract_grid_item_renderer(item): - for item_kind in ('Playlist', 'Video', 'Channel', 'Show'): - renderer = item.get('grid%sRenderer' % item_kind) - if renderer: - return renderer + assert isinstance(item, dict) + for key, renderer in item.items(): + if not key.startswith('grid') or not key.endswith('Renderer'): + continue + if not isinstance(renderer, dict): + continue + return renderer def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: @@ -2333,7 +2336,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if not isinstance(renderer, dict): continue title = try_get( - renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + renderer, (lambda x: x['title']['runs'][0]['text'], + lambda x: x['title']['simpleText']), compat_str) # playlist playlist_id = renderer.get('playlistId') if playlist_id: @@ -2341,10 +2345,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'https://www.youtube.com/playlist?list=%s' % playlist_id, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) + continue # video video_id = renderer.get('videoId') if video_id: yield self._extract_video(renderer) + continue # channel channel_id = renderer.get('channelId') if channel_id: @@ -2353,19 +2359,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): yield self.url_result( 'https://www.youtube.com/channel/%s' % channel_id, ie=YoutubeTabIE.ie_key(), video_title=title) - # show - if playlist_id is None: # needs to check for playlist_id, or non-series playlists are recognized twice - show_playlist_url = try_get( - renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str) - if show_playlist_url: - playlist_id = self._search_regex(r'/playlist\?list=([0-9a-zA-Z-_]+)', show_playlist_url, - 'playlist id', default=None) - if playlist_id: - title = try_get(renderer, lambda x: x['title']['simpleText'], compat_str) + continue + # generic endpoint URL support + ep_url = urljoin('https://www.youtube.com/', try_get( + renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], + compat_str)) + if ep_url: + for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE): + if ie.suitable(ep_url): yield self.url_result( - "https://www.youtube.com/playlist?list=%s" % playlist_id, - ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) + ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title) + break def _shelf_entries_from_content(self, shelf_renderer): content = shelf_renderer.get('content') From 54558e0baa4d62a94af105cd1d7f8abcbd16b468 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 02:27:54 +0700 Subject: [PATCH 158/242] [youtube] Improve stretch extraction and fix stretched ratio calculation (closes #28769) --- youtube_dl/extractor/youtube.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b6945570f..75751d5a6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -812,6 +812,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'skip': 'This video does not exist.', }, + { + # Video with incomplete 'yt:stretch=16:' + 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI', + 'only_matching': True, + }, { # Video licensed under Creative Commons 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', @@ -1717,13 +1722,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for m in re.finditer(self._meta_regex('og:video:tag'), webpage)] for keyword in keywords: if keyword.startswith('yt:stretch='): - w, h = keyword.split('=')[1].split(':') - w, h = int(w), int(h) - if w > 0 and h > 0: - ratio = w / h - for f in formats: - if f.get('vcodec') != 'none': - f['stretched_ratio'] = ratio + mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword) + if mobj: + # NB: float is intentional for forcing float division + w, h = (float(v) for v in mobj.groups()) + if w > 0 and h > 0: + ratio = w / h + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio + break thumbnails = [] for container in (video_details, microformat): From a00a7e0cad3308d999599bf17df5d3e6aba502d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:22:13 +0700 Subject: [PATCH 159/242] [utils] Add support for support for experimental HTTP response status code 308 Permanent Redirect (refs #27877, refs #28768) --- youtube_dl/utils.py | 62 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8e4d144c9..538cc2b63 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ import zlib from .compat import ( compat_HTMLParseError, compat_HTMLParser, + compat_HTTPError, compat_basestring, compat_chr, compat_cookiejar, @@ -2879,12 +2880,61 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - if sys.version_info[0] < 3: - def redirect_request(self, req, fp, code, msg, headers, newurl): - # On python 2 urlh.geturl() may sometimes return redirect URL - # as byte string instead of unicode. This workaround allows - # to force it always return unicode. - return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl)) + """YoutubeDL redirect handler + + The code is based on HTTPRedirectHandler implementation from CPython [1]. + + This redirect handler solves two issues: + - ensures redirect URL is always unicode under python 2 + - introduces support for experimental HTTP response status code + 308 Permanent Redirect [2] used by some sites [3] + + 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py + 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308 + 3. https://github.com/ytdl-org/youtube-dl/issues/28768 + """ + + http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302 + + def redirect_request(self, req, fp, code, msg, headers, newurl): + """Return a Request or None in response to a redirect. + + This is called by the http_error_30x methods when a + redirection response is received. If a redirection should + take place, return a new Request to allow http_error_30x to + perform the redirect. Otherwise, raise HTTPError if no-one + else should try to handle this url. Return None if you can't + but another Handler might. + """ + m = req.get_method() + if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") + or code in (301, 302, 303) and m == "POST")): + raise compat_HTTPError(req.full_url, code, msg, headers, fp) + # Strictly (according to RFC 2616), 301 or 302 in response to + # a POST MUST NOT cause a redirection without confirmation + # from the user (of urllib.request, in this case). In practice, + # essentially all clients do redirect in this case, so we do + # the same. + + # On python 2 urlh.geturl() may sometimes return redirect URL + # as byte string instead of unicode. This workaround allows + # to force it always return unicode. + if sys.version_info[0] < 3: + newurl = compat_str(newurl) + + # Be conciliant with URIs containing a space. This is mainly + # redundant with the more complete encoding done in http_error_302(), + # but it is kept for compatibility with other callers. + newurl = newurl.replace(' ', '%20') + + CONTENT_HEADERS = ("content-length", "content-type") + # NB: don't use dict comprehension for python 2.6 compatibility + newheaders = dict((k, v) for k, v in req.headers.items() + if k.lower() not in CONTENT_HEADERS) + return compat_urllib_request.Request(newurl, + headers=newheaders, + origin_req_host=req.origin_req_host, + unverifiable=True) def extract_timezone(date_str): From 30a3a4c70fdcad10ef1dc6c3402457a95fe1ae5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:23:47 +0700 Subject: [PATCH 160/242] [lbry] Add support for HLS videos (closes #27877, closes #28768) --- youtube_dl/extractor/lbry.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py index ae43d56ea..cfd6b8393 100644 --- a/youtube_dl/extractor/lbry.py +++ b/youtube_dl/extractor/lbry.py @@ -120,6 +120,26 @@ class LBRYIE(LBRYBaseIE): 'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212', 'vcodec': 'none', } + }, { + # HLS + 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', + 'md5': 'fc82f45ea54915b1495dd7cb5cc1289f', + 'info_dict': { + 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410', + 'ext': 'mp4', + 'title': 'PLANTS I WILL NEVER GROW AGAIN. THE BLACK LIST PLANTS FOR A CANADIAN GARDEN | Gardening in Canada 🍁', + 'description': 'md5:9c539c6a03fb843956de61a4d5288d5e', + 'timestamp': 1618254123, + 'upload_date': '20210412', + 'release_timestamp': 1618254002, + 'release_date': '20210412', + 'tags': list, + 'duration': 554, + 'channel': 'Gardening In Canada', + 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', + 'formats': 'mincount:3', + } }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, @@ -163,10 +183,18 @@ class LBRYIE(LBRYBaseIE): streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] info = self._parse_stream(result, url) + urlh = self._request_webpage( + streaming_url, display_id, note='Downloading streaming redirect url info') + if determine_ext(urlh.geturl()) == 'm3u8': + info['formats'] = self._extract_m3u8_formats( + urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(info['formats']) + else: + info['url'] = streaming_url info.update({ 'id': claim_id, 'title': title, - 'url': streaming_url, }) return info From cfee2dfe83c5593d46bd0c8e8ce6a3d8c6e42db7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:32:04 +0700 Subject: [PATCH 161/242] [utils] PEP 8 --- youtube_dl/utils.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 538cc2b63..e722eed58 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2908,7 +2908,7 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): """ m = req.get_method() if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") - or code in (301, 302, 303) and m == "POST")): + or code in (301, 302, 303) and m == "POST")): raise compat_HTTPError(req.full_url, code, msg, headers, fp) # Strictly (according to RFC 2616), 301 or 302 in response to # a POST MUST NOT cause a redirection without confirmation @@ -2930,11 +2930,10 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): CONTENT_HEADERS = ("content-length", "content-type") # NB: don't use dict comprehension for python 2.6 compatibility newheaders = dict((k, v) for k, v in req.headers.items() - if k.lower() not in CONTENT_HEADERS) - return compat_urllib_request.Request(newurl, - headers=newheaders, - origin_req_host=req.origin_req_host, - unverifiable=True) + if k.lower() not in CONTENT_HEADERS) + return compat_urllib_request.Request( + newurl, headers=newheaders, origin_req_host=req.origin_req_host, + unverifiable=True) def extract_timezone(date_str): From f20b505b46e6654464635ca4afb0f37e1f14c57b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:47:00 +0700 Subject: [PATCH 162/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 22b4fa67d..c249412e2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Core ++ [utils] Add support for experimental HTTP response status code + 308 Permanent Redirect (#27877, #28768) + +Extractors ++ [lbry] Add support for HLS videos (#27877, #28768) +* [youtube] Fix stretched ratio calculation +* [youtube] Improve stretch extraction (#28769) +* [youtube:tab] Improve grid extraction (#28725) ++ [youtube:tab] Detect series playlist on playlists page (#28723) ++ [youtube] Add more invidious instances (#28706) +* [pluralsight] Extend anti-throttling timeout (#28712) +* [youtube] Improve URL to extractor routing (#27572, #28335, #28742) ++ [maoritv] Add support for maoritelevision.com (#24552) ++ [youtube:tab] Pass innertube context and x-goog-visitor-id header along with + continuation requests (#28702) +* [mtv] Fix Viacom A/B Testing Video Player extraction (#28703) ++ [pornhub] Extract DASH and HLS formats from get_media end point (#28698) +* [cbssports] Fix extraction (#28682) +* [jamendo] Fix track extraction (#28686) +* [curiositystream] Fix format extraction (#26845, #28668) + + version 2021.04.07 Core From 596b26606cfe20aa9f776ac0658b6bfb1ea95397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 03:50:09 +0700 Subject: [PATCH 163/242] release 2021.04.17 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 5 ++++- youtube_dl/version.py | 2 +- 8 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index febbd2344..e2a89d5c2 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.07 + [debug] youtube-dl version 2021.04.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index d7296d0a9..4d7abd775 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 92e616a1a..d8dce6fd4 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index b55739f6c..d95ee291a 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.07 + [debug] youtube-dl version 2021.04.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index dbdb8356a..ac5dd2f27 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.07** +- [ ] I've verified that I'm running youtube-dl version **2021.04.17** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index c249412e2..45d5c2ebf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.17 Core + [utils] Add support for experimental HTTP response status code diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ff9177a2c..a23da1a31 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -3,6 +3,7 @@ - **20min** - **220.ro** - **23video** + - **247sports** - **24video** - **3qsdn**: 3Q SDN - **3sat** @@ -160,7 +161,8 @@ - **cbsnews**: CBS News - **cbsnews:embed** - **cbsnews:livevideo**: CBS News Live Videos - - **CBSSports** + - **cbssports** + - **cbssports:embed** - **CCMA** - **CCTV**: 央视网 - **CDA** @@ -490,6 +492,7 @@ - **mangomolo:live** - **mangomolo:video** - **ManyVids** + - **MaoriTV** - **Markiza** - **MarkizaPage** - **massengeschmack.tv** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a6b1b8dce..2b041d593 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.04.07' +__version__ = '2021.04.17' From 9f6c03a00602eb1119e43a522cf50682f6d6a6dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Apr 2021 05:05:31 +0700 Subject: [PATCH 164/242] [cbsnews] Fix extraction for python <3.6 (closes #23359) --- youtube_dl/extractor/cbsnews.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 345debcf0..1285ed65e 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -26,7 +26,7 @@ class CBSNewsEmbedIE(CBSIE): def _real_extract(self, url): item = self._parse_json(zlib.decompress(compat_b64decode( compat_urllib_parse_unquote(self._match_id(url))), - -zlib.MAX_WBITS), None)['video']['items'][0] + -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0] return self._extract_video_info(item['mpxRefId'], 'cbsnews') From 41920fc80e4fe4a8996aeb31a04826a5a2534814 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 20 Apr 2021 20:51:55 +0100 Subject: [PATCH 165/242] [bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774) --- youtube_dl/extractor/bbc.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index e8d000bbb..71ea25881 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -11,6 +11,7 @@ from ..compat import ( compat_etree_Element, compat_HTTPError, compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -25,8 +26,10 @@ from ..utils import ( js_to_json, parse_duration, parse_iso8601, + strip_or_none, try_get, unescapeHTML, + unified_timestamp, url_or_none, urlencode_postdata, urljoin, @@ -761,8 +764,17 @@ class BBCIE(BBCCoUkIE): 'only_matching': True, }, { # custom redirection to www.bbc.com + # also, video with window.__INITIAL_DATA__ 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', - 'only_matching': True, + 'info_dict': { + 'id': 'p02xzws1', + 'ext': 'mp4', + 'title': "Pluto may have 'nitrogen glaciers'", + 'description': "Pluto could have glaciers of nitrogen ice, new photographs from Nasa's New Horizons probe suggest.", + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1437785037, + 'upload_date': '20150725', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', @@ -1164,12 +1176,23 @@ class BBCIE(BBCCoUkIE): continue formats, subtitles = self._download_media_selector(item_id) self._sort_formats(formats) + item_desc = try_get( + media, + lambda x: x['summary']['blocks'][0]['model']['text'], + compat_str) + item_time = None + for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: + if try_get(meta, lambda x: x['label']) == 'Published': + item_time = unified_timestamp(meta.get('timestamp')) + break entries.append({ 'id': item_id, 'title': item_title, 'thumbnail': item.get('holdingImageUrl'), 'formats': formats, 'subtitles': subtitles, + 'timestamp': item_time, + 'description': strip_or_none(item_desc), }) for resp in (initial_data.get('data') or {}).values(): name = resp.get('name') From dab83a25972e0dbcc69583bf78d2a992f581563d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 03:00:56 +0700 Subject: [PATCH 166/242] [bbc] Extract full description from __INITIAL_DATA__ (refs #28774) --- youtube_dl/extractor/bbc.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 71ea25881..247d982ce 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -770,7 +770,7 @@ class BBCIE(BBCCoUkIE): 'id': 'p02xzws1', 'ext': 'mp4', 'title': "Pluto may have 'nitrogen glaciers'", - 'description': "Pluto could have glaciers of nitrogen ice, new photographs from Nasa's New Horizons probe suggest.", + 'description': 'md5:6a95b593f528d7a5f2605221bc56912f', 'thumbnail': r're:https?://.+/.+\.jpg', 'timestamp': 1437785037, 'upload_date': '20150725', @@ -1176,10 +1176,16 @@ class BBCIE(BBCCoUkIE): continue formats, subtitles = self._download_media_selector(item_id) self._sort_formats(formats) - item_desc = try_get( - media, - lambda x: x['summary']['blocks'][0]['model']['text'], - compat_str) + item_desc = None + blocks = try_get(media, lambda x: x['summary']['blocks'], list) + if blocks: + summary = [] + for block in blocks: + text = try_get(block, lambda x: x['model']['text'], compat_str) + if text: + summary.append(text) + if summary: + item_desc = '\n\n'.join(summary) item_time = None for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: if try_get(meta, lambda x: x['label']) == 'Published': From 32290307a45260885b2210aa3c2a57e64abf8c34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 03:56:04 +0700 Subject: [PATCH 167/242] [youtube] Fix lazy extractors (closes #28780) --- youtube_dl/extractor/youtube.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 75751d5a6..c16dc7ab8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1219,6 +1219,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs qs = parse_qs(url) if qs.get('list', [None])[0]: return False @@ -2910,6 +2913,9 @@ class YoutubePlaylistIE(InfoExtractor): def suitable(cls, url): if YoutubeTabIE.suitable(url): return False + # Hack for lazy extractors until more generic solution is implemented + # (see #28780) + from .youtube import parse_qs qs = parse_qs(url) if qs.get('v', [None])[0]: return False From 5ad69d3d0e7d1d8d15a1f2497d602b1b91fcc74a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 04:45:13 +0700 Subject: [PATCH 168/242] [test_youtube_misc] Move YoutubeIE.extract_id test into separate module --- test/test_all_urls.py | 9 --------- test/test_youtube_misc.py | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 9 deletions(-) create mode 100644 test/test_youtube_misc.py diff --git a/test/test_all_urls.py b/test/test_all_urls.py index df6d81b5d..365b66bad 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -70,15 +70,6 @@ class TestAllURLsMatching(unittest.TestCase): # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) - def test_youtube_extract(self): - assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) - assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') - assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') - assertExtractId('BaW_jenozKc', 'BaW_jenozKc') - def test_facebook_matching(self): self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268')) self.assertTrue(FacebookIE.suitable('https://www.facebook.com/cindyweather?fref=ts#!/photo.php?v=10152183998945793')) diff --git a/test/test_youtube_misc.py b/test/test_youtube_misc.py new file mode 100644 index 000000000..e18e71101 --- /dev/null +++ b/test/test_youtube_misc.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from youtube_dl.extractor import YoutubeIE + + +class TestYoutubeMisc(unittest.TestCase): + def test_youtube_extract(self): + assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) + assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') + assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') + assertExtractId('BaW_jenozKc', 'BaW_jenozKc') + + +if __name__ == '__main__': + unittest.main() From c4a451bcdd2b743fdb96fcbae261c86ed91022ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 04:47:29 +0700 Subject: [PATCH 169/242] [test_execution] Add test for lazy extractors (refs #28780) --- test/test_execution.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_execution.py b/test/test_execution.py index 11661bb68..32948d93e 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -39,6 +39,16 @@ class TestExecution(unittest.TestCase): _, stderr = p.communicate() self.assertFalse(stderr) + def test_lazy_extractors(self): + try: + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) + finally: + try: + os.remove('youtube_dl/extractor/lazy_extractors.py') + except (IOError, OSError): + pass + if __name__ == '__main__': unittest.main() From ac19c3ac8035fbf9369fd4bd336c9045d4eeafa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 05:35:39 +0700 Subject: [PATCH 170/242] [go] Improve video id extraction (closes #25207, closes #25216, closes #26058) --- youtube_dl/extractor/go.py | 46 +++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 0d731e90a..878ba14e6 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -4,10 +4,12 @@ from __future__ import unicode_literals import re from .adobepass import AdobePassIE +from ..compat import compat_str from ..utils import ( int_or_none, determine_ext, parse_age_limit, + try_get, urlencode_postdata, ExtractorError, ) @@ -116,6 +118,18 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', + 'info_dict': { + 'id': 'VDKA22600213', + 'ext': 'mp4', + 'title': 'Pilot', + 'description': 'md5:74306df917cfc199d76d061d66bebdb4', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, @@ -149,14 +163,30 @@ class GoIE(AdobePassIE): brand = site_info.get('brand') if not video_id or not site_info: webpage = self._download_webpage(url, display_id or video_id) - video_id = self._search_regex( - ( - # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" - # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*(VDKA\w+)', - # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet - r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' - ), webpage, 'video id', default=video_id) + data = self._parse_json( + self._search_regex( + r'["\']__abc_com__["\']\s*\]\s*=\s*({.+?})\s*;', webpage, + 'data', default='{}'), + display_id or video_id, fatal=False) + # https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot + layout = try_get(data, lambda x: x['page']['content']['video']['layout'], dict) + video_id = None + if layout: + video_id = try_get( + layout, + (lambda x: x['videoid'], lambda x: x['video']['id']), + compat_str) + if not video_id: + video_id = self._search_regex( + ( + # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*(VDKA\w+)', + # page.analytics.videoIdCode + r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)', + # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet + r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' + ), webpage, 'video id', default=video_id) if not site_info: brand = self._search_regex( (r'data-brand=\s*["\']\s*(\d+)', From 7e8b3f9439ebefb3a3a4e5da9c0bd2b595976438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Apr 2021 05:37:51 +0700 Subject: [PATCH 171/242] [youtube] Remove unused code --- youtube_dl/extractor/youtube.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c16dc7ab8..0c52e5a8b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -65,11 +65,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)' - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - def _login(self): """ Attempt to log in to YouTube. From 0db79d8181c1c3ebf74b9b6d38262c8dcaaf0f4f Mon Sep 17 00:00:00 2001 From: The Hatsune Daishi Date: Sat, 24 Apr 2021 20:58:03 +0900 Subject: [PATCH 172/242] [tver] Redirect all downloads to Brightcove (#28849) --- youtube_dl/extractor/tver.py | 37 +++++++++++------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py index a54f49319..a4a30b1e6 100644 --- a/youtube_dl/extractor/tver.py +++ b/youtube_dl/extractor/tver.py @@ -9,7 +9,6 @@ from ..utils import ( int_or_none, remove_start, smuggle_url, - strip_or_none, try_get, ) @@ -45,32 +44,18 @@ class TVerIE(InfoExtractor): query={'token': self._TOKEN})['main'] p_id = main['publisher_id'] service = remove_start(main['service'], 'ts_') - info = { + + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}) + + return { '_type': 'url_transparent', 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + 'url': bc_url, + 'ie_key': 'BrightcoveNew', } - - if service == 'cx': - title = main['title'] - subtitle = strip_or_none(main.get('subtitle')) - if subtitle: - title += ' - ' + subtitle - info.update({ - 'title': title, - 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), - 'ie_key': 'FujiTVFODPlus7', - }) - else: - r_id = main['reference_id'] - if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): - r_id = 'ref:' + r_id - bc_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), - {'geo_countries': ['JP']}) - info.update({ - 'url': bc_url, - 'ie_key': 'BrightcoveNew', - }) - - return info From c6ab79299034492c5af9dd29b0c49585e4efc4cd Mon Sep 17 00:00:00 2001 From: catboy <79282513+catboy-oss@users.noreply.github.com> Date: Sat, 24 Apr 2021 12:10:35 +0000 Subject: [PATCH 173/242] [medaltv] Fix extraction (#28807) numeric clip ids are no longer used by medal, and integer user ids are now sent as strings. --- youtube_dl/extractor/medaltv.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/medaltv.py b/youtube_dl/extractor/medaltv.py index 1603b55f6..ef2283dea 100644 --- a/youtube_dl/extractor/medaltv.py +++ b/youtube_dl/extractor/medaltv.py @@ -15,32 +15,32 @@ from ..utils import ( class MedalTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'url': 'https://medal.tv/clips/2mA60jWAGQCBH', 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', 'info_dict': { - 'id': '34934644', + 'id': '2mA60jWAGQCBH', 'ext': 'mp4', 'title': 'Quad Cold', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'MowgliSB', 'timestamp': 1603165266, 'upload_date': '20201020', - 'uploader_id': 10619174, + 'uploader_id': '10619174', } }, { - 'url': 'https://medal.tv/clips/36787208', + 'url': 'https://medal.tv/clips/2um24TWdty0NA', 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', 'info_dict': { - 'id': '36787208', + 'id': '2um24TWdty0NA', 'ext': 'mp4', 'title': 'u tk me i tk u bigger', 'description': 'Medal,https://medal.tv/desktop/', 'uploader': 'Mimicc', 'timestamp': 1605580939, 'upload_date': '20201117', - 'uploader_id': 5156321, + 'uploader_id': '5156321', } }] From 999329cf6b29878d054c6ccdd24573489a2886d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 24 Apr 2021 23:52:16 +0700 Subject: [PATCH 174/242] [workflows/ci.yml] Fix install nose for Jython --- .github/workflows/ci.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a9dc47a71..97b50aedc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,7 +53,14 @@ jobs: java -jar jython-installer.jar -s -d "$HOME/jython" echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose + if: ${{ matrix.python-impl != 'jython' }} run: pip install nose + - name: Install nose (Jython) + if: ${{ matrix.python-impl == 'jython' }} + # Working around deprecation of support for non-SNI clients at PyPI CDN (see https://status.python.org/incidents/hzmjhqsdjqgb) + run: | + wget https://files.pythonhosted.org/packages/99/4f/13fb671119e65c4dce97c60e67d3fd9e6f7f809f2b307e2611f4701205cb/nose-1.3.7-py2-none-any.whl + pip install nose-1.3.7-py2-none-any.whl - name: Run tests continue-on-error: ${{ matrix.ytdl-test-set == 'download' || matrix.python-impl == 'jython' }} env: From 57eaaff5cf1ae63d4e3ae89f301c0f9b3e86bb55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 25 Apr 2021 22:52:28 +0700 Subject: [PATCH 175/242] [francetvinfo] Improve video id extraction (closes #28792) --- youtube_dl/extractor/francetv.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 7cc88bf18..e4ec2e200 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -383,6 +383,10 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): }, { 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', 'only_matching': True, + }, { + # "
]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', - r'data-id=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'), + r'(?:data-id| Date: Sun, 25 Apr 2021 19:32:47 +0200 Subject: [PATCH 176/242] [xfileshare] Add support for wolfstream.tv (#28858) --- youtube_dl/extractor/xfileshare.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index cbd5d1cbb..df9efa9fa 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -58,6 +58,7 @@ class XFileShareIE(InfoExtractor): (r'vidlocker\.xyz', 'VidLocker'), (r'vidshare\.tv', 'VidShare'), (r'vup\.to', 'VUp'), + (r'wolfstream\.tv', 'WolfStream'), (r'xvideosharing\.com', 'XVideoSharing'), ) @@ -82,6 +83,9 @@ class XFileShareIE(InfoExtractor): }, { 'url': 'https://aparat.cam/n4d6dh0wvlpr', 'only_matching': True, + }, { + 'url': 'https://wolfstream.tv/nthme29v9u2x', + 'only_matching': True, }] @staticmethod From 346dd3b5e87503c52fc6800d9f73cd6bdbce71bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 01:29:50 +0700 Subject: [PATCH 177/242] [ChangeLog] Actualize [ci skip] --- ChangeLog | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ChangeLog b/ChangeLog index 45d5c2ebf..c59984d63 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +version + +Extractors ++ [xfileshare] Add support for wolfstream.tv (#28858) +* [francetvinfo] Improve video id extraction (#28792) +* [medaltv] Fix extraction (#28807) +* [tver] Redirect all downloads to Brightcove (#28849) +* [go] Improve video id extraction (#25207, #25216, #26058) +* [youtube] Fix lazy extractors (#28780) ++ [bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774) +* [cbsnews] Fix extraction for python <3.6 (#23359) + + version 2021.04.17 Core From 273964d190fb048477e71114c4734fcb819c5c16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 01:33:30 +0700 Subject: [PATCH 178/242] release 2021.04.26 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index e2a89d5c2..6ece3e031 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.17 + [debug] youtube-dl version 2021.04.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 4d7abd775..f923b2d5f 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index d8dce6fd4..97d605653 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index d95ee291a..73a806833 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2021.04.17 + [debug] youtube-dl version 2021.04.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index ac5dd2f27..ee19a75f5 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2021.04.17** +- [ ] I've verified that I'm running youtube-dl version **2021.04.26** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index c59984d63..f15c84225 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2021.04.26 Extractors + [xfileshare] Add support for wolfstream.tv (#28858) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a23da1a31..88d474de4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1162,7 +1162,7 @@ - **WWE** - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing + - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, WolfStream, XVideoSharing - **XHamster** - **XHamsterEmbed** - **XHamsterUser** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2b041d593..576f721db 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.04.17' +__version__ = '2021.04.26' From 94520568b399d5b4f35a9708f5643d8b16c6c4ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 26 Apr 2021 02:16:47 +0700 Subject: [PATCH 179/242] [workflows/ci.yml] Update link to jython-installer --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 97b50aedc..90bd63c32 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,7 +49,7 @@ jobs: - name: Install Jython if: ${{ matrix.python-impl == 'jython' }} run: | - wget http://search.maven.org/remotecontent?filepath=org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar + wget https://repo1.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar -O jython-installer.jar java -jar jython-installer.jar -s -d "$HOME/jython" echo "$HOME/jython/bin" >> $GITHUB_PATH - name: Install nose From e33dfb445c547f210a7060e8b7abd592dbe42808 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 17:53:27 +0700 Subject: [PATCH 180/242] [tv2dk] Fix extraction (closes #28888) --- youtube_dl/extractor/tv2dk.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py index 8bda9348d..8bd5fd640 100644 --- a/youtube_dl/extractor/tv2dk.py +++ b/youtube_dl/extractor/tv2dk.py @@ -74,6 +74,12 @@ class TV2DKIE(InfoExtractor): webpage = self._download_webpage(url, video_id) entries = [] + + def add_entry(partner_id, kaltura_id): + entries.append(self.url_result( + 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', + video_id=kaltura_id)) + for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage): video = extract_attributes(video_el) kaltura_id = video.get('data-entryid') @@ -82,9 +88,14 @@ class TV2DKIE(InfoExtractor): partner_id = video.get('data-partnerid') if not partner_id: continue - entries.append(self.url_result( - 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura', - video_id=kaltura_id)) + add_entry(partner_id, kaltura_id) + if not entries: + kaltura_id = self._search_regex( + r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id') + partner_id = self._search_regex( + (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage, + 'partner id') + add_entry(partner_id, kaltura_id) return self.playlist_result(entries) From d2f72c40db0d1fe1102c98c017682b283579ad97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 18:09:32 +0700 Subject: [PATCH 181/242] [svtplay] Improve extraction (closes #28507, closes #28876) --- youtube_dl/extractor/svt.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index aba9bb447..a5bb6daa7 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -146,7 +146,7 @@ class SVTPlayIE(SVTPlayBaseIE): ) (?P[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P[^/?#&]+) - (?:.*?modalId=(?P[\da-zA-Z-]+))? + (?:.*?(?:modalId|id)=(?P[\da-zA-Z-]+))? ) ''' _TESTS = [{ @@ -177,6 +177,9 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA', 'only_matching': True, + }, { + 'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa', + 'only_matching': True, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -259,7 +262,7 @@ class SVTPlayIE(SVTPlayBaseIE): if not svt_id: svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', - r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\bmodalId=([\da-zA-Z-]+)' % re.escape(video_id), + r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id), r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', From ff04d43c469e4cf8c14ba3e2e79da0d35ef3c7db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 18:33:05 +0700 Subject: [PATCH 182/242] [xtube] Fix formats extraction (closes #28870) --- youtube_dl/extractor/xtube.py | 51 ++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 18969058f..7246409e3 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -11,6 +11,7 @@ from ..utils import ( parse_duration, sanitized_Request, str_to_int, + url_or_none, ) @@ -87,10 +88,10 @@ class XTubeIE(InfoExtractor): 'Cookie': 'age_verified=1; cookiesAccepted=1', }) - title, thumbnail, duration = [None] * 3 + title, thumbnail, duration, sources, media_definition = [None] * 5 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') @@ -99,20 +100,52 @@ class XTubeIE(InfoExtractor): thumbnail = config.get('poster') duration = int_or_none(config.get('duration')) sources = config.get('sources') or config.get('format') + media_definition = config.get('mediaDefinition') - if not isinstance(sources, dict): + if not isinstance(sources, dict) and not media_definition: sources = self._parse_json(self._search_regex( r'(["\'])?sources\1?\s*:\s*(?P{.+?}),', webpage, 'sources', group='sources'), video_id, transform_source=js_to_json) formats = [] - for format_id, format_url in sources.items(): - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'height': int_or_none(format_id), - }) + format_urls = set() + + if isinstance(sources, dict): + for format_id, format_url in sources.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + + if isinstance(media_definition, list): + for media in media_definition: + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + if video_url in format_urls: + continue + format_urls.add(video_url) + format_id = media.get('format') + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif format_id == 'mp4': + height = int_or_none(media.get('quality')) + formats.append({ + 'url': video_url, + 'format_id': '%s-%d' % (format_id, height) if height else format_id, + 'height': height, + }) + self._remove_duplicate_formats(formats) self._sort_formats(formats) From d1b9a5e2eff1c075b38815a3d2b25eb8b3f626bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 19:00:39 +0700 Subject: [PATCH 183/242] [twitter] Improve formats extraction from vmap URL (closes #28909) --- youtube_dl/extractor/twitter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index ed495f297..cfa7a7326 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -19,6 +19,7 @@ from ..utils import ( strip_or_none, unified_timestamp, update_url_query, + url_or_none, xpath_text, ) @@ -52,6 +53,9 @@ class TwitterBaseIE(InfoExtractor): return [f] def _extract_formats_from_vmap_url(self, vmap_url, video_id): + vmap_url = url_or_none(vmap_url) + if not vmap_url: + return [] vmap_data = self._download_xml(vmap_url, video_id) formats = [] urls = [] From a0df8a06178e530a1097f177a1faf1d2c609ac99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 May 2021 22:53:30 +0700 Subject: [PATCH 184/242] [cda] Improve extraction (closes #28709, closes #28937) --- youtube_dl/extractor/cda.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 1b4362144..e1b391937 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -133,6 +133,8 @@ class CDAIE(InfoExtractor): 'age_limit': 18 if need_confirm_age else 0, } + info = self._search_json_ld(webpage, video_id, default={}) + # Source: https://www.cda.pl/js/player.js?t=1606154898 def decrypt_file(a): for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): @@ -197,7 +199,7 @@ class CDAIE(InfoExtractor): handler = self._download_webpage webpage = handler( - self._BASE_URL + href, video_id, + urljoin(self._BASE_URL, href), video_id, 'Downloading %s version information' % resolution, fatal=False) if not webpage: # Manually report warning because empty page is returned when @@ -209,6 +211,4 @@ class CDAIE(InfoExtractor): self._sort_formats(formats) - info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(info_dict, info) From 0204838163bd4068fe23b40414573d1307d817ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 01:57:02 +0700 Subject: [PATCH 185/242] [kaltura] Make embed code alternatives actually work --- youtube_dl/extractor/kaltura.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 49d13460d..5d0ff0418 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -120,7 +120,7 @@ class KalturaIE(InfoExtractor): def _extract_urls(webpage): # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site finditer = ( - re.finditer( + list(re.finditer( r"""(?xs) kWidget\.(?:thumb)?[Ee]mbed\( \{.*? @@ -128,8 +128,8 @@ class KalturaIE(InfoExtractor): (?P['"])_?(?P(?:(?!(?P=q2)).)+)(?P=q2),.*? (?P['"])entry_?[Ii]d(?P=q3)\s*:\s* (?P['"])(?P(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) - """, webpage) - or re.finditer( + """, webpage)) + or list(re.finditer( r'''(?xs) (?P["']) (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+)(?:(?!(?P=q1)).)* @@ -142,8 +142,8 @@ class KalturaIE(InfoExtractor): \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) - ''', webpage) - or re.finditer( + ''', webpage)) + or list(re.finditer( r'''(?xs) <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) @@ -151,7 +151,7 @@ class KalturaIE(InfoExtractor): [?&;]entry_id=(?P(?:(?!(?P=q1))[^&])+) (?:(?!(?P=q1)).)* (?P=q1) - ''', webpage) + ''', webpage)) ) urls = [] for mobj in finditer: From fe05191b8c59538a48b6cbc95f4fe54fc7e6a0ac Mon Sep 17 00:00:00 2001 From: Ben Rog-Wilhelm Date: Tue, 4 May 2021 14:14:35 -0500 Subject: [PATCH 186/242] [kaltura] Improve iframe extraction (#28969) Co-authored-by: Sergey M. --- youtube_dl/extractor/gdcvault.py | 15 +++++++++++++++ youtube_dl/extractor/kaltura.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 2f555c1d4..5ad40ee23 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -102,6 +102,21 @@ class GDCVaultIE(InfoExtractor): 'format': 'mp4-408', }, }, + { + # Kaltura embed, whitespace between quote and embedded URL in iframe's src + 'url': 'https://www.gdcvault.com/play/1025699', + 'info_dict': { + 'id': '0_zagynv0a', + 'ext': 'mp4', + 'title': 'Tech Toolbox', + 'upload_date': '20190408', + 'uploader_id': 'joe@blazestreaming.com', + 'timestamp': 1554764629, + }, + 'params': { + 'skip_download': True, + }, + }, ] def _login(self, webpage_url, display_id): diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 5d0ff0418..c731612c4 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -145,7 +145,7 @@ class KalturaIE(InfoExtractor): ''', webpage)) or list(re.finditer( r'''(?xs) - <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) + <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["'])\s* (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) (?:(?!(?P=q1)).)* [?&;]entry_id=(?P(?:(?!(?P=q1))[^&])+) From b8645c1f5885522ec8bb77649f49ce842e947c25 Mon Sep 17 00:00:00 2001 From: Ben Rog-Wilhelm Date: Sat, 17 Apr 2021 23:15:10 -0500 Subject: [PATCH 187/242] [dispeak] Improve FLV extraction (closes #13513) --- youtube_dl/extractor/dispeak.py | 50 ++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index c345e0274..e776ac00c 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -32,6 +32,14 @@ class DigitallySpeakingIE(InfoExtractor): # From http://www.gdcvault.com/play/1013700/Advanced-Material 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', 'only_matching': True, + }, { + # From https://gdcvault.com/play/1016624 + 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', + 'info_dict': { + 'id': '201210-822101_1349794556671DDDD', + 'ext': 'flv', + 'title': 'Pre-launch - Preparing to Take the Plunge', + }, }] def _parse_mp4(self, metadata): @@ -84,26 +92,28 @@ class DigitallySpeakingIE(InfoExtractor): 'vcodec': 'none', 'format_id': audio.get('code'), }) - slide_video_path = xpath_text(metadata, './slideVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'preference': -2, - 'format_id': 'slides', - }) - speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(speaker_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'speaker video', - 'quality': -1, - 'preference': -1, - 'format_id': 'speaker', - }) + slide_video_path = xpath_text(metadata, './slideVideo') + if slide_video_path: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(slide_video_path, '.flv'), + 'ext': 'flv', + 'format_note': 'slide deck video', + 'quality': -2, + 'preference': -2, + 'format_id': 'slides', + }) + speaker_video_path = xpath_text(metadata, './speakerVideo') + if speaker_video_path: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(speaker_video_path, '.flv'), + 'ext': 'flv', + 'format_note': 'speaker video', + 'quality': -1, + 'preference': -1, + 'format_id': 'speaker', + }) return formats def _real_extract(self, url): From 1786cd3fe4e555b83bdd3eea77ade3477293330d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 02:30:42 +0700 Subject: [PATCH 188/242] [dispeak] DRY and update tests (closes #28970) --- youtube_dl/extractor/dispeak.py | 34 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index e776ac00c..276fd4b09 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -33,13 +33,17 @@ class DigitallySpeakingIE(InfoExtractor): 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', 'only_matching': True, }, { - # From https://gdcvault.com/play/1016624 + # From https://gdcvault.com/play/1016624, empty speakerVideo 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', 'info_dict': { 'id': '201210-822101_1349794556671DDDD', 'ext': 'flv', 'title': 'Pre-launch - Preparing to Take the Plunge', }, + }, { + # From http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru, empty slideVideo + 'url': 'http://events.digitallyspeaking.com/gdc/project25/xml/p25-miyamoto1999_1282467389849HSVB.xml', + 'only_matching': True, }] def _parse_mp4(self, metadata): @@ -92,27 +96,19 @@ class DigitallySpeakingIE(InfoExtractor): 'vcodec': 'none', 'format_id': audio.get('code'), }) - slide_video_path = xpath_text(metadata, './slideVideo') - if slide_video_path: - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'preference': -2, - 'format_id': 'slides', - }) - speaker_video_path = xpath_text(metadata, './speakerVideo') - if speaker_video_path: + for video_key, format_id, preference in ( + ('slide', 'slides', -2), ('speaker', 'speaker', -1)): + video_path = xpath_text(metadata, './%sVideo' % video_key) + if not video_path: + continue formats.append({ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(speaker_video_path, '.flv'), + 'play_path': remove_end(video_path, '.flv'), 'ext': 'flv', - 'format_note': 'speaker video', - 'quality': -1, - 'preference': -1, - 'format_id': 'speaker', + 'format_note': '%s video' % video_key, + 'quality': preference, + 'preference': preference, + 'format_id': format_id, }) return formats From 504e4d804df0ee666d80ba6796017cf97e026c0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 5 May 2021 02:44:29 +0700 Subject: [PATCH 189/242] [gdcvault] Add support for HTML5 videos --- youtube_dl/extractor/gdcvault.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 5ad40ee23..acc6478b8 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from .kaltura import KalturaIE from ..utils import ( HEADRequest, + remove_start, sanitized_Request, smuggle_url, urlencode_postdata, @@ -117,6 +118,11 @@ class GDCVaultIE(InfoExtractor): 'skip_download': True, }, }, + { + # HTML5 video + 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru', + 'only_matching': True, + }, ] def _login(self, webpage_url, display_id): @@ -190,7 +196,18 @@ class GDCVaultIE(InfoExtractor): xml_name = self._html_search_regex( r'