From b14b33a2e9c35d41a1fd16cf53afae612392bf44 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 31 Jan 2022 04:28:54 +0000 Subject: [PATCH 01/15] [YouTube] Bypass age-gating for certain restricted videos * Use TVHTML5_SIMPLY_EMBEDDED_PLAYER client * Also add and fix tests * Introduce and use new utility function `update_url()` --- youtube_dl/extractor/youtube.py | 202 +++++++++++++++++++++++++------- youtube_dl/utils.py | 11 ++ 2 files changed, 168 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 28fdb086a..65428528d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -42,6 +42,7 @@ from ..utils import ( unescapeHTML, unified_strdate, unsmuggle_url, + update_url, update_url_query, url_or_none, urlencode_postdata, @@ -286,15 +287,18 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|= 18): + + self.report_age_confirmation() + + # Thanks: https://github.com/yt-dlp/yt-dlp/pull/3233 + pb_context = {'html5Preference': 'HTML5_PREF_WANTS'} + query = { + 'playbackContext': {'contentPlaybackContext': {'html5Preference': 'HTML5_PREF_WANTS'}}, + 'contentCheckOk': True, + 'racyCheckOk': True, + 'context': { + 'client': {'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', 'clientVersion': '2.0', 'hl': 'en', 'clientScreen': 'EMBED'}, + 'thirdParty': {'embedUrl': 'https://google.com'}, + }, + 'videoId': video_id, + } + headers = { + 'X-YouTube-Client-Name': '85', + 'X-YouTube-Client-Version': '2.0', + 'Origin': 'https://www.youtube.com' + } + + video_info = self._call_api('player', query, video_id, fatal=False, headers=headers) + age_gate_status = get_playability_status(video_info) + if age_gate_status.get('status') == 'OK': + player_response = video_info + playability_status = age_gate_status trailer_video_id = try_get( playability_status, @@ -1932,12 +2048,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for thumbnail in (try_get( container, lambda x: x['thumbnail']['thumbnails'], list) or []): - thumbnail_url = thumbnail.get('url') + thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue thumbnails.append({ 'height': int_or_none(thumbnail.get('height')), - 'url': thumbnail_url, + 'url': update_url(thumbnail_url, query=None, fragment=None), 'width': int_or_none(thumbnail.get('width')), }) if thumbnails: @@ -2142,6 +2258,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sbr_tooltip = try_get( vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) if sbr_tooltip: + # however dislike_count was hidden by YT, as if there could ever be dislikable content on YT like_count, dislike_count = sbr_tooltip.split(' / ') info.update({ 'like_count': str_to_int(like_count), @@ -2411,7 +2528,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'tags': list, 'view_count': int, 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -2438,7 +2554,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'categories': ['News & Politics'], 'tags': list, 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -2458,7 +2573,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): 'categories': ['News & Politics'], 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], 'like_count': int, - 'dislike_count': int, }, 'params': { 'skip_download': True, @@ -3043,8 +3157,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): def _real_extract(self, url): item_id = self._match_id(url) - url = compat_urlparse.urlunparse( - compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + url = update_url(url, netloc='www.youtube.com') # Handle both video/playlist URLs qs = parse_qs(url) video_id = qs.get('v', [None])[0] @@ -3178,7 +3291,6 @@ class YoutubeYtBeIE(InfoExtractor): 'categories': ['Nonprofits & Activism'], 'tags': list, 'like_count': int, - 'dislike_count': int, }, 'params': { 'noplaylist': True, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e3c3ccff9..d5cc6386d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4121,6 +4121,17 @@ def update_url_query(url, query): query=compat_urllib_parse_urlencode(qs, True))) +def update_url(url, **kwargs): + """Replace URL components specified by kwargs + url: compat_str or parsed URL tuple + returns: compat_str""" + if not kwargs: + return compat_urlparse.urlunparse(url) if isinstance(url, tuple) else url + if not isinstance(url, tuple): + url = compat_urlparse.urlparse(url) + return compat_urlparse.urlunparse(url._replace(**kwargs)) + + def update_Request(req, url=None, data=None, headers={}, query={}): req_headers = req.headers.copy() req_headers.update(headers) From 2be0cd261601e5be431723b95cf0d838621760a9 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 20 Jun 2022 23:15:20 +0100 Subject: [PATCH 02/15] [YouTube] Add `signatureTimestamp` for age-gate bypass --- youtube_dl/extractor/youtube.py | 34 +++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 65428528d..6c1cfe7f2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1642,6 +1642,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): fmt['url'] = compat_urlparse.urlunparse( parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + # from yt-dlp, with tweaks + def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): + """ + Extract signatureTimestamp (sts) + Required to tell API what sig/player version is in use. + """ + sts = int_or_none(ytcfg.get('STS')) if isinstance(ytcfg, dict) else None + if not sts: + # Attempt to extract from player + if player_url is None: + error_msg = 'Cannot extract signature timestamp without player_url.' + if fatal: + raise ExtractorError(error_msg) + self._downloader.report_warning(error_msg) + return + code = self._get_player_code(video_id, player_url) + sts = int_or_none(self._search_regex( + r'(?:signatureTimestamp|sts)\s*:\s*(?P[0-9]{5})', code or '', + 'JS player signature timestamp', group='sts', fatal=fatal)) + return sts + def _mark_watched(self, video_id, player_response): playback_url = url_or_none(try_get( player_response, @@ -1766,6 +1787,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) player_response = None + player_url = None if webpage: player_response = self._extract_yt_initial_variable( webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, @@ -1799,8 +1821,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Thanks: https://github.com/yt-dlp/yt-dlp/pull/3233 pb_context = {'html5Preference': 'HTML5_PREF_WANTS'} + + # Use signatureTimestamp if available + # Thanks https://github.com/ytdl-org/youtube-dl/issues/31034#issuecomment-1160718026 + player_url = self._extract_player_url(webpage) + ytcfg = self._extract_ytcfg(video_id, webpage) + sts = self._extract_signature_timestamp(video_id, player_url, ytcfg) + if sts: + pb_context['signatureTimestamp'] = sts + query = { - 'playbackContext': {'contentPlaybackContext': {'html5Preference': 'HTML5_PREF_WANTS'}}, + 'playbackContext': {'contentPlaybackContext': pb_context}, 'contentCheckOk': True, 'racyCheckOk': True, 'context': { @@ -1901,7 +1932,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): formats = [] itags = [] itag_qualities = {} - player_url = None q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres']) streaming_data = player_response.get('streamingData') or {} streaming_formats = streaming_data.get('formats') or [] From ce81ae38463f81eb3ee4dd6f68063146ec22b819 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sat, 4 Feb 2023 23:18:24 +0000 Subject: [PATCH 03/15] [test] Fix TestAgeRestriction * age restriction may cause DownloadError * update obsolete test URLs [skip ci] --- test/test_age_restriction.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index 6f5513faa..db98494ab 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -11,6 +11,7 @@ from test.helper import try_rm from youtube_dl import YoutubeDL +from youtube_dl.utils import DownloadError def _download_restricted(url, filename, age): @@ -26,7 +27,10 @@ def _download_restricted(url, filename, age): ydl.add_default_info_extractors() json_filename = os.path.splitext(filename)[0] + '.info.json' try_rm(json_filename) - ydl.download([url]) + try: + ydl.download([url]) + except DownloadError: + try_rm(json_filename) res = os.path.exists(json_filename) try_rm(json_filename) return res @@ -38,12 +42,12 @@ class TestAgeRestriction(unittest.TestCase): self.assertFalse(_download_restricted(url, filename, age)) def test_youtube(self): - self._assert_restricted('07FYdnEawAQ', '07FYdnEawAQ.mp4', 10) + self._assert_restricted('HtVdAasjOgU', 'HtVdAasjOgU.mp4', 10) def test_youporn(self): self._assert_restricted( - 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', - '505835.mp4', 2, old_age=25) + 'https://www.youporn.com/watch/16715086/sex-ed-in-detention-18-asmr/', + '16715086.mp4', 2, old_age=25) if __name__ == '__main__': From 9ca224b69708cc32010edea19b73892197e32a95 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 5 Feb 2023 15:43:43 +0000 Subject: [PATCH 04/15] [compat] Systematise compat_ naming [skip ci] --- test/test_compat.py | 3 +- youtube_dl/compat.py | 221 +++++++++++++++++++++++++++---------------- 2 files changed, 139 insertions(+), 85 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 0986cff37..4dddd9a38 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -48,7 +48,8 @@ class TestCompat(unittest.TestCase): def test_all_present(self): import youtube_dl.compat - all_names = youtube_dl.compat.__all__ + all_names = sorted( + youtube_dl.compat.__all__ + youtube_dl.compat.legacy) present_names = set(filter( lambda c: '_' in c and not c.startswith('_'), dir(youtube_dl.compat))) - set(['unicode_literals']) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 28942a8c1..39551f810 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -21,6 +21,10 @@ import subprocess import sys import xml.etree.ElementTree +# naming convention +# 'compat_' + Python3_name.replace('.', '_') +# other aliases exist for convenience and/or legacy + # deal with critical unicode/str things first try: # Python 2 @@ -28,6 +32,7 @@ try: unicode, basestring, unichr ) from .casefold import casefold as compat_casefold + except NameError: compat_str, compat_basestring, compat_chr = ( str, str, chr @@ -53,16 +58,15 @@ try: import urllib.parse as compat_urllib_parse except ImportError: # Python 2 import urllib as compat_urllib_parse + import urlparse as _urlparse + for a in dir(_urlparse): + if not hasattr(compat_urllib_parse, a): + setattr(compat_urllib_parse, a, getattr(_urlparse, a)) + del _urlparse -try: - from urllib.parse import urlparse as compat_urllib_parse_urlparse -except ImportError: # Python 2 - from urlparse import urlparse as compat_urllib_parse_urlparse - -try: - import urllib.parse as compat_urlparse -except ImportError: # Python 2 - import urlparse as compat_urlparse +# unfavoured aliases +compat_urlparse = compat_urllib_parse +compat_urllib_parse_urlparse = compat_urllib_parse.urlparse try: import urllib.response as compat_urllib_response @@ -73,6 +77,7 @@ try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 import cookielib as compat_cookiejar +compat_http_cookiejar = compat_cookiejar if sys.version_info[0] == 2: class compat_cookiejar_Cookie(compat_cookiejar.Cookie): @@ -84,11 +89,13 @@ if sys.version_info[0] == 2: compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs) else: compat_cookiejar_Cookie = compat_cookiejar.Cookie +compat_http_cookiejar_Cookie = compat_cookiejar_Cookie try: import http.cookies as compat_cookies except ImportError: # Python 2 import Cookie as compat_cookies +compat_http_cookies = compat_cookies if sys.version_info[0] == 2: class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): @@ -98,6 +105,7 @@ if sys.version_info[0] == 2: return super(compat_cookies_SimpleCookie, self).load(rawdata) else: compat_cookies_SimpleCookie = compat_cookies.SimpleCookie +compat_http_cookies_SimpleCookie = compat_cookies_SimpleCookie try: import html.entities as compat_html_entities @@ -2351,16 +2359,19 @@ try: from urllib.error import HTTPError as compat_HTTPError except ImportError: # Python 2 from urllib2 import HTTPError as compat_HTTPError +compat_urllib_HTTPError = compat_HTTPError try: from urllib.request import urlretrieve as compat_urlretrieve except ImportError: # Python 2 from urllib import urlretrieve as compat_urlretrieve +compat_urllib_request_urlretrieve = compat_urlretrieve try: from html.parser import HTMLParser as compat_HTMLParser except ImportError: # Python 2 from HTMLParser import HTMLParser as compat_HTMLParser +compat_html_parser_HTMLParser = compat_HTMLParser try: # Python 2 from HTMLParser import HTMLParseError as compat_HTMLParseError @@ -2374,6 +2385,7 @@ except ImportError: # Python <3.4 # and uniform cross-version exception handling class compat_HTMLParseError(Exception): pass +compat_html_parser_HTMLParseError = compat_HTMLParseError try: from subprocess import DEVNULL @@ -2390,6 +2402,8 @@ try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus + from urllib.parse import urlencode as compat_urllib_parse_urlencode + from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') else re.compile(r'([\x00-\x7f]+)')) @@ -2456,9 +2470,6 @@ except ImportError: # Python 2 string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) -try: - from urllib.parse import urlencode as compat_urllib_parse_urlencode -except ImportError: # Python 2 # Python 2 will choke in urlencode on mixture of byte and unicode strings. # Possible solutions are to either port it from python 3 with all # the friends or manually ensure input query contains only byte strings. @@ -2480,7 +2491,62 @@ except ImportError: # Python 2 def encode_list(l): return [encode_elem(e) for e in l] - return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq) + return compat_urllib_parse._urlencode(encode_elem(query), doseq=doseq) + + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + qs, _coerce_result = qs, compat_str + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value and not strict_parsing: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + if strict_parsing: + raise ValueError('bad query field: %r' % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = compat_urllib_parse_unquote( + name, encoding=encoding, errors=errors) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = compat_urllib_parse_unquote( + value, encoding=encoding, errors=errors) + value = _coerce_result(value) + r.append((name, value)) + return r + + def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + parsed_result = {} + pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, + encoding=encoding, errors=errors) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result + + setattr(compat_urllib_parse, '_urlencode', + getattr(compat_urllib_parse, 'urlencode')) + for name, fix in ( + ('unquote_to_bytes', compat_urllib_parse_unquote_to_bytes), + ('parse_unquote', compat_urllib_parse_unquote), + ('unquote_plus', compat_urllib_parse_unquote_plus), + ('urlencode', compat_urllib_parse_urlencode), + ('parse_qs', compat_parse_qs)): + setattr(compat_urllib_parse, name, fix) + +compat_urllib_parse_parse_qs = compat_parse_qs try: from urllib.request import DataHandler as compat_urllib_request_DataHandler @@ -2520,6 +2586,7 @@ try: from xml.etree.ElementTree import ParseError as compat_xml_parse_error except ImportError: # Python 2.6 from xml.parsers.expat import ExpatError as compat_xml_parse_error +compat_xml_etree_ElementTree_ParseError = compat_xml_parse_error etree = xml.etree.ElementTree @@ -2533,10 +2600,11 @@ try: # xml.etree.ElementTree.Element is a method in Python <=2.6 and # the following will crash with: # TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types - isinstance(None, xml.etree.ElementTree.Element) + isinstance(None, etree.Element) from xml.etree.ElementTree import Element as compat_etree_Element except TypeError: # Python <=2.6 from xml.etree.ElementTree import _ElementInterface as compat_etree_Element +compat_xml_etree_ElementTree_Element = compat_etree_Element if sys.version_info[0] >= 3: def compat_etree_fromstring(text): @@ -2592,6 +2660,7 @@ else: if k == uri or v == prefix: del etree._namespace_map[k] etree._namespace_map[uri] = prefix +compat_xml_etree_register_namespace = compat_etree_register_namespace if sys.version_info < (2, 7): # Here comes the crazy part: In 2.6, if the xpath is a unicode, @@ -2603,53 +2672,6 @@ if sys.version_info < (2, 7): else: compat_xpath = lambda xpath: xpath -try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. - # Python 2's version is apparently totally broken - - def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - qs, _coerce_result = qs, compat_str - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: - raise ValueError('bad query field: %r' % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = compat_urllib_parse_unquote( - name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = compat_urllib_parse_unquote( - value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) - return r - - def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - parsed_result = {} - pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, - encoding=encoding, errors=errors) - for name, value in pairs: - if name in parsed_result: - parsed_result[name].append(value) - else: - parsed_result[name] = [value] - return parsed_result - compat_os_name = os._name if os.name == 'java' else os.name @@ -2774,6 +2796,8 @@ else: else: compat_expanduser = os.path.expanduser +compat_os_path_expanduser = compat_expanduser + if compat_os_name == 'nt' and sys.version_info < (3, 8): # os.path.realpath on Windows does not follow symbolic links @@ -2785,6 +2809,8 @@ if compat_os_name == 'nt' and sys.version_info < (3, 8): else: compat_realpath = os.path.realpath +compat_os_path_realpath = compat_realpath + if sys.version_info < (3, 0): def compat_print(s): @@ -2805,11 +2831,15 @@ if sys.version_info < (3, 0) and sys.platform == 'win32': else: compat_getpass = getpass.getpass +compat_getpass_getpass = compat_getpass + + try: compat_input = raw_input except NameError: # Python 3 compat_input = input + # Python < 2.6.5 require kwargs to be bytes try: def _testfunc(x): @@ -2915,15 +2945,16 @@ else: lines = _lines return _terminal_size(columns, lines) + try: itertools.count(start=0, step=1) compat_itertools_count = itertools.count except TypeError: # Python 2.6 def compat_itertools_count(start=0, step=1): - n = start while True: - yield n - n += step + yield start + start += step + if sys.version_info >= (3, 0): from tokenize import tokenize as compat_tokenize_tokenize @@ -3075,6 +3106,8 @@ if sys.version_info < (3, 3): else: compat_b64decode = base64.b64decode +compat_base64_b64decode = compat_b64decode + if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): # PyPy2 prior to version 5.4.0 expects byte strings as Windows function @@ -3094,30 +3127,53 @@ else: return ctypes.WINFUNCTYPE(*args, **kwargs) -__all__ = [ +legacy = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', - 'compat_Struct', 'compat_b64decode', - 'compat_basestring', - 'compat_casefold', - 'compat_chr', - 'compat_collections_abc', - 'compat_collections_chain_map', 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', 'compat_cookies_SimpleCookie', - 'compat_ctypes_WINFUNCTYPE', 'compat_etree_Element', - 'compat_etree_fromstring', 'compat_etree_register_namespace', 'compat_expanduser', + 'compat_getpass', + 'compat_parse_qs', + 'compat_realpath', + 'compat_urllib_parse_parse_qs', + 'compat_urllib_parse_unquote', + 'compat_urllib_parse_unquote_plus', + 'compat_urllib_parse_unquote_to_bytes', + 'compat_urllib_parse_urlencode', + 'compat_urllib_parse_urlparse', + 'compat_urlparse', + 'compat_urlretrieve', + 'compat_xml_parse_error', +] + + +__all__ = [ + 'compat_html_parser_HTMLParseError', + 'compat_html_parser_HTMLParser', + 'compat_Struct', + 'compat_base64_b64decode', + 'compat_basestring', + 'compat_casefold', + 'compat_chr', + 'compat_collections_abc', + 'compat_collections_chain_map', + 'compat_http_cookiejar', + 'compat_http_cookiejar_Cookie', + 'compat_http_cookies', + 'compat_http_cookies_SimpleCookie', + 'compat_ctypes_WINFUNCTYPE', + 'compat_etree_fromstring', 'compat_filter', 'compat_get_terminal_size', 'compat_getenv', - 'compat_getpass', + 'compat_getpass_getpass', 'compat_html_entities', 'compat_html_entities_html5', 'compat_http_client', @@ -3131,11 +3187,11 @@ __all__ = [ 'compat_numeric_types', 'compat_ord', 'compat_os_name', - 'compat_parse_qs', + 'compat_os_path_expanduser', + 'compat_os_path_realpath', 'compat_print', 'compat_re_Match', 'compat_re_Pattern', - 'compat_realpath', 'compat_setenv', 'compat_shlex_quote', 'compat_shlex_split', @@ -3147,17 +3203,14 @@ __all__ = [ 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', - 'compat_urllib_parse_unquote', - 'compat_urllib_parse_unquote_plus', - 'compat_urllib_parse_unquote_to_bytes', - 'compat_urllib_parse_urlencode', - 'compat_urllib_parse_urlparse', 'compat_urllib_request', 'compat_urllib_request_DataHandler', 'compat_urllib_response', - 'compat_urlparse', - 'compat_urlretrieve', - 'compat_xml_parse_error', + 'compat_urllib_request_urlretrieve', + 'compat_urllib_HTTPError', + 'compat_xml_etree_ElementTree_Element', + 'compat_xml_etree_ElementTree_ParseError', + 'compat_xml_etree_register_namespace', 'compat_xpath', 'compat_zip', 'workaround_optparse_bug9161', From a6f7d10d441d25bcfeba9e63244dbef7d52d3163 Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 5 Feb 2023 13:46:43 +0000 Subject: [PATCH 05/15] [utils] Add parse_qs, update_url [skip ci] --- youtube_dl/utils.py | 64 ++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d5cc6386d..4edbfa27b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -42,6 +42,7 @@ from .compat import ( compat_HTMLParser, compat_HTTPError, compat_basestring, + compat_casefold, compat_chr, compat_collections_abc, compat_cookiejar, @@ -54,18 +55,18 @@ from .compat import ( compat_integer_types, compat_kwargs, compat_os_name, - compat_parse_qs, + compat_re_Match, compat_shlex_quote, compat_str, compat_struct_pack, compat_struct_unpack, compat_urllib_error, compat_urllib_parse, + compat_urllib_parse_parse_qs as compat_parse_qs, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urllib_parse_unquote_plus, compat_urllib_request, - compat_urlparse, compat_xpath, ) @@ -80,12 +81,12 @@ def register_socks_protocols(): # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 # URLs with protocols not in urlparse.uses_netloc are not handled correctly for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): - if scheme not in compat_urlparse.uses_netloc: - compat_urlparse.uses_netloc.append(scheme) + if scheme not in compat_urllib_parse.uses_netloc: + compat_urllib_parse.uses_netloc.append(scheme) -# This is not clearly defined otherwise -compiled_regex_type = type(re.compile('')) +# Unfavoured alias +compiled_regex_type = compat_re_Match def random_user_agent(): @@ -2725,7 +2726,7 @@ def make_socks_conn_class(base_class, socks_proxy): assert issubclass(base_class, ( compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) - url_components = compat_urlparse.urlparse(socks_proxy) + url_components = compat_urllib_parse.urlparse(socks_proxy) if url_components.scheme.lower() == 'socks5': socks_type = ProxyType.SOCKS5 elif url_components.scheme.lower() in ('socks', 'socks4'): @@ -3673,7 +3674,7 @@ def remove_quotes(s): def url_basename(url): - path = compat_urlparse.urlparse(url).path + path = compat_urllib_parse.urlparse(url).path return path.strip('/').split('/')[-1] @@ -3693,7 +3694,7 @@ def urljoin(base, path): if not isinstance(base, compat_str) or not re.match( r'^(?:https?:)?//', base): return None - return compat_urlparse.urljoin(base, path) + return compat_urllib_parse.urljoin(base, path) class HEADRequest(compat_urllib_request.Request): @@ -4091,6 +4092,10 @@ def escape_url(url): ).geturl() +def parse_qs(url): + return compat_parse_qs(compat_urllib_parse.urlparse(url).query) + + def read_batch_urls(batch_fd): def fixup(url): if not isinstance(url, compat_str): @@ -4111,25 +4116,28 @@ def urlencode_postdata(*args, **kargs): return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') -def update_url_query(url, query): - if not query: - return url - parsed_url = compat_urlparse.urlparse(url) - qs = compat_parse_qs(parsed_url.query) - qs.update(query) - return compat_urlparse.urlunparse(parsed_url._replace( - query=compat_urllib_parse_urlencode(qs, True))) - - def update_url(url, **kwargs): """Replace URL components specified by kwargs url: compat_str or parsed URL tuple - returns: compat_str""" + if query_update is in kwargs, update query with + its value instead of replacing (overrides any `query`) + returns: compat_str + """ if not kwargs: - return compat_urlparse.urlunparse(url) if isinstance(url, tuple) else url + return compat_urllib_parse.urlunparse(url) if isinstance(url, tuple) else url if not isinstance(url, tuple): - url = compat_urlparse.urlparse(url) - return compat_urlparse.urlunparse(url._replace(**kwargs)) + url = compat_urllib_parse.urlparse(url) + query = kwargs.pop('query_update', None) + if query: + qs = compat_parse_qs(url.query) + qs.update(query) + kwargs['query'] = compat_urllib_parse_urlencode(qs, True) + kwargs = compat_kwargs(kwargs) + return compat_urllib_parse.urlunparse(url._replace(**kwargs)) + + +def update_url_query(url, query): + return update_url(url, query_update=query) def update_Request(req, url=None, data=None, headers={}, query={}): @@ -5597,7 +5605,7 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): if proxy == '__noproxy__': return None # No Proxy - if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): + if compat_urllib_parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): req.add_header('Ytdl-socks-proxy', proxy) # youtube-dl's http/https handlers do wrapping the socket with socks return None @@ -6035,14 +6043,6 @@ def traverse_obj(obj, *paths, **kwargs): str = compat_str is_sequence = lambda x: isinstance(x, compat_collections_abc.Sequence) and not isinstance(x, (str, bytes)) - # stand-in until compat_re_Match is added - compat_re_Match = type(re.match('a', 'a')) - # stand-in until casefold.py is added - try: - ''.casefold() - compat_casefold = lambda s: s.casefold() - except AttributeError: - compat_casefold = lambda s: s.lower() casefold = lambda k: compat_casefold(k) if isinstance(k, str) else k if isinstance(expected_type, type): From b337af9c62fe17cb186788d550873726d4c7386f Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 6 Feb 2023 15:50:28 +0000 Subject: [PATCH 06/15] [compat] Update test_compat [skip ci] --- test/test_compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 4dddd9a38..e233b1ae1 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -50,9 +50,9 @@ class TestCompat(unittest.TestCase): import youtube_dl.compat all_names = sorted( youtube_dl.compat.__all__ + youtube_dl.compat.legacy) - present_names = set(filter( + present_names = set(map(compat_str, filter( lambda c: '_' in c and not c.startswith('_'), - dir(youtube_dl.compat))) - set(['unicode_literals']) + dir(youtube_dl.compat)))) - set(['unicode_literals']) self.assertEqual(all_names, sorted(present_names)) def test_compat_urllib_parse_unquote(self): From f640916de124206f9a8a397d44455ab917728f4a Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 6 Feb 2023 16:19:21 +0000 Subject: [PATCH 07/15] [YouTube] Refresh compat/utils usage * import parse_qs() * import parse_qs in lazy_extractors (clears old TODO) * clean up old compiled lazy_extractors for Py2 * use update_url() --- devscripts/make_lazy_extractors.py | 10 ++++- test/test_execution.py | 12 +++--- youtube_dl/extractor/youtube.py | 61 +++++++++++------------------- 3 files changed, 39 insertions(+), 44 deletions(-) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 878ae72b1..edc19183d 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -13,6 +13,11 @@ sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) lazy_extractors_filename = sys.argv[1] if os.path.exists(lazy_extractors_filename): os.remove(lazy_extractors_filename) +# Py2: may be confused by leftover lazy_extractors.pyc +try: + os.remove(lazy_extractors_filename + 'c') +except OSError: + pass from youtube_dl.extractor import _ALL_CLASSES from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor @@ -22,7 +27,10 @@ with open('devscripts/lazy_load_template.py', 'rt') as f: module_contents = [ module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', - 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] + 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n', + # needed for suitable() methods of Youtube extractor (see #28780) + 'from youtube_dl.utils import parse_qs\n', +] ie_template = ''' class {name}({bases}): diff --git a/test/test_execution.py b/test/test_execution.py index 32948d93e..704e14612 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -40,14 +40,16 @@ class TestExecution(unittest.TestCase): self.assertFalse(stderr) def test_lazy_extractors(self): + lazy_extractors = 'youtube_dl/extractor/lazy_extractors.py' try: - subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) + subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', lazy_extractors], cwd=rootDir, stdout=_DEV_NULL) subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) finally: - try: - os.remove('youtube_dl/extractor/lazy_extractors.py') - except (IOError, OSError): - pass + for x in ['', 'c'] if sys.version_info[0] < 3 else ['']: + try: + os.remove(lazy_extractors + x) + except (IOError, OSError): + pass if __name__ == '__main__': diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6c1cfe7f2..6c70a98d1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -14,12 +14,11 @@ from ..compat import ( compat_chr, compat_HTTPError, compat_map as map, - compat_parse_qs, compat_str, + compat_urllib_parse, + compat_urllib_parse_parse_qs as compat_parse_qs, compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, - compat_urlparse, ) from ..jsinterp import JSInterpreter from ..utils import ( @@ -33,6 +32,7 @@ from ..utils import ( mimetype2ext, parse_codecs, parse_duration, + parse_qs, qualities, remove_start, smuggle_url, @@ -50,10 +50,6 @@ from ..utils import ( ) -def parse_qs(url): - return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - - class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' @@ -636,6 +632,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'duration': 142, 'uploader': 'The Witcher', + 'uploader_id': 'WitcherGame', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', 'thumbnail': 'https://i.ytimg.com/vi/HtVdAasjOgU/maxresdefault.jpg', 'age_limit': 18, @@ -671,7 +669,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, }, { - 'note': 'Age-gated video embedable only with clientScreen=EMBED', + 'note': 'Age-gated video embeddable only with clientScreen=EMBED', 'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg', 'info_dict': { 'id': 'Tq92D6wQ1mg', @@ -1392,11 +1390,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs - qs = parse_qs(url) - if qs.get('list', [None])[0]: + if parse_qs(url).get('list', [None])[0]: return False return super(YoutubeIE, cls).suitable(url) @@ -1546,7 +1540,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url.startswith('//'): player_url = 'https:' + player_url elif not re.match(r'https?://', player_url): - player_url = compat_urlparse.urljoin( + player_url = compat_urllib_parse.urljoin( 'https://www.youtube.com', player_url) return player_url @@ -1628,9 +1622,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _unthrottle_format_urls(self, video_id, player_url, formats): for fmt in formats: - parsed_fmt_url = compat_urlparse.urlparse(fmt['url']) - qs = compat_urlparse.parse_qs(parsed_fmt_url.query) - n_param = qs.get('n') + parsed_fmt_url = compat_urllib_parse.urlparse(fmt['url']) + n_param = compat_parse_qs(parsed_fmt_url.query).get('n') if not n_param: continue n_param = n_param[-1] @@ -1638,9 +1631,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if n_response is None: # give up if descrambling failed break - qs['n'] = [n_response] - fmt['url'] = compat_urlparse.urlunparse( - parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + fmt['url'] = update_url( + parsed_fmt_url, query_update={'n': [n_response]}) # from yt-dlp, with tweaks def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): @@ -1669,20 +1661,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])) if not playback_url: return - parsed_playback_url = compat_urlparse.urlparse(playback_url) - qs = compat_urlparse.parse_qs(parsed_playback_url.query) # cpn generation algorithm is reverse engineered from base.js. # In fact it works even with dummy cpn. CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + playback_url = update_url( + playback_url, query_update={ + 'ver': ['2'], + 'cpn': [cpn], + }) self._download_webpage( playback_url, video_id, 'Marking watched', @@ -2075,9 +2064,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): thumbnails = [] for container in (video_details, microformat): - for thumbnail in (try_get( + for thumbnail in try_get( container, - lambda x: x['thumbnail']['thumbnails'], list) or []): + lambda x: x['thumbnail']['thumbnails'], list) or []: thumbnail_url = url_or_none(thumbnail.get('url')) if not thumbnail_url: continue @@ -3287,11 +3276,7 @@ class YoutubePlaylistIE(InfoExtractor): def suitable(cls, url): if YoutubeTabIE.suitable(url): return False - # Hack for lazy extractors until more generic solution is implemented - # (see #28780) - from .youtube import parse_qs - qs = parse_qs(url) - if qs.get('v', [None])[0]: + if parse_qs(url).get('v', [None])[0]: return False return super(YoutubePlaylistIE, cls).suitable(url) @@ -3430,9 +3415,9 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor): }] def _real_extract(self, url): - qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - query = (qs.get('search_query') or qs.get('q'))[0] - params = qs.get('sp', ('',))[0] + qs = parse_qs(url) + query = (qs.get('search_query') or qs.get('q'))[-1] + params = qs.get('sp', ('',))[-1] return self.playlist_result(self._search_results(query, params), query, query) From bcf597ea1736395bbb893af16a012f10e7a2b544 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 8 Feb 2023 18:16:51 +0000 Subject: [PATCH 08/15] [YouTube] Fix tests --- youtube_dl/extractor/youtube.py | 55 ++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6c70a98d1..ba0f5c8b6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,8 @@ from ..utils import ( dict_get, error_to_compat_str, float_or_none, + extract_attributes, + get_element_by_attribute, int_or_none, js_to_json, mimetype2ext, @@ -38,6 +40,7 @@ from ..utils import ( smuggle_url, str_or_none, str_to_int, + traverse_obj, try_get, unescapeHTML, unified_strdate, @@ -656,6 +659,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:bf77e03fcae5529475e500129b05668a', 'duration': 177, 'uploader': 'FlyingKitty', + 'uploader_id': 'FlyingKitty900', 'upload_date': '20200408', 'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg', 'age_limit': 18, @@ -678,6 +682,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:17eccca93a786d51bc67646756894066', 'duration': 106, 'uploader': 'Projekt Melody', + 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', 'upload_date': '20191227', 'age_limit': 18, 'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg', @@ -929,16 +934,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk - Position Music', + 'alt_title': 'Dark Walk', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'duration': 133, 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', - 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'track': 'Dark Walk - Position Music', - 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'creator': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan', + 'track': 'Dark Walk', + 'artist': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan', 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', }, 'params': { @@ -2091,7 +2096,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or microformat.get('lengthSeconds')) \ or parse_duration(search_meta('duration')) is_live = video_details.get('isLive') - owner_profile_url = microformat.get('ownerProfileUrl') + + def gen_owner_profile_url(): + yield microformat.get('ownerProfileUrl') + yield extract_attributes(self._search_regex( + r'''(?s)(]+\bitemprop\s*=\s*("|')url\2[^>]*>)''', + get_element_by_attribute('itemprop', 'author', webpage), + 'owner_profile_url', default='')).get('href') + + owner_profile_url = next( + (x for x in map(url_or_none, gen_owner_profile_url()) if x), + None) if not player_url: player_url = self._extract_player_url(webpage) @@ -2176,6 +2191,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info[d_k] = parse_duration(query[k][0]) if video_description: + # Youtube Music Auto-generated description mobj = re.search(r'(?s)(?P[^·\n]+)·(?P[^\n]+)\n+(?P[^\n]+)(?:.+?℗\s*(?P\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) if mobj: release_year = mobj.group('release_year') @@ -2250,7 +2266,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': info['location'] = stl else: - mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) + # •? doesn't match, but [•]? does; \xa0 = non-breaking space + mobj = re.search(r'([^\xa0\s].*?)[\xa0\s]*S(\d+)[\xa0\s]*[•]?[\xa0\s]*E(\d+)', stl) if mobj: info.update({ 'series': mobj.group(1), @@ -2261,7 +2278,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): vpir, lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], list) or []): - tbr = tlb.get('toggleButtonRenderer') or {} + tbr = traverse_obj(tlb, ('segmentedLikeDislikeButtonRenderer', 'likeButton', 'toggleButtonRenderer'), 'toggleButtonRenderer') or {} for getter, regex in [( lambda x: x['defaultText']['accessibility']['accessibilityData'], r'(?P[\d,]+)\s*(?P(?:dis)?like)'), ([ @@ -2315,6 +2332,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif mrr_title == 'Song': info['track'] = mrr_contents_text + # this is not extraction but spelunking! + carousel_lockups = traverse_obj( + initial_data, + ('engagementPanels', Ellipsis, 'engagementPanelSectionListRenderer', + 'content', 'structuredDescriptionContentRenderer', 'items', Ellipsis, + 'videoDescriptionMusicSectionRenderer', 'carouselLockups', Ellipsis), + expected_type=dict) or [] + # try to reproduce logic from metadataRowContainerRenderer above (if it still is) + fields = (('ALBUM', 'album'), ('ARTIST', 'artist'), ('SONG', 'track'), ('LICENSES', 'license')) + # multiple_songs ? + if len(carousel_lockups) > 1: + fields = fields[-1:] + for info_row in traverse_obj( + carousel_lockups, + (0, 'carouselLockupRenderer', 'infoRows', Ellipsis, 'infoRowRenderer'), + expected_type=dict): + row_title = traverse_obj(info_row, ('title', 'simpleText')) + row_text = traverse_obj(info_row, 'defaultMetadata', 'expandedMetadata', expected_type=get_text) + if not row_text: + continue + for name, field in fields: + if name == row_title and not info.get(field): + info[field] = row_text + for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: v = info.get(s_k) if v: From 80c02287773ab2c136e0e05d27917d0aa5f6d402 Mon Sep 17 00:00:00 2001 From: Valentin Metz <31850924+Valentin-Metz@users.noreply.github.com> Date: Thu, 9 Feb 2023 12:25:28 +0100 Subject: [PATCH 09/15] [rbgtum] Add new extractor (#31305) * [rbgtum] Add new extractor * Small update, force CI --------- Co-authored-by: dirkf --- youtube_dl/extractor/extractors.py | 4 ++ youtube_dl/extractor/rbgtum.py | 97 ++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 youtube_dl/extractor/rbgtum.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 96b27b179..dfaef0cc3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1010,6 +1010,10 @@ from .raywenderlich import ( RayWenderlichIE, RayWenderlichCourseIE, ) +from .rbgtum import ( + RbgTumIE, + RbgTumCourseIE, +) from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redbulltv import ( diff --git a/youtube_dl/extractor/rbgtum.py b/youtube_dl/extractor/rbgtum.py new file mode 100644 index 000000000..da48ebbc4 --- /dev/null +++ b/youtube_dl/extractor/rbgtum.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class RbgTumIE(InfoExtractor): + _VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P.+)' + _TESTS = [{ + # Combined view + 'url': 'https://live.rbg.tum.de/w/cpp/22128', + 'md5': '53a5e7b3e07128e33bbf36687fe1c08f', + 'info_dict': { + 'id': 'cpp/22128', + 'ext': 'mp4', + 'title': 'Lecture: October 18. 2022', + 'series': 'Concepts of C++ programming (IN2377)', + } + }, { + # Presentation only + 'url': 'https://live.rbg.tum.de/w/I2DL/12349/PRES', + 'md5': '36c584272179f3e56b0db5d880639cba', + 'info_dict': { + 'id': 'I2DL/12349/PRES', + 'ext': 'mp4', + 'title': 'Lecture 3: Introduction to Neural Networks', + 'series': 'Introduction to Deep Learning (IN2346)', + } + }, { + # Camera only + 'url': 'https://live.rbg.tum.de/w/fvv-info/16130/CAM', + 'md5': 'e04189d92ff2f56aedf5cede65d37aad', + 'info_dict': { + 'id': 'fvv-info/16130/CAM', + 'ext': 'mp4', + 'title': 'Fachschaftsvollversammlung', + 'series': 'Fachschaftsvollversammlung Informatik', + } + }, ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8') + lecture_title = self._html_search_regex(r'(?si)(.*)', webpage, 'title') + lecture_series_title = self._html_search_regex( + r'(?s)]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?', webpage, 'series') + + formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': lecture_title, + 'series': lecture_series_title, + 'formats': formats, + } + + +class RbgTumCourseIE(InfoExtractor): + _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P.+)' + _TESTS = [{ + 'url': 'https://live.rbg.tum.de/course/2022/S/fpv', + 'info_dict': { + 'title': 'Funktionale Programmierung und Verifikation (IN0003)', + 'id': '2022/S/fpv', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 13, + }, { + 'url': 'https://live.rbg.tum.de/course/2022/W/set', + 'info_dict': { + 'title': 'SET FSMPIC', + 'id': '2022/W/set', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 6, + }, ] + + def _real_extract(self, url): + course_id = self._match_id(url) + webpage = self._download_webpage(url, course_id) + + lecture_series_title = self._html_search_regex(r'(?si)(.*)', webpage, 'title') + + lecture_urls = [] + for lecture_url in re.findall(r'(?i)href="/w/(.+)(? Date: Fri, 10 Feb 2023 04:19:27 +0800 Subject: [PATCH 10/15] [feat]: Add support to external downloader aria2p (#31500) * feat: add class Aria2pFD * feat: create call_downloader function * feat: a colorful download interface to aria2pFD * feat: change value name * Apply suggestions from code review Co-authored-by: dirkf * Typo in suggestion * fix: remove unused value * fix: add not function to return value(0 is normal); add total_seconds to download.eta(timedelta object); add waiting status when hook progress * fix: remove unuse method ..utils.format_bytes * fix: be up to flake8 * fix: be up to flake8 * Apply suggestions from code review * [feat] test external downloader aria2p * [feat] test external downloader aria2p * [fix] test_external_downloader.py * Apply suggestions from code review Co-authored-by: dirkf * Apply suggestions from code review Co-authored-by: dirkf * Update test/test_external_downloader.py Co-authored-by: dirkf * Update test/test_external_downloader.py Co-authored-by: dirkf * Update youtube_dl/downloader/external.py Co-authored-by: dirkf * refactoring code and fix bugs * Apply suggestions from code review * Rename test_external_downloader.py to test_downloader_external.py --------- Co-authored-by: dirkf --- test/helper.py | 11 +++ test/test_downloader_external.py | 115 ++++++++++++++++++++++++++++++ test/test_downloader_http.py | 17 ++--- test/test_http.py | 16 ++--- youtube_dl/downloader/external.py | 58 +++++++++++++++ 5 files changed, 193 insertions(+), 24 deletions(-) create mode 100644 test/test_downloader_external.py diff --git a/test/helper.py b/test/helper.py index c6a2f0667..883b2e877 100644 --- a/test/helper.py +++ b/test/helper.py @@ -89,6 +89,17 @@ class FakeYDL(YoutubeDL): self.report_warning = types.MethodType(report_warning, self) +class FakeLogger(object): + def debug(self, msg): + pass + + def warning(self, msg): + pass + + def error(self, msg): + pass + + def gettestcases(include_onlymatching=False): for ie in youtube_dl.extractor.gen_extractors(): for tc in ie.get_testcases(include_onlymatching): diff --git a/test/test_downloader_external.py b/test/test_downloader_external.py new file mode 100644 index 000000000..c0239502b --- /dev/null +++ b/test/test_downloader_external.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# coding: utf-8 +from __future__ import unicode_literals + +# Allow direct execution +import os +import re +import sys +import subprocess +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import ( + FakeLogger, + http_server_port, + try_rm, +) +from youtube_dl import YoutubeDL +from youtube_dl.compat import compat_http_server +from youtube_dl.utils import encodeFilename +from youtube_dl.downloader.external import Aria2pFD +import threading + +TEST_DIR = os.path.dirname(os.path.abspath(__file__)) + + +TEST_SIZE = 10 * 1024 + + +class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def send_content_range(self, total=None): + range_header = self.headers.get('Range') + start = end = None + if range_header: + mobj = re.match(r'bytes=(\d+)-(\d+)', range_header) + if mobj: + start, end = (int(mobj.group(i)) for i in (1, 2)) + valid_range = start is not None and end is not None + if valid_range: + content_range = 'bytes %d-%d' % (start, end) + if total: + content_range += '/%d' % total + self.send_header('Content-Range', content_range) + return (end - start + 1) if valid_range else total + + def serve(self, range=True, content_length=True): + self.send_response(200) + self.send_header('Content-Type', 'video/mp4') + size = TEST_SIZE + if range: + size = self.send_content_range(TEST_SIZE) + if content_length: + self.send_header('Content-Length', size) + self.end_headers() + self.wfile.write(b'#' * size) + + def do_GET(self): + if self.path == '/regular': + self.serve() + elif self.path == '/no-content-length': + self.serve(content_length=False) + elif self.path == '/no-range': + self.serve(range=False) + elif self.path == '/no-range-no-content-length': + self.serve(range=False, content_length=False) + else: + assert False, 'unrecognised server path' + + +@unittest.skipUnless(Aria2pFD.available(), 'aria2p module not found') +class TestAria2pFD(unittest.TestCase): + def setUp(self): + self.httpd = compat_http_server.HTTPServer( + ('127.0.0.1', 0), HTTPTestRequestHandler) + self.port = http_server_port(self.httpd) + self.server_thread = threading.Thread(target=self.httpd.serve_forever) + self.server_thread.daemon = True + self.server_thread.start() + + def download(self, params, ep): + with subprocess.Popen( + ['aria2c', '--enable-rpc'], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL + ) as process: + if not process.poll(): + filename = 'testfile.mp4' + params['logger'] = FakeLogger() + params['outtmpl'] = filename + ydl = YoutubeDL(params) + try_rm(encodeFilename(filename)) + self.assertEqual(ydl.download(['http://127.0.0.1:%d/%s' % (self.port, ep)]), 0) + self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE) + try_rm(encodeFilename(filename)) + process.kill() + + def download_all(self, params): + for ep in ('regular', 'no-content-length', 'no-range', 'no-range-no-content-length'): + self.download(params, ep) + + def test_regular(self): + self.download_all({'external_downloader': 'aria2p'}) + + def test_chunked(self): + self.download_all({ + 'external_downloader': 'aria2p', + 'http_chunk_size': 1000, + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index 750472281..4e6d7a2a0 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -9,7 +9,11 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import http_server_port, try_rm +from test.helper import ( + FakeLogger, + http_server_port, + try_rm, +) from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server from youtube_dl.downloader.http import HttpFD @@ -66,17 +70,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): assert False -class FakeLogger(object): - def debug(self, msg): - pass - - def warning(self, msg): - pass - - def error(self, msg): - pass - - class TestHttpFD(unittest.TestCase): def setUp(self): self.httpd = compat_http_server.HTTPServer( diff --git a/test/test_http.py b/test/test_http.py index 3ee0a5dda..487a9bc77 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,7 +8,10 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import http_server_port +from test.helper import ( + FakeLogger, + http_server_port, +) from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server, compat_urllib_request import ssl @@ -52,17 +55,6 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): assert False -class FakeLogger(object): - def debug(self, msg): - pass - - def warning(self, msg): - pass - - def error(self, msg): - pass - - class TestHTTP(unittest.TestCase): def setUp(self): self.httpd = compat_http_server.HTTPServer( diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index a06ab2e50..bffcd10b6 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -200,6 +200,64 @@ class Aria2cFD(ExternalFD): return cmd +class Aria2pFD(ExternalFD): + ''' Aria2pFD class + This class support to use aria2p as downloader. + (Aria2p, a command-line tool and Python library to interact with an aria2c daemon process + through JSON-RPC.) + It can help you to get download progress more easily. + To use aria2p as downloader, you need to install aria2c and aria2p, aria2p can download with pip. + Then run aria2c in the background and enable with the --enable-rpc option. + ''' + try: + import aria2p + __avail = True + except ImportError: + __avail = False + + @classmethod + def available(cls): + return cls.__avail + + def _call_downloader(self, tmpfilename, info_dict): + aria2 = self.aria2p.API( + self.aria2p.Client( + host='http://localhost', + port=6800, + secret='' + ) + ) + + options = { + 'min-split-size': '1M', + 'max-connection-per-server': 4, + 'auto-file-renaming': 'false', + } + options['dir'] = os.path.dirname(tmpfilename) or os.path.abspath('.') + options['out'] = os.path.basename(tmpfilename) + options['header'] = [] + for key, val in info_dict['http_headers'].items(): + options['header'].append('{0}: {1}'.format(key, val)) + download = aria2.add_uris([info_dict['url']], options) + status = { + 'status': 'downloading', + 'tmpfilename': tmpfilename, + } + started = time.time() + while download.status in ['active', 'waiting']: + download = aria2.get_download(download.gid) + status.update({ + 'downloaded_bytes': download.completed_length, + 'total_bytes': download.total_length, + 'elapsed': time.time() - started, + 'eta': download.eta.total_seconds(), + 'speed': download.download_speed, + }) + self._hook_progress(status) + time.sleep(.5) + return download.status != 'complete' + + class HttpieFD(ExternalFD): @classmethod def available(cls): From cf5fb7e13972f508758a11490a46112cc0ee4891 Mon Sep 17 00:00:00 2001 From: fonkap Date: Sat, 11 Feb 2023 03:37:45 +0100 Subject: [PATCH 11/15] [FileMoonIE] Add extractor for filemoon.sx (#31515) --------- Co-authored-by: dirkf --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/filemoon.py | 43 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 youtube_dl/extractor/filemoon.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dfaef0cc3..f63a2e030 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -376,6 +376,7 @@ from .fc2 import ( FC2EmbedIE, ) from .fczenit import FczenitIE +from .filemoon import FileMoonIE from .fifa import FifaIE from .filmon import ( FilmOnIE, diff --git a/youtube_dl/extractor/filemoon.py b/youtube_dl/extractor/filemoon.py new file mode 100644 index 000000000..654df9b69 --- /dev/null +++ b/youtube_dl/extractor/filemoon.py @@ -0,0 +1,43 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + js_to_json, +) + + +class FileMoonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?filemoon\.sx/./(?P\w+)' + _TEST = { + 'url': 'https://filemoon.sx/e/dw40rxrzruqz', + 'md5': '5a713742f57ac4aef29b74733e8dda01', + 'info_dict': { + 'id': 'dw40rxrzruqz', + 'title': 'dw40rxrzruqz', + 'ext': 'mp4' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + matches = re.findall(r'(?s)(eval.*?)', webpage) + packed = matches[-1] + unpacked = decode_packed_codes(packed) + jwplayer_sources = self._parse_json( + self._search_regex( + r'(?s)player\s*\.\s*setup\s*\(\s*\{\s*sources\s*:\s*(.*?])', unpacked, 'jwplayer sources'), + video_id, transform_source=js_to_json) + + formats = self._parse_jwplayer_formats(jwplayer_sources, video_id) + + return { + 'id': video_id, + 'title': self._generic_title(url) or video_id, + 'formats': formats + } From 99c3cf6419aac3d781aa3b82202c07802ec6160a Mon Sep 17 00:00:00 2001 From: fonkap Date: Sat, 11 Feb 2023 03:47:43 +0100 Subject: [PATCH 12/15] [KommunetvIE] Add extractor for kommunetv.no (#31516) * Add extractor for kommunetv.no * Using utils.update_url instead of regex --------- Co-authored-by: dirkf --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/kommunetv.py | 35 ++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 youtube_dl/extractor/kommunetv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f63a2e030..d8428f46f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -557,6 +557,7 @@ from .khanacademy import ( from .kickstarter import KickStarterIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE +from .kommunetv import KommunetvIE from .konserthusetplay import KonserthusetPlayIE from .krasview import KrasViewIE from .kth import KTHIE diff --git a/youtube_dl/extractor/kommunetv.py b/youtube_dl/extractor/kommunetv.py new file mode 100644 index 000000000..91d06a74f --- /dev/null +++ b/youtube_dl/extractor/kommunetv.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import update_url + + +class KommunetvIE(InfoExtractor): + _VALID_URL = r'https://(\w+).kommunetv.no/archive/(?P\w+)' + _TEST = { + 'url': 'https://oslo.kommunetv.no/archive/921', + 'md5': '5f102be308ee759be1e12b63d5da4bbc', + 'info_dict': { + 'id': '921', + 'title': 'Bystyremøte', + 'ext': 'mp4' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 'application/json' + } + data = self._download_json('https://oslo.kommunetv.no/api/streams?streamType=1&id=%s' % video_id, video_id, headers=headers) + title = data['stream']['title'] + file = data['playlist'][0]['playlist'][0]['file'] + url = update_url(file, query=None, fragment=None) + formats = self._extract_m3u8_formats(url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + return { + 'id': video_id, + 'formats': formats, + 'title': title + } From 70a4a8b7528a3acd34e9c6e1a654da834d107d5a Mon Sep 17 00:00:00 2001 From: fonkap Date: Sat, 11 Feb 2023 03:54:45 +0100 Subject: [PATCH 13/15] [StreamsbIE] Add extractor for streamsb.com (viewsb.com) (#31517) * Add extractor for streamsb.com (viewsb.com) * make data url using app.js version --------- Co-authored-by: dirkf --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/streamsb.py | 61 ++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 youtube_dl/extractor/streamsb.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d8428f46f..3a87f9e33 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1206,6 +1206,7 @@ from .storyfire import ( from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .streamsb import StreamsbIE from .streetvoice import StreetVoiceIE from .stretchinternet import StretchInternetIE from .stv import STVPlayerIE diff --git a/youtube_dl/extractor/streamsb.py b/youtube_dl/extractor/streamsb.py new file mode 100644 index 000000000..bffcb3de1 --- /dev/null +++ b/youtube_dl/extractor/streamsb.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import binascii +import random +import re +import string + +from .common import InfoExtractor +from ..utils import urljoin, url_basename + + +def to_ascii_hex(str1): + return binascii.hexlify(str1.encode('utf-8')).decode('ascii') + + +def generate_random_string(length): + return ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(length)) + + +class StreamsbIE(InfoExtractor): + _DOMAINS = ('viewsb.com', ) + _VALID_URL = r'https://(?P%s)/(?P.+)' % '|'.join(_DOMAINS) + _TEST = { + 'url': 'https://viewsb.com/dxfvlu4qanjx', + 'md5': '488d111a63415369bf90ea83adc8a325', + 'info_dict': { + 'id': 'dxfvlu4qanjx', + 'ext': 'mp4', + 'title': 'Sintel' + } + } + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).group('domain', 'id') + webpage = self._download_webpage(url, video_id) + + iframe_rel_url = self._search_regex(r'''(?i)]+\bsrc\s*=\s*('|")(?P/.*\.html)\1''', webpage, 'iframe', group='path') + iframe_url = urljoin('https://' + domain, iframe_rel_url) + + iframe_data = self._download_webpage(iframe_url, video_id) + app_version = self._search_regex(r''']+\bsrc\s*=\s*["|'].*/app\.min\.(\d+)\.js''', iframe_data, 'app version', fatal=False) or '50' + + video_code = url_basename(iframe_url).rsplit('.')[0] + + length = 12 + req = '||'.join((generate_random_string(length), video_code, generate_random_string(length), 'streamsb')) + ereq = 'https://{0}/sources{1}/{2}'.format(domain, app_version, to_ascii_hex(req)) + + video_data = self._download_webpage(ereq, video_id, headers={ + 'Referer': iframe_url, + 'watchsb': 'sbstream', + }) + player_data = self._parse_json(video_data, video_id) + title = player_data['stream_data']['title'] + formats = self._extract_m3u8_formats(player_data['stream_data']['file'], video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + return { + 'id': video_id, + 'formats': formats, + 'title': title, + } From d524ac1898ad680e3476be30e218f44e42961374 Mon Sep 17 00:00:00 2001 From: Epsilon Spider <105926847+epsilonSpider@users.noreply.github.com> Date: Sat, 11 Feb 2023 14:42:25 -0500 Subject: [PATCH 14/15] Check for oreilly login with new url Resolves #30884 --- youtube_dl/extractor/safari.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 2cc665122..2d41482e5 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -38,7 +38,8 @@ class SafariBaseIE(InfoExtractor): 'Downloading login page') def is_logged(urlh): - return 'learning.oreilly.com/home/' in urlh.geturl() + return ('https://learning.oreilly.com/member/login/' == urlh.geturl() + or 'learning.oreilly.com/home/' in urlh.geturl()) if is_logged(urlh): self.LOGGED_IN = True From f7ca7e77be69b7028d68b92332d6de499d528601 Mon Sep 17 00:00:00 2001 From: Epsilon Spider <105926847+epsilonSpider@users.noreply.github.com> Date: Tue, 4 Apr 2023 16:29:00 -0400 Subject: [PATCH 15/15] [safari] Refactor login check --- youtube_dl/extractor/safari.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 2d41482e5..e923d7641 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -38,8 +38,11 @@ class SafariBaseIE(InfoExtractor): 'Downloading login page') def is_logged(urlh): - return ('https://learning.oreilly.com/member/login/' == urlh.geturl() - or 'learning.oreilly.com/home/' in urlh.geturl()) + url = urlh.geturl() + parsed_url = compat_urlparse.urlparse(url) + return parsed_url.hostname.endswith('learning.oreilly.com') and ( + parsed_url.path.startswith('/home/') + or (parsed_url.path == '/member/login/' and not parsed_url.query)) if is_logged(urlh): self.LOGGED_IN = True