diff --git a/test/test_utils.py b/test/test_utils.py index 102420fcb..ca36909a8 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -81,6 +81,7 @@ from youtube_dl.utils import ( sanitize_filename, sanitize_path, sanitize_url, + sanitized_Request, shell_quote, smuggle_url, str_or_none, @@ -255,6 +256,18 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar') self.assertEqual(sanitize_url('foo bar'), 'foo bar') + def test_sanitized_Request(self): + self.assertFalse(sanitized_Request('http://foo.bar').has_header('Authorization')) + self.assertFalse(sanitized_Request('http://:foo.bar').has_header('Authorization')) + self.assertEqual(sanitized_Request('http://@foo.bar').get_header('Authorization'), + 'Basic Og==') + self.assertEqual(sanitized_Request('http://:pass@foo.bar').get_header('Authorization'), + 'Basic OnBhc3M=') + self.assertEqual(sanitized_Request('http://user:@foo.bar').get_header('Authorization'), + 'Basic dXNlcjo=') + self.assertEqual(sanitized_Request('http://user:pass@foo.bar').get_header('Authorization'), + 'Basic dXNlcjpwYXNz') + def test_expand_path(self): def env(var): return '%{0}%'.format(var) if sys.platform == 'win32' else '${0}'.format(var) @@ -512,11 +525,14 @@ class TestUtil(unittest.TestCase): self.assertEqual(float_or_none(set()), None) def test_int_or_none(self): + self.assertEqual(int_or_none(42), 42) self.assertEqual(int_or_none('42'), 42) self.assertEqual(int_or_none(''), None) self.assertEqual(int_or_none(None), None) self.assertEqual(int_or_none([]), None) self.assertEqual(int_or_none(set()), None) + self.assertEqual(int_or_none('42', base=8), 34) + self.assertRaises(TypeError, int_or_none(42, base=8)) def test_str_to_int(self): self.assertEqual(str_to_int('123,456'), 123456) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0e5dfd8fa..7fae9e57b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -25,6 +25,7 @@ 
from ..compat import ( compat_getpass, compat_integer_types, compat_http_client, + compat_kwargs, compat_map as map, compat_open as open, compat_os_name, @@ -1102,6 +1103,60 @@ class InfoExtractor(object): self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) return None + def _search_json(self, start_pattern, string, name, video_id, **kwargs): + """Searches string for the JSON object specified by start_pattern""" + + # self, start_pattern, string, name, video_id, *, end_pattern='', + # contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT + # NB: end_pattern is only used to reduce the size of the initial match + end_pattern = kwargs.pop('end_pattern', '') + # (?:[\s\S]) simulates (?(s):.) (eg) + contains_pattern = kwargs.pop('contains_pattern', r'{[\s\S]+}') + fatal = kwargs.pop('fatal', True) + default = kwargs.pop('default', NO_DEFAULT) + + if default is NO_DEFAULT: + default, has_default = {}, False + else: + fatal, has_default = False, True + + json_string = self._search_regex( + r'(?:{0})\s*(?P{1})\s*(?:{2})'.format( + start_pattern, contains_pattern, end_pattern), + string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT) + if not json_string: + return default + + # yt-dlp has a special JSON parser that allows trailing text. + # Until that arrives here, the diagnostic from the exception + # raised by json.loads() is used to extract the wanted text. + # Either way, it's a problem if a transform_source() can't + # handle the trailing text. 
+ + # force an exception + kwargs['fatal'] = True + + # self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) + for _ in range(2): + try: + # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs) + transform_source = kwargs.pop('transform_source', None) + if transform_source: + json_string = transform_source(json_string) + return self._parse_json(json_string, video_id, **compat_kwargs(kwargs)) + except ExtractorError as e: + end = int_or_none(self._search_regex(r'\(char\s+(\d+)', error_to_compat_str(e), 'end', default=None)) + if end is not None: + json_string = json_string[:end] + continue + msg = 'Unable to extract {0} - Failed to parse JSON'.format(name) + if fatal: + raise ExtractorError(msg, cause=e.cause, video_id=video_id) + elif not has_default: + self.report_warning( + '{0}: {1}'.format(msg, error_to_compat_str(e)), video_id=video_id) + return default + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. 
@@ -2966,25 +3021,22 @@ class InfoExtractor(object): return formats def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): - mobj = re.search( - r'''(?s)jwplayer\s*\(\s*(?P'|")(?!(?P=q)).+(?P=q)\s*\)(?!).*?\.\s*setup\s*\(\s*(?P(?:\([^)]*\)|[^)])+)\s*\)''', - webpage) - if mobj: - try: - jwplayer_data = self._parse_json(mobj.group('options'), - video_id=video_id, - transform_source=transform_source) - except ExtractorError: - pass - else: - if isinstance(jwplayer_data, dict): - return jwplayer_data + return self._search_json( + r'''(?'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!).)*?\.\s*(?:setup\s*\(|(?Pload)\s*\(\s*\[)''', + webpage, 'JWPlayer data', video_id, + # must be a {...} or sequence, ending + contains_pattern=r'\{[\s\S]*}(?(load)(?:\s*,\s*\{[\s\S]*})*)', end_pattern=r'(?(load)\]|\))', + transform_source=transform_source, default=None) def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): - jwplayer_data = self._find_jwplayer_data( - webpage, video_id, transform_source=js_to_json) - return self._parse_jwplayer_data( - jwplayer_data, video_id, *args, **kwargs) + + # allow passing `transform_source` through to _find_jwplayer_data() + transform_source = kwargs.pop('transform_source', None) + kwfind = compat_kwargs({'transform_source': transform_source}) if transform_source else {} + + jwplayer_data = self._find_jwplayer_data(webpage, video_id, **kwfind) + + return self._parse_jwplayer_data(jwplayer_data, video_id, *args, **kwargs) def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): @@ -3018,22 +3070,14 @@ class InfoExtractor(object): mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if not isinstance(track, dict): - continue - track_kind = track.get('kind') - if not track_kind or not isinstance(track_kind, 
compat_str): - continue - if track_kind.lower() not in ('captions', 'subtitles'): - continue - track_url = urljoin(base_url, track.get('file')) - if not track_url: - continue - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track_url) - }) + for track in traverse_obj(video_data, ( + 'tracks', lambda _, t: t.get('kind').lower() in ('captions', 'subtitles'))): + track_url = urljoin(base_url, track.get('file')) + if not track_url: + continue + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track_url) + }) entry = { 'id': this_video_id, diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1a1905d13..c68c1d849 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -383,7 +383,6 @@ from .fc2 import ( FC2EmbedIE, ) from .fczenit import FczenitIE -from .filemoon import FileMoonIE from .fifa import FifaIE from .filmon import ( FilmOnIE, diff --git a/youtube_dl/extractor/filemoon.py b/youtube_dl/extractor/filemoon.py deleted file mode 100644 index 654df9b69..000000000 --- a/youtube_dl/extractor/filemoon.py +++ /dev/null @@ -1,43 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - decode_packed_codes, - js_to_json, -) - - -class FileMoonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?filemoon\.sx/./(?P\w+)' - _TEST = { - 'url': 'https://filemoon.sx/e/dw40rxrzruqz', - 'md5': '5a713742f57ac4aef29b74733e8dda01', - 'info_dict': { - 'id': 'dw40rxrzruqz', - 'title': 'dw40rxrzruqz', - 'ext': 'mp4' - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - matches = re.findall(r'(?s)(eval.*?)', webpage) - packed = matches[-1] - unpacked = decode_packed_codes(packed) - jwplayer_sources = self._parse_json( - self._search_regex( - 
r'(?s)player\s*\.\s*setup\s*\(\s*\{\s*sources\s*:\s*(.*?])', unpacked, 'jwplayer sources'), - video_id, transform_source=js_to_json) - - formats = self._parse_jwplayer_formats(jwplayer_sources, video_id) - - return { - 'id': video_id, - 'title': self._generic_title(url) or video_id, - 'formats': formats - } diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index 4589e78a1..194b4b011 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -6,22 +6,31 @@ import re import string from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_ord, + compat_struct_pack, +) from ..utils import ( ExtractorError, int_or_none, mimetype2ext, parse_codecs, + parse_qs, update_url_query, urljoin, xpath_element, xpath_text, ) -from ..compat import ( - compat_b64decode, - compat_ord, - compat_struct_pack, - compat_urlparse, -) + + +def compat_random_choices(population, *args, **kwargs): + # weights=None, *, cum_weights=None, k=1 + # limited implementation needed here + weights = args[0] if args else kwargs.get('weights') + assert all(w is None for w in (weights, kwargs.get('cum_weights'))) + k = kwargs.get('k', 1) + return ''.join(random.choice(population) for _ in range(k)) class VideaIE(InfoExtractor): @@ -35,6 +44,7 @@ class VideaIE(InfoExtractor): ) (?P[^?#&]+) ''' + _EMBED_REGEX = [r']+src=(["\'])(?P(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1'] _TESTS = [{ 'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ', 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', @@ -44,6 +54,7 @@ class VideaIE(InfoExtractor): 'title': 'Az őrült kígyász 285 kígyót enged szabadon', 'thumbnail': r're:^https?://.*', 'duration': 21, + 'age_limit': 0, }, }, { 'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', @@ -54,6 +65,7 @@ class VideaIE(InfoExtractor): 'title': 'Supercars előzés', 'thumbnail': r're:^https?://.*', 'duration': 64, + 'age_limit': 0, }, }, { 'url': 
'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ', @@ -64,6 +76,7 @@ class VideaIE(InfoExtractor): 'title': 'Az őrült kígyász 285 kígyót enged szabadon', 'thumbnail': r're:^https?://.*', 'duration': 21, + 'age_limit': 0, }, }, { 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', @@ -80,11 +93,14 @@ class VideaIE(InfoExtractor): }] _STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r']+src=(["\'])(?P(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', - webpage)] + @classmethod + def _extract_urls(cls, webpage): + def yield_urls(): + for pattern in cls._EMBED_REGEX: + for m in re.finditer(pattern, webpage): + yield m.group('url') + + return list(yield_urls()) @staticmethod def rc4(cipher_text, key): @@ -130,13 +146,13 @@ class VideaIE(InfoExtractor): for i in range(0, 32): result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)] - query = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query) - random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) + query = parse_qs(player_url) + random_seed = ''.join(compat_random_choices(string.ascii_letters + string.digits, k=8)) query['_s'] = random_seed query['_t'] = result[:16] b64_info, handle = self._download_webpage_handle( - 'http://videa.hu/videaplayer_get_xml.php', video_id, query=query) + 'http://videa.hu/player/xml', video_id, query=query) if b64_info.startswith('%s)/(?:embed-)?(?P[0-9a-zA-Z]+)' % '|'.join(site for site in list(zip(*_SITES))[0])) + _EMBED_REGEX = [r']+\bsrc=(["\'])(?P(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' % '|'.join(site for site in list(zip(*_SITES))[0])] _FILE_NOT_FOUND_REGEXES = ( r'>(?:404 - )?File Not Found<', r'>The file was removed by administrator<', ) + _TITLE_REGEXES = ( + r'style="z-index: [0-9]+;">([^<]+)', + r'([^<]+)', + r'h4-fine[^>]*>([^<]+)<', + r'>Watch (.+)[ <]', + r'
<h2 class="video-page-head">([^<]+)</h2>', + r'<h2 style="[^"]*color:#403f3d[^"]*"[^
]*>([^<]+)<', # streamin.to (dead) + r'title\s*:\s*"([^"]+)"', # govid.me + ) + _SOURCE_URL_REGEXES = ( + r'(?:file|src)\s*:\s*(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', + r'file_link\s*=\s*(["\'])(?Phttp(?:(?!\1).)+)\1', + r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?Phttp(?:(?!\2).)+)\2\)', + r']+src=(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', + ) + _THUMBNAIL_REGEXES = ( + r']+poster="([^"]+)"', + r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],', + ) _TESTS = [{ - 'url': 'http://xvideosharing.com/fq65f94nd2ve', - 'md5': '4181f63957e8fe90ac836fa58dc3c8a6', + 'note': 'link in `sources`', + 'url': 'https://uqload.to/dcsu06gdb45o', + 'md5': '7f8db187b254379440bf4fcad094ae86', 'info_dict': { - 'id': 'fq65f94nd2ve', + 'id': 'dcsu06gdb45o', 'ext': 'mp4', - 'title': 'sample', - 'thumbnail': r're:http://.*\.jpg', + 'title': 'f2e31015957e74c8c8427982e161c3fc mp4', + 'thumbnail': r're:https://.*\.jpg' + }, + 'params': { + 'nocheckcertificate': True, + }, + 'expected_warnings': ['Unable to extract JWPlayer data'], + }, { + 'note': 'link in decoded `sources`', + 'url': 'https://xvideosharing.com/1tlg6agrrdgc', + 'md5': '2608ce41932c1657ae56258a64e647d9', + 'info_dict': { + 'id': '1tlg6agrrdgc', + 'ext': 'mp4', + 'title': '0121', + 'thumbnail': r're:https?://.*\.jpg', + }, + 'skip': 'This server is in maintenance mode.', + }, { + 'note': 'JWPlayer link in un-p,a,c,k,e,d JS', + 'url': 'https://filemoon.sx/e/dw40rxrzruqz', + 'md5': '5a713742f57ac4aef29b74733e8dda01', + 'info_dict': { + 'id': 'dw40rxrzruqz', + 'title': 'dw40rxrzruqz', + 'ext': 'mp4' + }, + }, { + 'note': 'JWPlayer link in un-p,a,c,k,e,d JS', + 'url': 'https://vadbam.net/6lnbkci96wly.html', + 'md5': 'a1616800076177e2ac769203957c54bc', + 'info_dict': { + 'id': '6lnbkci96wly', + 'title': 'Heart Crime S01 E03 weciima autos', + 'ext': 'mp4' + }, + }, { + 'note': 'JWPlayer link in clear', + 'url': 'https://w1.viidshar.com/nnibe0xf0h79.html', + 'md5': 
'f0a580ce9df06cc61b4a5c979d672367', + 'info_dict': { + 'id': 'nnibe0xf0h79', + 'title': 'JaGa 68ar', + 'ext': 'mp4' + }, + 'params': { + 'skip_download': 'ffmpeg', + }, + 'expected_warnings': ['hlsnative has detected features it does not support'], + }, { + 'note': 'JWPlayer link in clear', + 'url': 'https://wolfstream.tv/a3drtehyrg52.html', + 'md5': '1901d86a79c5e0c6a51bdc9a4cfd3769', + 'info_dict': { + 'id': 'a3drtehyrg52', + 'title': 'NFL 2023 W04 DET@GB', + 'ext': 'mp4' }, }, { 'url': 'https://aparat.cam/n4d6dh0wvlpr', 'only_matching': True, }, { - 'url': 'https://wolfstream.tv/nthme29v9u2x', + 'url': 'https://uqload.to/ug5somm0ctnk.html', + 'only_matching': True, + }, { + 'url': 'https://highstream.tv/2owiyz3sjoux', + 'only_matching': True, + }, { + 'url': 'https://vedbam.xyz/6lnbkci96wly.html', 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r']+\bsrc=(["\'])(?P(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' - % '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]), - webpage)] + @classmethod + def _extract_urls(cls, webpage): + + def yield_urls(): + for regex in cls._EMBED_REGEX: + for mobj in re.finditer(regex, webpage): + yield mobj.group('url') + + return list(yield_urls()) def _real_extract(self, url): - host, video_id = re.match(self._VALID_URL, url).groups() + host, video_id = self._match_valid_url(url).group('host', 'id') - url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) + url = 'https://%s/%s' % ( + host, + 'embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) webpage = self._download_webpage(url, video_id) - - if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES): + container_div = get_element_by_id('container', webpage) or webpage + if self._search_regex( + r'>This server is in maintenance mode\.', container_div, + 'maint error', group=0, default=None): + raise 
ExtractorError(clean_html(container_div), expected=True) + if self._search_regex( + self._FILE_NOT_FOUND_REGEXES, container_div, + 'missing video error', group=0, default=None): raise ExtractorError('Video %s does not exist' % video_id, expected=True) fields = self._hidden_inputs(webpage) @@ -122,59 +226,43 @@ class XFileShareIE(InfoExtractor): 'Content-type': 'application/x-www-form-urlencoded', }) - title = (self._search_regex( - (r'style="z-index: [0-9]+;">([^<]+)', - r'([^<]+)', - r'h4-fine[^>]*>([^<]+)<', - r'>Watch (.+)[ <]', - r'
<h2 class="video-page-head">([^<]+)</h2>', - r'<h2 style="[^"]*color:#403f3d[^"]*"[^
]*>([^<]+)<', # streamin.to - r'title\s*:\s*"([^"]+)"'), # govid.me - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None) or video_id).strip() - - for regex, func in ( - (r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes), - (r'(゚.+)', aa_decode)): - obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) - if obf_code: - webpage = webpage.replace(obf_code, func(obf_code)) - - formats = [] - - jwplayer_data = self._search_regex( - [ - r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);', - r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);', - ], webpage, - 'jwplayer data', default=None) - if jwplayer_data: - jwplayer_data = self._parse_json( - jwplayer_data.replace(r"\'", "'"), video_id, js_to_json) + title = ( + self._search_regex(self._TITLE_REGEXES, webpage, 'title', default=None) + or self._og_search_title(webpage, default=None) + or video_id).strip() + + obf_code = True + while obf_code: + for regex, func in ( + (r'(?s)(?).)+\)\))', + decode_packed_codes), + (r'(゚.+)', aa_decode)): + obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) + if obf_code: + webpage = webpage.replace(obf_code, func(obf_code)) + break + + jwplayer_data = self._find_jwplayer_data( + webpage.replace(r'\'', '\''), video_id) + result = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, + m3u8_id='hls', mpd_id='dash') + + if not traverse_obj(result, 'formats'): if jwplayer_data: - formats = self._parse_jwplayer_data( - jwplayer_data, video_id, False, - m3u8_id='hls', mpd_id='dash')['formats'] - - if not formats: - urls = [] - for regex in ( - r'(?:file|src)\s*:\s*(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', - r'file_link\s*=\s*(["\'])(?Phttp(?:(?!\1).)+)\1', - r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?Phttp(?:(?!\2).)+)\2\)', - r']+src=(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): + self.report_warning( + 'Failed to extract JWPlayer formats', 
video_id=video_id) + urls = set() + for regex in self._SOURCE_URL_REGEXES: for mobj in re.finditer(regex, webpage): - video_url = mobj.group('url') - if video_url not in urls: - urls.append(video_url) + urls.add(mobj.group('url')) sources = self._search_regex( r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None) - if sources: - urls.extend(self._parse_json(sources, video_id)) + urls.update(traverse_obj(sources, (T(lambda s: self._parse_json(s, video_id)), Ellipsis))) formats = [] - for video_url in urls: + for video_url in traverse_obj(urls, (Ellipsis, T(url_or_none))): if determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', @@ -185,17 +273,19 @@ class XFileShareIE(InfoExtractor): 'url': video_url, 'format_id': 'sd', }) - self._sort_formats(formats) + result = {'formats': formats} + + self._sort_formats(result['formats']) thumbnail = self._search_regex( - [ - r']+poster="([^"]+)"', - r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],', - ], webpage, 'thumbnail', default=None) + self._THUMBNAIL_REGEXES, webpage, 'thumbnail', default=None) + + if not (title or result.get('title')): + title = self._generic_title(url) or video_id - return { + return merge_dicts(result, { 'id': video_id, - 'title': title, + 'title': title or None, 'thumbnail': thumbnail, - 'formats': formats, - } + 'http_headers': {'Referer': url} + }) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 61b94d84c..03c73dff3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2182,8 +2182,28 @@ def sanitize_url(url): return url +def extract_basic_auth(url): + parts = compat_urllib_parse.urlsplit(url) + if parts.username is None: + return url, None + url = compat_urllib_parse.urlunsplit(parts._replace(netloc=( + parts.hostname if parts.port is None + else '%s:%d' % (parts.hostname, parts.port)))) + auth_payload = base64.b64encode( + ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8')) + return url, 
'Basic {0}'.format(auth_payload.decode('ascii')) + + def sanitized_Request(url, *args, **kwargs): - return compat_urllib_request.Request(escape_url(sanitize_url(url)), *args, **kwargs) + url, auth_header = extract_basic_auth(escape_url(sanitize_url(url))) + if auth_header is not None: + headers = args[1] if len(args) > 1 else kwargs.get('headers') + headers = headers or {} + headers['Authorization'] = auth_header + if len(args) <= 1 and kwargs.get('headers') is None: + kwargs['headers'] = headers + kwargs = compat_kwargs(kwargs) + return compat_urllib_request.Request(url, *args, **kwargs) def expand_path(s): @@ -3832,14 +3852,15 @@ class PUTRequest(compat_urllib_request.Request): return 'PUT' -def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): +def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1, base=None): if get_attr: if v is not None: v = getattr(v, get_attr, None) if v in (None, ''): return default try: - return int(v) * invscale // scale + # like int, raise if base is specified and v is not a string + return (int(v) if base is None else int(v, base=base)) * invscale // scale except (ValueError, TypeError, OverflowError): return default