From f0812d784836d18fd25ea32f9b5a0c9c6e92425b Mon Sep 17 00:00:00 2001 From: Hubert Hirtz Date: Mon, 4 Mar 2024 01:27:55 +0000 Subject: [PATCH 1/8] [utils] Handle user:pass in URLs (#28801) * Handle user:pass in URLs Fixes "nonnumeric port" errors when youtube-dl is given URLs with usernames and passwords such as: http://username:password@example.com/myvideo.mp4 Refs: - https://en.wikipedia.org/wiki/Basic_access_authentication - https://tools.ietf.org/html/rfc1738#section-3.1 - https://docs.python.org/3.8/library/urllib.parse.html#urllib.parse.urlsplit Fixes #18276 (point 4) Fixes #20258 Fixes #26211 (see comment) * Align code with yt-dlp --------- Co-authored-by: dirkf --- test/test_utils.py | 13 +++++++++++++ youtube_dl/utils.py | 22 +++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 102420fcb..90d64b581 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -81,6 +81,7 @@ from youtube_dl.utils import ( sanitize_filename, sanitize_path, sanitize_url, + sanitized_Request, shell_quote, smuggle_url, str_or_none, @@ -255,6 +256,18 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar') self.assertEqual(sanitize_url('foo bar'), 'foo bar') + def test_sanitized_Request(self): + self.assertFalse(sanitized_Request('http://foo.bar').has_header('Authorization')) + self.assertFalse(sanitized_Request('http://:foo.bar').has_header('Authorization')) + self.assertEqual(sanitized_Request('http://@foo.bar').get_header('Authorization'), + 'Basic Og==') + self.assertEqual(sanitized_Request('http://:pass@foo.bar').get_header('Authorization'), + 'Basic OnBhc3M=') + self.assertEqual(sanitized_Request('http://user:@foo.bar').get_header('Authorization'), + 'Basic dXNlcjo=') + self.assertEqual(sanitized_Request('http://user:pass@foo.bar').get_header('Authorization'), + 'Basic dXNlcjpwYXNz') + def test_expand_path(self): def env(var): return '%{0}%'.format(var) if sys.platform == 'win32' else '${0}'.format(var) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 61b94d84c..c249e7168 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2182,8 +2182,28 @@ def sanitize_url(url): return url +def extract_basic_auth(url): + parts = compat_urllib_parse.urlsplit(url) + if parts.username is None: + return url, None + url = compat_urllib_parse.urlunsplit(parts._replace(netloc=( + parts.hostname if parts.port is None + else '%s:%d' % (parts.hostname, parts.port)))) + auth_payload = base64.b64encode( + ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8')) + return url, 'Basic {0}'.format(auth_payload.decode('ascii')) + + def sanitized_Request(url, *args, **kwargs): - return compat_urllib_request.Request(escape_url(sanitize_url(url)), *args, **kwargs) + url, auth_header = extract_basic_auth(escape_url(sanitize_url(url))) + if auth_header is not None: + headers = args[1] if len(args) > 1 else kwargs.get('headers') + headers = headers or {} + headers['Authorization'] = auth_header + if len(args) <= 1 and kwargs.get('headers') is None: + kwargs['headers'] = headers + kwargs = compat_kwargs(kwargs) + return compat_urllib_request.Request(url, *args, **kwargs) def expand_path(s): From acc383b9e3c2d454121c22570c901dd2c689dc26 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 4 Mar 2024 20:52:58 +0000 Subject: [PATCH 2/8] [utils] Let int_or_none() accept a base, like int() --- test/test_utils.py | 3 +++ youtube_dl/utils.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 90d64b581..ca36909a8 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -525,11 +525,14 @@ class TestUtil(unittest.TestCase): self.assertEqual(float_or_none(set()), None) def test_int_or_none(self): + self.assertEqual(int_or_none(42), 42) self.assertEqual(int_or_none('42'), 42) self.assertEqual(int_or_none(''), None) self.assertEqual(int_or_none(None), None) self.assertEqual(int_or_none([]), None) self.assertEqual(int_or_none(set()), None) + self.assertEqual(int_or_none('42', base=8), 34) + self.assertRaises(TypeError, int_or_none(42, base=8)) def test_str_to_int(self): self.assertEqual(str_to_int('123,456'), 123456) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c249e7168..03c73dff3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3852,14 +3852,15 @@ class PUTRequest(compat_urllib_request.Request): return 'PUT' -def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): +def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1, base=None): if get_attr: if v is not None: v = getattr(v, get_attr, None) if v in (None, ''): return default try: - return int(v) * invscale // scale + # like int, raise if base is specified and v is not a string + return (int(v) if base is None else int(v, base=base)) * invscale // scale except (ValueError, TypeError, OverflowError): return default From 7216fa2ac4706e099ea2ad9a04fe7bf4300bc745 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 21 Feb 2024 00:03:17 +0000 Subject: [PATCH 3/8] [InfoExtractor] Add `_search_json()` * uses the error diagnostic to truncate the JSON string * may be confused by non-C-Pythons --- youtube_dl/extractor/common.py | 55 ++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0e5dfd8fa..b5e95a318 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -25,6 +25,7 @@ from ..compat import ( compat_getpass, compat_integer_types, compat_http_client, + compat_kwargs, compat_map as map, compat_open as open, compat_os_name, @@ -1102,6 +1103,60 @@ class InfoExtractor(object): self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) return None + def _search_json(self, start_pattern, string, name, video_id, **kwargs): + """Searches string for the JSON object specified by start_pattern""" + + # self, start_pattern, string, name, video_id, *, end_pattern='', + # contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT + # NB: end_pattern is only used to reduce the size of the initial match + end_pattern = kwargs.pop('end_pattern', '') + # (?:[\s\S]) simulates (?(s):.) (eg) + contains_pattern = kwargs.pop('contains_pattern', r'{[\s\S]+}') + fatal = kwargs.pop('fatal', True) + default = kwargs.pop('default', NO_DEFAULT) + + if default is NO_DEFAULT: + default, has_default = {}, False + else: + fatal, has_default = False, True + + json_string = self._search_regex( + r'(?:{0})\s*(?P{1})\s*(?:{2})'.format( + start_pattern, contains_pattern, end_pattern), + string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT) + if not json_string: + return default + + # yt-dlp has a special JSON parser that allows trailing text. + # Until that arrives here, the diagnostic from the exception + # raised by json.loads() is used to extract the wanted text. + # Either way, it's a problem if a transform_source() can't + # handle the trailing text. + + # force an exception + kwargs['fatal'] = True + + # self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) + for _ in range(2): + try: + # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs) + transform_source = kwargs.pop('transform_source', None) + if transform_source: + json_string = transform_source(json_string) + return self._parse_json(json_string, video_id, **compat_kwargs(kwargs)) + except ExtractorError as e: + end = int_or_none(self._search_regex(r'\(char\s+(\d+)', error_to_compat_str(e), 'end', default=None)) + if end is not None: + json_string = json_string[:end] + continue + msg = 'Unable to extract {0} - Failed to parse JSON'.format(name) + if fatal: + raise ExtractorError(msg, cause=e.cause, video_id=video_id) + elif not has_default: + self.report_warning( + '{0}: {1}'.format(msg, error_to_compat_str(e)), video_id=video_id) + return default + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. From f66372403fd9e1661199fea100ba2600fa9697b2 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 21 Feb 2024 00:09:48 +0000 Subject: [PATCH 4/8] [InfoExtractor] Rework and improve JWPlayer extraction * use traverse_obj() and _search_json() * support playlist `.load({**video1},{**video2}, ...)` * support transform_source=... for _extract_jwplayer_data() --- youtube_dl/extractor/common.py | 55 ++++++++++++++-------------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b5e95a318..7fae9e57b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -3021,25 +3021,22 @@ class InfoExtractor(object): return formats def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): - mobj = re.search( - r'''(?s)jwplayer\s*\(\s*(?P'|")(?!(?P=q)).+(?P=q)\s*\)(?!).*?\.\s*setup\s*\(\s*(?P(?:\([^)]*\)|[^)])+)\s*\)''', - webpage) - if mobj: - try: - jwplayer_data = self._parse_json(mobj.group('options'), - video_id=video_id, - transform_source=transform_source) - except ExtractorError: - pass - else: - if isinstance(jwplayer_data, dict): - return jwplayer_data + return self._search_json( + r'''(?'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!).)*?\.\s*(?:setup\s*\(|(?Pload)\s*\(\s*\[)''', + webpage, 'JWPlayer data', video_id, + # must be a {...} or sequence, ending + contains_pattern=r'\{[\s\S]*}(?(load)(?:\s*,\s*\{[\s\S]*})*)', end_pattern=r'(?(load)\]|\))', + transform_source=transform_source, default=None) def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): - jwplayer_data = self._find_jwplayer_data( - webpage, video_id, transform_source=js_to_json) - return self._parse_jwplayer_data( - jwplayer_data, video_id, *args, **kwargs) + + # allow passing `transform_source` through to _find_jwplayer_data() + transform_source = kwargs.pop('transform_source', None) + kwfind = compat_kwargs({'transform_source': transform_source}) if transform_source else {} + + jwplayer_data = self._find_jwplayer_data(webpage, video_id, **kwfind) + + return self._parse_jwplayer_data(jwplayer_data, video_id, *args, **kwargs) def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): @@ -3073,22 +3070,14 @@ class InfoExtractor(object): mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if not isinstance(track, dict): - continue - track_kind = track.get('kind') - if not track_kind or not isinstance(track_kind, compat_str): - continue - if track_kind.lower() not in ('captions', 'subtitles'): - continue - track_url = urljoin(base_url, track.get('file')) - if not track_url: - continue - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track_url) - }) + for track in traverse_obj(video_data, ( + 'tracks', lambda _, t: t.get('kind').lower() in ('captions', 'subtitles'))): + track_url = urljoin(base_url, track.get('file')) + if not track_url: + continue + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track_url) + }) entry = { 'id': this_video_id, From f7b30e3f73f56aa4765212cd04eb48743e03dfcd Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 21 Feb 2024 00:18:17 +0000 Subject: [PATCH 5/8] [XFileShare] Update extractor for 2024 * simplify aa_decode() * review and update supported sites and tests * in above, include FileMoon.sx, and remove separate module * incorporate changes from yt-dlp * allow for decoding multiple scripts (eg, FileMoon) * use new JWPlayer extraction --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/filemoon.py | 43 ----- youtube_dl/extractor/xfileshare.py | 296 +++++++++++++++++++---------- 3 files changed, 193 insertions(+), 147 deletions(-) delete mode 100644 youtube_dl/extractor/filemoon.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 29b655c94..a56a7c52f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -383,7 +383,6 @@ from .fc2 import ( FC2EmbedIE, ) from .fczenit import FczenitIE -from .filemoon import FileMoonIE from .fifa import FifaIE from .filmon import ( FilmOnIE, diff --git a/youtube_dl/extractor/filemoon.py b/youtube_dl/extractor/filemoon.py deleted file mode 100644 index 654df9b69..000000000 --- a/youtube_dl/extractor/filemoon.py +++ /dev/null @@ -1,43 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - decode_packed_codes, - js_to_json, -) - - -class FileMoonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?filemoon\.sx/./(?P\w+)' - _TEST = { - 'url': 'https://filemoon.sx/e/dw40rxrzruqz', - 'md5': '5a713742f57ac4aef29b74733e8dda01', - 'info_dict': { - 'id': 'dw40rxrzruqz', - 'title': 'dw40rxrzruqz', - 'ext': 'mp4' - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - matches = re.findall(r'(?s)(eval.*?)', webpage) - packed = matches[-1] - unpacked = decode_packed_codes(packed) - jwplayer_sources = self._parse_json( - self._search_regex( - r'(?s)player\s*\.\s*setup\s*\(\s*\{\s*sources\s*:\s*(.*?])', unpacked, 'jwplayer sources'), - video_id, transform_source=js_to_json) - - formats = self._parse_jwplayer_formats(jwplayer_sources, video_id) - - return { - 'id': video_id, - 'title': self._generic_title(url) or video_id, - 'formats': formats - } diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index df9efa9fa..4dc3032e7 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -4,20 +4,28 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_chr +from ..compat import ( + compat_chr, + compat_zip as zip, +) from ..utils import ( + clean_html, decode_packed_codes, determine_ext, ExtractorError, + get_element_by_id, int_or_none, - js_to_json, + merge_dicts, + T, + traverse_obj, + url_or_none, urlencode_postdata, ) # based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58 def aa_decode(aa_code): - symbol_table = [ + symbol_table = ( ('7', '((゚ー゚) + (o^_^o))'), ('6', '((o^_^o) +(o^_^o))'), ('5', '((゚ー゚) + (゚Θ゚))'), @@ -26,84 +34,180 @@ def aa_decode(aa_code): ('3', '(o^_^o)'), ('1', '(゚Θ゚)'), ('0', '(c^_^o)'), - ] + ('+', ''), + ) delim = '(゚Д゚)[゚ε゚]+' - ret = '' - for aa_char in aa_code.split(delim): + + def chr_from_code(c): for val, pat in symbol_table: - aa_char = aa_char.replace(pat, val) - aa_char = aa_char.replace('+ ', '') - m = re.match(r'^\d+', aa_char) - if m: - ret += compat_chr(int(m.group(0), 8)) + c = c.replace(pat, val) + if c.startswith(('u', 'U')): + base = 16 + c = c[1:] else: - m = re.match(r'^u([\da-f]+)', aa_char) - if m: - ret += compat_chr(int(m.group(1), 16)) - return ret + base = 10 + c = int_or_none(c, base=base) + return '' if c is None else compat_chr(c) + + return ''.join( + chr_from_code(aa_char) + for aa_char in aa_code.split(delim)) class XFileShareIE(InfoExtractor): _SITES = ( - (r'aparat\.cam', 'Aparat'), - (r'clipwatching\.com', 'ClipWatching'), - (r'gounlimited\.to', 'GoUnlimited'), - (r'govid\.me', 'GoVid'), - (r'holavid\.com', 'HolaVid'), - (r'streamty\.com', 'Streamty'), - (r'thevideobee\.to', 'TheVideoBee'), - (r'uqload\.com', 'Uqload'), - (r'vidbom\.com', 'VidBom'), - (r'vidlo\.us', 'vidlo'), - (r'vidlocker\.xyz', 'VidLocker'), - (r'vidshare\.tv', 'VidShare'), - (r'vup\.to', 'VUp'), + # status check 2024-02: site availability, G site: search + (r'aparat\.cam', 'Aparat'), # Cloudflare says host error 522, apparently changed to wolfstreeam.tv + (r'filemoon\.sx/.', 'FileMoon'), + (r'gounlimited\.to', 'GoUnlimited'), # no media pages listed + (r'govid\.me', 'GoVid'), # no media pages listed + (r'highstream\.tv', 'HighStream'), # clipwatching.com redirects here + (r'holavid\.com', 'HolaVid'), # Cloudflare says host error 522 + # (r'streamty\.com', 'Streamty'), # no media pages listed, connection timeout + # (r'thevideobee\.to', 'TheVideoBee'), # no pages listed, refuses connection + (r'uqload\.to', 'Uqload'), # .com, .co redirect here + (r'(?:vedbam\.xyz|vadbam.net)', 'V?dB?m'), # vidbom.com redirects here, but no valid media pages listed + (r'vidlo\.us', 'vidlo'), # no valid media pages listed + (r'vidlocker\.xyz', 'VidLocker'), # no media pages listed + (r'(?:w\d\.)?viidshar\.com', 'VidShare'), # vidshare.tv redirects here + # (r'vup\.to', 'VUp'), # domain not found (r'wolfstream\.tv', 'WolfStream'), - (r'xvideosharing\.com', 'XVideoSharing'), + (r'xvideosharing\.com', 'XVideoSharing'), # just started showing 'maintenance mode' ) - IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) + IE_DESC = 'XFileShare-based sites: %s' % ', '.join(list(zip(*_SITES))[1]) _VALID_URL = (r'https?://(?:www\.)?(?P%s)/(?:embed-)?(?P[0-9a-zA-Z]+)' % '|'.join(site for site in list(zip(*_SITES))[0])) + _EMBED_REGEX = [r']+\bsrc=(["\'])(?P(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' % '|'.join(site for site in list(zip(*_SITES))[0])] _FILE_NOT_FOUND_REGEXES = ( r'>(?:404 - )?File Not Found<', r'>The file was removed by administrator<', ) + _TITLE_REGEXES = ( + r'style="z-index: [0-9]+;">([^<]+)', + r'([^<]+)', + r'h4-fine[^>]*>([^<]+)<', + r'>Watch (.+)[ <]', + r'

([^<]+)

', + r'

]*>([^<]+)<', # streamin.to (dead) + r'title\s*:\s*"([^"]+)"', # govid.me + ) + _SOURCE_URL_REGEXES = ( + r'(?:file|src)\s*:\s*(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', + r'file_link\s*=\s*(["\'])(?Phttp(?:(?!\1).)+)\1', + r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?Phttp(?:(?!\2).)+)\2\)', + r']+src=(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', + ) + _THUMBNAIL_REGEXES = ( + r']+poster="([^"]+)"', + r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],', + ) _TESTS = [{ - 'url': 'http://xvideosharing.com/fq65f94nd2ve', - 'md5': '4181f63957e8fe90ac836fa58dc3c8a6', + 'note': 'link in `sources`', + 'url': 'https://uqload.to/dcsu06gdb45o', + 'md5': '7f8db187b254379440bf4fcad094ae86', 'info_dict': { - 'id': 'fq65f94nd2ve', + 'id': 'dcsu06gdb45o', 'ext': 'mp4', - 'title': 'sample', - 'thumbnail': r're:http://.*\.jpg', + 'title': 'f2e31015957e74c8c8427982e161c3fc mp4', + 'thumbnail': r're:https://.*\.jpg' + }, + 'params': { + 'nocheckcertificate': True, + }, + 'expected_warnings': ['Unable to extract JWPlayer data'], + }, { + 'note': 'link in decoded `sources`', + 'url': 'https://xvideosharing.com/1tlg6agrrdgc', + 'md5': '2608ce41932c1657ae56258a64e647d9', + 'info_dict': { + 'id': '1tlg6agrrdgc', + 'ext': 'mp4', + 'title': '0121', + 'thumbnail': r're:https?://.*\.jpg', + }, + 'skip': 'This server is in maintenance mode.', + }, { + 'note': 'JWPlayer link in un-p,a,c,k,e,d JS', + 'url': 'https://filemoon.sx/e/dw40rxrzruqz', + 'md5': '5a713742f57ac4aef29b74733e8dda01', + 'info_dict': { + 'id': 'dw40rxrzruqz', + 'title': 'dw40rxrzruqz', + 'ext': 'mp4' + }, + }, { + 'note': 'JWPlayer link in un-p,a,c,k,e,d JS', + 'url': 'https://vadbam.net/6lnbkci96wly.html', + 'md5': 'a1616800076177e2ac769203957c54bc', + 'info_dict': { + 'id': '6lnbkci96wly', + 'title': 'Heart Crime S01 E03 weciima autos', + 'ext': 'mp4' + }, + }, { + 'note': 'JWPlayer link in clear', + 'url': 'https://w1.viidshar.com/nnibe0xf0h79.html', + 'md5': 'f0a580ce9df06cc61b4a5c979d672367', + 'info_dict': { + 'id': 'nnibe0xf0h79', + 'title': 'JaGa 68ar', + 'ext': 'mp4' + }, + 'params': { + 'skip_download': 'ffmpeg', + }, + 'expected_warnings': ['hlsnative has detected features it does not support'], + }, { + 'note': 'JWPlayer link in clear', + 'url': 'https://wolfstream.tv/a3drtehyrg52.html', + 'md5': '1901d86a79c5e0c6a51bdc9a4cfd3769', + 'info_dict': { + 'id': 'a3drtehyrg52', + 'title': 'NFL 2023 W04 DET@GB', + 'ext': 'mp4' }, }, { 'url': 'https://aparat.cam/n4d6dh0wvlpr', 'only_matching': True, }, { - 'url': 'https://wolfstream.tv/nthme29v9u2x', + 'url': 'https://uqload.to/ug5somm0ctnk.html', + 'only_matching': True, + }, { + 'url': 'https://highstream.tv/2owiyz3sjoux', + 'only_matching': True, + }, { + 'url': 'https://vedbam.xyz/6lnbkci96wly.html', 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r']+\bsrc=(["\'])(?P(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' - % '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]), - webpage)] + @classmethod + def _extract_urls(cls, webpage): + + def yield_urls(): + for regex in cls._EMBED_REGEX: + for mobj in re.finditer(regex, webpage): + yield mobj.group('url') + + return list(yield_urls()) def _real_extract(self, url): - host, video_id = re.match(self._VALID_URL, url).groups() + host, video_id = self._match_valid_url(url).group('host', 'id') - url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) + url = 'https://%s/%s' % ( + host, + 'embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) webpage = self._download_webpage(url, video_id) - - if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES): + container_div = get_element_by_id('container', webpage) or webpage + if self._search_regex( + r'>This server is in maintenance mode\.', container_div, + 'maint error', group=0, default=None): + raise ExtractorError(clean_html(container_div), expected=True) + if self._search_regex( + self._FILE_NOT_FOUND_REGEXES, container_div, + 'missing video error', group=0, default=None): raise ExtractorError('Video %s does not exist' % video_id, expected=True) fields = self._hidden_inputs(webpage) @@ -122,59 +226,43 @@ class XFileShareIE(InfoExtractor): 'Content-type': 'application/x-www-form-urlencoded', }) - title = (self._search_regex( - (r'style="z-index: [0-9]+;">([^<]+)', - r'([^<]+)', - r'h4-fine[^>]*>([^<]+)<', - r'>Watch (.+)[ <]', - r'

([^<]+)

', - r'

]*>([^<]+)<', # streamin.to - r'title\s*:\s*"([^"]+)"'), # govid.me - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None) or video_id).strip() - - for regex, func in ( - (r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes), - (r'(゚.+)', aa_decode)): - obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) - if obf_code: - webpage = webpage.replace(obf_code, func(obf_code)) - - formats = [] - - jwplayer_data = self._search_regex( - [ - r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);', - r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);', - ], webpage, - 'jwplayer data', default=None) - if jwplayer_data: - jwplayer_data = self._parse_json( - jwplayer_data.replace(r"\'", "'"), video_id, js_to_json) + title = ( + self._search_regex(self._TITLE_REGEXES, webpage, 'title', default=None) + or self._og_search_title(webpage, default=None) + or video_id).strip() + + obf_code = True + while obf_code: + for regex, func in ( + (r'(?s)(?).)+\)\))', + decode_packed_codes), + (r'(゚.+)', aa_decode)): + obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) + if obf_code: + webpage = webpage.replace(obf_code, func(obf_code)) + break + + jwplayer_data = self._find_jwplayer_data( + webpage.replace(r'\'', '\''), video_id) + result = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, + m3u8_id='hls', mpd_id='dash') + + if not traverse_obj(result, 'formats'): if jwplayer_data: - formats = self._parse_jwplayer_data( - jwplayer_data, video_id, False, - m3u8_id='hls', mpd_id='dash')['formats'] - - if not formats: - urls = [] - for regex in ( - r'(?:file|src)\s*:\s*(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', - r'file_link\s*=\s*(["\'])(?Phttp(?:(?!\1).)+)\1', - r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?Phttp(?:(?!\2).)+)\2\)', - r']+src=(["\'])(?Phttp(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): + self.report_warning( + 'Failed to extract JWPlayer formats', video_id=video_id) + urls = set() + for regex in self._SOURCE_URL_REGEXES: for mobj in re.finditer(regex, webpage): - video_url = mobj.group('url') - if video_url not in urls: - urls.append(video_url) + urls.add(mobj.group('url')) sources = self._search_regex( r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None) - if sources: - urls.extend(self._parse_json(sources, video_id)) + urls.update(traverse_obj(sources, (T(lambda s: self._parse_json(s, video_id)), Ellipsis))) formats = [] - for video_url in urls: + for video_url in traverse_obj(urls, (Ellipsis, T(url_or_none))): if determine_ext(video_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', @@ -185,17 +273,19 @@ class XFileShareIE(InfoExtractor): 'url': video_url, 'format_id': 'sd', }) - self._sort_formats(formats) + result = {'formats': formats} + + self._sort_formats(result['formats']) thumbnail = self._search_regex( - [ - r']+poster="([^"]+)"', - r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],', - ], webpage, 'thumbnail', default=None) + self._THUMBNAIL_REGEXES, webpage, 'thumbnail', default=None) + + if not (title or result.get('title')): + title = self._generic_title(url) or video_id - return { + return merge_dicts(result, { 'id': video_id, - 'title': title, + 'title': title or None, 'thumbnail': thumbnail, - 'formats': formats, - } + 'http_headers': {'Referer': url} + }) From aef24d97e9c50cd9db06349b2b25c7f623baf932 Mon Sep 17 00:00:00 2001 From: dirkf Date: Thu, 29 Feb 2024 14:12:37 +0000 Subject: [PATCH 6/8] [Videa] Align with yt-dlp --- youtube_dl/extractor/videa.py | 39 +++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index 4589e78a1..0689764a5 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -6,22 +6,31 @@ import re import string from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_ord, + compat_struct_pack, +) from ..utils import ( ExtractorError, int_or_none, mimetype2ext, parse_codecs, + parse_qs, update_url_query, urljoin, xpath_element, xpath_text, ) -from ..compat import ( - compat_b64decode, - compat_ord, - compat_struct_pack, - compat_urlparse, -) + + +def compat_random_choices(population, *args, **kwargs): + # weights=None, *, cum_weights=None, k=1 + # limited implementation needed here + weights = args[0] if args else kwargs.get('weights') + assert all(w is None for w in (weights, kwargs.get('cum_weights'))) + k = kwargs.get('k', 1) + return ''.join(random.choice(population) for _ in range(k)) class VideaIE(InfoExtractor): @@ -35,6 +44,7 @@ class VideaIE(InfoExtractor): ) (?P[^?#&]+) ''' + _EMBED_REGEX = [r']+src=(["\'])(?P(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1'] _TESTS = [{ 'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ', 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', @@ -80,11 +90,14 @@ class VideaIE(InfoExtractor): }] _STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r']+src=(["\'])(?P(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', - webpage)] + @classmethod + def _extract_urls(cls, webpage): + def yield_urls(): + for pattern in cls._EMBED_REGEX: + for m in re.finditer(pattern, webpage): + yield m.group('url') + + return list(yield_urls()) @staticmethod def rc4(cipher_text, key): @@ -130,8 +143,8 @@ class VideaIE(InfoExtractor): for i in range(0, 32): result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)] - query = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query) - random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8)) + query = parse_qs(player_url) + random_seed = ''.join(compat_random_choices(string.ascii_letters + string.digits, k=8)) query['_s'] = random_seed query['_t'] = result[:16] From 820fae3b3a8587a6f57afbe803b4f91de7d4e086 Mon Sep 17 00:00:00 2001 From: hatsomatt <143712404+hatsomatt@users.noreply.github.com> Date: Sat, 16 Sep 2023 16:02:37 +0200 Subject: [PATCH 7/8] [Videa] Fix extraction * update API URL * from https://github.com/yt-dlp/yt-dlp/pull/8003 * thanks to the authors! Closes yt-dlp/7427 Authored by: hatsomatt, aky-01 --- youtube_dl/extractor/videa.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index 0689764a5..194b4b011 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -54,6 +54,7 @@ class VideaIE(InfoExtractor): 'title': 'Az őrült kígyász 285 kígyót enged szabadon', 'thumbnail': r're:^https?://.*', 'duration': 21, + 'age_limit': 0, }, }, { 'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', @@ -64,6 +65,7 @@ class VideaIE(InfoExtractor): 'title': 'Supercars előzés', 'thumbnail': r're:^https?://.*', 'duration': 64, + 'age_limit': 0, }, }, { 'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ', @@ -74,6 +76,7 @@ class VideaIE(InfoExtractor): 'title': 'Az őrült kígyász 285 kígyót enged szabadon', 'thumbnail': r're:^https?://.*', 'duration': 21, + 'age_limit': 0, }, }, { 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', @@ -149,7 +152,7 @@ class VideaIE(InfoExtractor): query['_t'] = result[:16] b64_info, handle = self._download_webpage_handle( - 'http://videa.hu/videaplayer_get_xml.php', video_id, query=query) + 'http://videa.hu/player/xml', video_id, query=query) if b64_info.startswith(' Date: Tue, 12 Mar 2024 19:44:13 +0800 Subject: [PATCH 8/8] [Vimeo] Improve `config` extraction (#32742) * update for more robust json parsing --- youtube_dl/extractor/vimeo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8e1a805f6..47ec0a9b4 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -673,8 +673,8 @@ class VimeoIE(VimeoBaseInfoExtractor): raise if '//player.vimeo.com/video/' in url: - config = self._parse_json(self._search_regex( - r'(?s)\b(?:playerC|c)onfig\s*=\s*({.+?})\s*[;\n]', webpage, 'info section'), video_id) + config = self._search_json( + r'\b(?:playerC|c)onfig\s*=', webpage, 'info section', video_id) if config.get('view') == 4: config = self._verify_player_video_password( redirect_url, video_id, headers)