From 58babe9af79215bd6bdf07da0a8ebb1d3650e00b Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 30 Nov 2021 05:15:33 +0000 Subject: [PATCH] Support __INITIAL_DATA__ with stringified JSON Add test and fix test for bbcthreeConfig --- youtube_dl/extractor/bbc.py | 50 +++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 37d427a66..088af9823 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -12,6 +12,7 @@ from ..compat import ( compat_HTTPError, compat_parse_qs, compat_str, + compat_urllib_error, compat_urllib_parse_urlparse, compat_urlparse, ) @@ -395,9 +396,17 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_mpd_formats( href, programme_id, mpd_id=format_id, fatal=False)) elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) + # TODO: let expected_status be passed into _extract_xxx_formats() instead + try: + fmts = self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + except ExtractorError as e: + if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError) + and e.exc_info[1].code in (403, 404)): + raise + fmts = [] + formats.extend(fmts) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) @@ -775,21 +784,33 @@ class BBCIE(BBCCoUkIE): 'timestamp': 1437785037, 'upload_date': '20150725', }, + }, { + # video with window.__INITIAL_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/av/world-europe-59468682', + 'info_dict': { + 'id': 'p0b71qth', + 'ext': 'mp4', + 'title': 'Why France is making this woman a national hero', + 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1638230731, + 'upload_date': '20211130', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, }, { + # bbcthreeConfig 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', 'info_dict': { 'id': 'p06556y7', 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + 'title': 'Things Not To Say to people that live on council estates', + 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.", + 'duration': 360, + 'thumbnail': r're:https?://.+/.+\.jpg', }, - 'params': { - 'skip_download': True, - } }, { # window.__PRELOADED_STATE__ 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', @@ -1162,9 +1183,16 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) - initial_data = self._parse_json(self._search_regex( - r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, + 'quoted preload state', default=None) + if initial_data is None: + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage, + 'preload state', default={}) + else: + initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) + initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: def parse_media(media): if not media: