From cd2c7ab40ef72167914b2b71906bf53fd7f382ee Mon Sep 17 00:00:00 2001
From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com>
Date: Tue, 3 Sep 2019 22:28:30 +0200
Subject: [PATCH 1/2] [ardaudiothek] Add support for the ARD Audiothek

---
 youtube_dl/extractor/ardaudiothek.py | 305 +++++++++++++++++++++++++++
 youtube_dl/extractor/extractors.py   |   4 +
 2 files changed, 309 insertions(+)
 create mode 100644 youtube_dl/extractor/ardaudiothek.py

diff --git a/youtube_dl/extractor/ardaudiothek.py b/youtube_dl/extractor/ardaudiothek.py
new file mode 100644
index 000000000..d8af4890f
--- /dev/null
+++ b/youtube_dl/extractor/ardaudiothek.py
@@ -0,0 +1,305 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_str,
+    ExtractorError,
+    int_or_none,
+    parse_duration,
+    str_or_none,
+    try_get,
+    unified_strdate,
+    unified_timestamp,
+)
+
+
+class ARDAudiothekBaseIE(InfoExtractor):
+
+    def _extract_episode_info(self, title):
+        """Try to extract episode data from the title."""
+        res = {}
+        if not title:
+            return res
+
+        for pattern in [
+            r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
+            r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
+            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
+            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
+        ]:
+            m = re.match(pattern, title)
+            if m:
+                groupdict = m.groupdict()
+                for int_entry in ['season_number', 'episode_number']:
+                    res[int_entry] = int_or_none(groupdict.get(int_entry))
+
+                for str_entry in ['episode']:
+                    res[str_entry] = str_or_none(groupdict.get(str_entry))
+
+                # Build the episode title by removing the numeric episode
+                # information.
+                if groupdict.get('ep_info') and not res['episode']:
+                    res['episode'] = str_or_none(
+                        title.replace(groupdict.get('ep_info'), ''))
+
+                if res['episode']:
+                    res['episode'] = res['episode'].strip()
+
+                break
+
+        # As a fallback, use the whole title as the episode name
+        if not res.get('episode'):
+            res['episode'] = title.strip()
+
+        return res
+
+    def _extract_id_title_desc(self, json_data):
+        res = {
+            'id': try_get(json_data, lambda x: x['id'], compat_str),
+            'display_id': try_get(json_data, lambda x: x['slug'], compat_str),
+        }
+        res['title'] = try_get(
+            json_data, lambda x: x['title'], compat_str)
+        res['description'] = try_get(
+            json_data, lambda x: x['summary'], compat_str)
+        return res
+
+    def _extract_episode(self, ep_data):
+        res = self._extract_id_title_desc(ep_data)
+
+        res['url'] = try_get(ep_data, [
+            lambda x: x['enclosure']['download_url'],
+            lambda x: x['enclosure']['playback_url'],
+            lambda x: x['guid'],
+        ], compat_str)
+        if not res['url']:
+            raise ExtractorError(msg='Could not find any downloads',
+                                 expected=True)
+
+        res['format_note'] = try_get(
+            ep_data, lambda x: x['enclosure']['type'], compat_str)
+        res['duration'] = parse_duration(
+            try_get(ep_data, lambda x: x['duration'], compat_str))
+        res['release_date'] = unified_strdate(
+            try_get(ep_data, lambda x: x['publication_date'], compat_str))
+        res['timestamp'] = unified_timestamp(
+            try_get(ep_data, lambda x: x['publication_date'], compat_str))
+        res['channel'] = try_get(ep_data, [
+            lambda x: x['podcast']['station'],
+            lambda x: x['podcast']['organization_name'],
+        ], compat_str)
+
+        # 'sharing_url' might be a redirecting URL. The generic extractor
+        # will handle the redirect just fine, so this extractor will still
+        # end up being used.
+        res['webpage_url'] = try_get(
+            ep_data, lambda x: x['sharing_url'], compat_str)
+
+        res['categories'] = [
+            try_get(ep_data, lambda x: x['podcast']['category'], compat_str),
+        ]
+
+        res['is_live'] = False
+
+        res['series'] = try_get(ep_data,
+                                lambda x: x['podcast']['title'],
+                                compat_str)
+
+        def make_thumbnail(url, thumb_id, preference):
+            # Note that the images don't necessarily have the advertised
+            # aspect ratio! So don't set the height based on the aspect
+            # ratio.
+            # Also note that the server will not return an image at just any
+            # requested width. Most multiples of 32 (or of 64 for larger
+            # values) seem to work. When requesting a width of 1080, the
+            # server returns an image with a width of 1024, for instance.
+            # Requesting 1400 gives us 1344, and so on. So a width of 1920
+            # works best for both 1x1 and 16x9 images.
+            thumb_width = 1920
+            return {
+                'id': thumb_id,
+                # Only set the width if we actually replace the {width}
+                # placeholder in the URL.
+                'width': thumb_width if '{width}' in url else None,
+                'url': url.replace('{width}', compat_str(thumb_width)),
+                'preference': preference,
+            }
+
+        # We prefer 1x1 images and we prefer episode images, but we still
+        # provide all available images so that the user can choose. We use
+        # the thumbnail's 'preference' entry to sort them (the higher the
+        # better). The preferred thumbnail order is:
+        #     (0) podcast-16x9 < (1) episode-16x9
+        #         < (2) podcast-1x1 < (3) episode-1x1
+        thumbnails = []
+        for ar_index, aspect_ratio in enumerate(['16x9', '1x1']):
+            image_key = 'image_%s' % aspect_ratio
+            image_sources = [
+                {'name': 'podcast',
+                 'access': lambda x: x['podcast'][image_key]},
+                {'name': 'episode',
+                 'access': lambda x: x[image_key]},
+            ]
+            for src_index, src in enumerate(image_sources):
+                thumb_url = try_get(ep_data, src['access'], compat_str)
+
+                if thumb_url:
+                    thumbnails.append(make_thumbnail(
+                        thumb_url,
+                        src['name'] + '-' + aspect_ratio,
+                        ar_index * len(image_sources) + src_index))
+        res['thumbnails'] = thumbnails
+
+        res.update(self._extract_episode_info(res.get('title')))
+
+        return res
+
+
+class ARDAudiothekIE(ARDAudiothekBaseIE):
+    _VALID_URL = r'https?://(?:www\.|beta\.)?ardaudiothek\.de/(?:[^/]+)/(?:[^/]+)/(?P<id>[0-9]+)(?:/.*)?'
+    _TESTS = [{
+        'url': 'https://www.ardaudiothek.de/hoerspiel-pool/virginia-woolf-zum-leuchtturm-1-3-die-tuer-aus-glas/53728640',
+        'md5': 'dc12a86bb46faadbdba7a8c9b5a24246',
+        'info_dict': {
+            'id': '53728640',
+            'ext': 'mp3',
+            'title': 'Virginia Woolf: Zum Leuchtturm (1/3) - Die Tür aus Glas',
+            'description': r're:^Am Anfang steht die Frage.*',
+            'thumbnail': compat_str,
+            'timestamp': 1478818860,
+            'upload_date': '20161110',
+        }
+    }, {
+        'url': 'https://www.ardaudiothek.de/eine-stunde-talk/soziologe-matthias-quent-nicht-neutral-gegenueber-rechtsradikalismus/65904422',
+        'md5': '326065e45e8172124165c3b0addd4553',
+        'info_dict': {
+            'id': '65904422',
+            'ext': 'mp3',
+            'title': 'Soziologe Matthias Quent - Nicht neutral gegenüber Rechtsradikalismus',
+            'description': r're:^Matthias Quent erforscht die Ziele.*',
+            'thumbnail': compat_str,
+            'timestamp': 1565809200,
+            'upload_date': '20190814',
+        }
+    }]
+
+    def _real_extract(self, url):
+        episode_id = self._match_id(url)
+
+        api_url = 'https://www.ardaudiothek.de/api/episodes/%s' % episode_id
+        result_data = self._download_json(api_url, episode_id, fatal=False)
+        ep_data = try_get(result_data, lambda x: x['result']['episode'], dict)
+
+        if not ep_data:
+            raise ExtractorError(msg="Could not find any episode data",
+                                 expected=True)
+
+        return self._extract_episode(ep_data)
+
+
+class ARDAudiothekPlaylistIE(ARDAudiothekBaseIE):
+    _VALID_URL = r'https?://(?:www\.|beta\.)?ardaudiothek\.de/(?!kategorie)(?:[^/]+)/(?P<id>[0-9]+)(?:/.*)?'
+    _TESTS = [{
+        'url': 'https://www.ardaudiothek.de/wirtschaft/62037362',
+        'info_dict': {
+            'id': '62037362',
+            'title': 'Wirtschaft',
+            'description': compat_str,
+        },
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://www.ardaudiothek.de/redezeit/7852070',
+        'info_dict': {
+            'id': '7852070',
+            'title': 'Redezeit',
+            'description': compat_str,
+        },
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://www.ardaudiothek.de/nur-fuer-starke-nerven-krimis-und-thriller/51581650/alle',
+        'info_dict': {
+            'id': '51581650',
+            'title': r're:^Nur für starke Nerven',
+            'description': compat_str,
+        },
+        'playlist_mincount': 5,
+    }]
+
+    def _extract_episodes(self, podcast_id, n_entries):
+        # items_per_page works from 1 up to 2147483647 (2^31 - 1).
+        # The website calls the API with items_per_page set to 24. Setting it
+        # to 500 or 1000 would download the data of all episodes in one or two
+        # pages. Increasing this value might however trigger server errors in
+        # the future. So to avoid any problems we will keep using the default
+        # value and just download a few more pages.
+        items_per_page = 24
+
+        page = 1
+
+        api_url_template = 'https://www.ardaudiothek.de/api/podcasts/{}/episodes?items_per_page={}{}'
+        entries = []
+        while True:
+            # The API sometimes returns 404s for page=1. So only add that
+            # parameter if we actually have paginated content.
+            page_str = '&page=' + compat_str(page) if page > 1 else ''
+            api_url = api_url_template.format(podcast_id,
+                                              items_per_page,
+                                              page_str)
+            result_data = self._download_json(api_url, podcast_id, fatal=False)
+
+            episodes = try_get(result_data,
+                               lambda x: x['result']['episodes'],
+                               list)
+            if episodes is None:
+                break
+
+            for episode in episodes:
+                entries.append(self._extract_episode(episode))
+
+            # Check if we're done
+            if len(entries) >= n_entries:
+                break
+
+            # Sanity check, just in case
+            meta_total = try_get(result_data,
+                                 lambda x:
+                                 x['result']['meta']['episodes']['total'],
+                                 (int, float))
+            meta_pages = try_get(result_data,
+                                 lambda x:
+                                 x['result']['meta']['episodes']['pages'],
+                                 (int, float))
+            if not meta_total or not meta_pages:
+                break
+
+            page += 1
+
+        return entries
+
+    def _real_extract(self, url):
+        podcast_id = self._match_id(url)
+
+        api_url = 'https://www.ardaudiothek.de/api/podcasts/%s' % podcast_id
+        result_data = self._download_json(api_url, podcast_id, fatal=False)
+        pc_data = try_get(result_data, lambda x: x['result']['podcast'], dict)
+
+        if not pc_data:
+            raise ExtractorError(msg="Could not find any playlist data",
+                                 expected=True)
+
+        n_entries = try_get(pc_data,
+                            lambda x: x['number_of_elements'],
+                            (int, float))
+
+        res = self._extract_id_title_desc(pc_data)
+        res['_type'] = 'playlist'
+        res['entries'] = self._extract_episodes(podcast_id, n_entries)
+
+        if n_entries > len(res['entries']):
+            self.to_screen('Only received {} of {} reported episode IDs'
+                           .format(len(res['entries']), n_entries))
+
+        return res
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 4adcae1e5..64d8130af 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -57,6 +57,10 @@ from .ard import (
     ARDIE,
     ARDMediathekIE,
 )
+from .ardaudiothek import (
+    ARDAudiothekIE,
+    ARDAudiothekPlaylistIE,
+)
 from .arte import (
     ArteTVPlus7IE,
     ArteTVEmbedIE,

From d7c2b5dac896bd698b0ce923d6d3ed10caf03bc8 Mon Sep 17 00:00:00 2001
From: TinyToweringTree <54483833+TinyToweringTree@users.noreply.github.com>
Date: Sun, 8 Sep 2019 21:37:43 +0200
Subject: [PATCH 2/2] [ardaudiothek] Add an extractor for search results

---
 youtube_dl/extractor/ardaudiothek.py | 224 +++++++++++++++++++++------
 youtube_dl/extractor/extractors.py   |   1 +
 2 files changed, 180 insertions(+), 45 deletions(-)

diff --git a/youtube_dl/extractor/ardaudiothek.py b/youtube_dl/extractor/ardaudiothek.py
index d8af4890f..d856b5f49 100644
--- a/youtube_dl/extractor/ardaudiothek.py
+++ b/youtube_dl/extractor/ardaudiothek.py
@@ -3,6 +3,14 @@ from __future__ import unicode_literals
 
 import re
 
+try:
+    from urllib.parse import unquote as _unquote_compat
+except ImportError:
+    from urllib import unquote
+
+    def _unquote_compat(s):
+        return unquote(s.encode('utf-8')).decode('utf-8')
+
 from .common import InfoExtractor
 from ..utils import (
     compat_str,
@@ -76,7 +84,7 @@ class ARDAudiothekBaseIE(InfoExtractor):
             lambda x: x['guid'],
         ], compat_str)
         if not res['url']:
-            raise ExtractorError(msg='Could not find any downloads',
+            raise ExtractorError(msg='Could not find a URL to download',
                                  expected=True)
 
         res['format_note'] = try_get(
@@ -228,55 +236,132 @@ class ARDAudiothekPlaylistIE(ARDAudiothekBaseIE):
         'playlist_mincount': 5,
     }]
 
-    def _extract_episodes(self, podcast_id, n_entries):
-        # items_per_page works from 1 up to 2147483647 (2^31 - 1).
-        # The website calls the API with items_per_page set to 24. Setting it
-        # to 500 or 1000 would download the data of all episodes in one or two
-        # pages. Increasing this value might however trigger server errors in
-        # the future. So to avoid any problems we will keep using the default
-        # value and just download a few more pages.
-        items_per_page = 24
-
+    def _get_page_str(self, page):
+        # The API sometimes returns 404s for page=1. So only add that
+        # parameter if we are actually past the first page.
+        return '&page=' + compat_str(page) if page > 1 else ''
+
+    def _get_episode_from_array_entry(self, array_entry):
+        # The array entry already is an 'episode' dict.
+        return array_entry
+
+    def _extract_episodes(
+            self, display_id, api_url_template, default_items_per_page):
+        """
+        Extract episodes by calling a web API endpoint.
+
+        Sometimes the server does not respond properly when requesting a
+        page. This also happens on the website, which sometimes hangs when
+        trying to load more search results. The number of entries reported
+        by the API is often wrong as well, so we do not rely solely on that
+        number to stop reading episodes.
+
+        This function handles paginated content in a robust way by skipping
+        over faulty server responses, reducing the page size to recover as
+        many episodes as possible. It also removes duplicate entries from
+        the result.
+
+        Args:
+            display_id: Only used for user feedback.
+            api_url_template: The URL of the API to download JSON data
+                from. It is a format string expected to have the following
+                fields:
+                    - {items_per_page}
+                    - {page_str}
+            default_items_per_page: The number of items to fetch per page.
+                It is best to set this to the same value that the website
+                uses when accessing the API. This function automatically
+                reduces the number of items per page when the server
+                responds with errors or missing data.
+
+        Returns:
+            A list of extracted episode dicts to be used as playlist
+            entries.
+
+        Raises:
+            ExtractorError: Might be raised when extracting episode data.
+
+        """
+        items_per_page = default_items_per_page
         page = 1
 
-        api_url_template = 'https://www.ardaudiothek.de/api/podcasts/{}/episodes?items_per_page={}{}'
         entries = []
-        while True:
-            # The API sometimes returns 404s for page=1. So only add that
-            # parameter if we actually have paginated content.
-            page_str = '&page=' + compat_str(page) if page > 1 else ''
-            api_url = api_url_template.format(podcast_id,
-                                              items_per_page,
-                                              page_str)
-            result_data = self._download_json(api_url, podcast_id, fatal=False)
+
+        # The number of entries as reported by the API
+        n_entries = None
+
+        # The API sometimes returns an empty page without any episodes. In
+        # this case the next page often does have episodes. This, however,
+        # means that comparing the number of collected entries against the
+        # total reported by the API is not a reliable stopping condition.
+        # So we deal with this by not stopping at the first occurrence of an
+        # empty page; instead we skip over a certain number of empty pages
+        # before giving up.
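+        # With the default page size of 24, for example, up to 27
+        # consecutive pages may come back empty before the loop aborts.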
+        max_n_skipped_pages = default_items_per_page + 3
+        n_skipped_pages = 0
+
+        while True:
+            # We need this to check if we actually added any entries
+            n_entries_before_this_page = len(entries)
+
+            # Fetch data
+            api_url = api_url_template.format(
+                page_str=self._get_page_str(page),
+                items_per_page=items_per_page)
+            result_data = self._download_json(api_url, display_id, fatal=False)
 
             episodes = try_get(result_data,
                                lambda x: x['result']['episodes'],
                                list)
-            if episodes is None:
-                break
 
-            for episode in episodes:
-                entries.append(self._extract_episode(episode))
-
-            # Check if we're done
-            if len(entries) >= n_entries:
+            # Add entries
+            for episode in episodes or []:
+                entry = self._extract_episode(
+                    self._get_episode_from_array_entry(episode))
+                if entry not in entries:
+                    entries.append(entry)
+
+            # Fetch how many episodes the API says it has (it's enough to
+            # read it once)
+            n_entries = n_entries if n_entries is not None else try_get(
+                result_data,
+                lambda x: x['result']['meta']['episodes']['total'],
+                int)
+
+            # Check if we have read the reported number of episodes
+            if n_entries is not None and len(entries) >= n_entries:
                 break
 
-            # Sanity check, just in case
-            meta_total = try_get(result_data,
-                                 lambda x:
-                                 x['result']['meta']['episodes']['total'],
-                                 (int, float))
-            meta_pages = try_get(result_data,
-                                 lambda x:
-                                 x['result']['meta']['episodes']['pages'],
-                                 (int, float))
-            if not meta_total or not meta_pages:
-                break
+            # Check if we actually added any entries
+            if n_entries_before_this_page == len(entries):
+                # This was an empty page, so we have to skip it
+                n_skipped_pages += 1
+                if n_skipped_pages >= max_n_skipped_pages:
+                    # Enough skipping, give up
+                    break
+
+                # Throttle by reading only half as many entries as before
+                if items_per_page > 1:
+                    new_items_per_page = int(max(1, items_per_page / 2))
+                    page = int((page - 1) * items_per_page /
+                               new_items_per_page)
+                    items_per_page = new_items_per_page
+            else:
+                # This page had episodes, so we're no longer skipping
+                n_skipped_pages = 0
+
+                # Try to get back to full speed by restoring the default
+                # items_per_page value if possible.
+                if items_per_page * page % default_items_per_page == 0:
+                    page = int(page * items_per_page /
+                               default_items_per_page)
+                    items_per_page = default_items_per_page
 
             page += 1
 
+        # Tell the user if we received fewer entries than the API reported
+        if n_entries is not None and len(entries) < n_entries:
+            self.to_screen('Received {} of {} reported episodes'.format(
+                len(entries), n_entries))
+
         return entries
 
     def _real_extract(self, url):
@@ -290,16 +375,65 @@ class ARDAudiothekPlaylistIE(ARDAudiothekBaseIE):
             raise ExtractorError(msg="Could not find any playlist data",
                                  expected=True)
 
-        n_entries = try_get(pc_data,
-                            lambda x: x['number_of_elements'],
-                            (int, float))
-
         res = self._extract_id_title_desc(pc_data)
         res['_type'] = 'playlist'
-        res['entries'] = self._extract_episodes(podcast_id, n_entries)
 
-        if n_entries > len(res['entries']):
-            self.to_screen('Only received {} of {} reported episode IDs'
-                           .format(len(res['entries']), n_entries))
+        # items_per_page works from 1 up to 2147483647 (2^31 - 1).
+        # The website calls the API with items_per_page set to 24. Setting it
+        # to 500 or 1000 would download the data of all episodes in one or two
+        # pages. Increasing this value might however trigger server errors in
+        # the future. So to avoid any problems we will keep using the default
+        # value and just download a few more pages.
+        res['entries'] = self._extract_episodes(
+            podcast_id,
+            'https://www.ardaudiothek.de/api/podcasts/%s/episodes?items_per_page={items_per_page}{page_str}' % podcast_id,
+            24)
 
         return res
+
+
+class ARDAudiothekSearchIE(ARDAudiothekPlaylistIE):
+    _VALID_URL = r'https?://(?:www\.|beta\.)?ardaudiothek\.de/suche\?(?:(?!q=).*&)?q=(?P<id>[^&]+)(?:&.*)?'
+    _TESTS = [{
+        'url': 'https://www.ardaudiothek.de/suche?q=Sommer',
+        'info_dict': {
+            'id': 'Sommer',
+            'title': 'Sommer',
+            'description': compat_str,
+        },
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://www.ardaudiothek.de/suche?q=Angela%20Merkel',
+        'info_dict': {
+            'id': 'Angela%20Merkel',
+            'title': 'Angela Merkel',
+            'description': compat_str,
+        },
+        'playlist_mincount': 5,
+    }]
+
+    def _get_page_str(self, page):
+        # The search API always works with a page number
+        return '&page=' + compat_str(page)
+
+    def _get_episode_from_array_entry(self, array_entry):
+        # The array entry is a dict with an 'episode' and a 'search_meta'
+        # entry
+        return try_get(array_entry, lambda x: x['episode'], dict)
+
+    def _real_extract(self, url):
+        search_str = self._match_id(url)
+        display_str = _unquote_compat(search_str)
+
+        return {
+            '_type': 'playlist',
+            'id': search_str,
+            'display_id': display_str,
+            'title': display_str,
+            'description': 'ARD Audiothek-Suche nach "' + display_str + '"',
+            # Searching on the website calls the API with items_per_page set
+            # to 8. Other values sometimes cause server errors.
+            'entries': self._extract_episodes(
+                display_str,
+                'https://www.ardaudiothek.de/api/search/%s?focus=episodes{page_str}&items_per_page={items_per_page}' % search_str,
+                8),
+        }
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 64d8130af..933451d4d 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -60,6 +60,7 @@ from .ard import (
 from .ardaudiothek import (
     ARDAudiothekIE,
     ARDAudiothekPlaylistIE,
+    ARDAudiothekSearchIE,
 )
 from .arte import (
     ArteTVPlus7IE,
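
A note on the throttling arithmetic in _extract_episodes: when a page yields
no new entries, the page size is halved and the page number is remapped so
that the next request (after the loop's trailing 'page += 1') starts exactly
at the first item that has not been read yet. A standalone sketch of that
remapping, in plain Python and independent of youtube-dl (the helper name
remap_after_empty_page is illustrative only):

    def remap_after_empty_page(page, items_per_page):
        # Mirrors the throttling branch: halve the page size, then pick the
        # page whose first item is the first unread one. The loop's trailing
        # 'page += 1' is folded in here.
        new_items_per_page = int(max(1, items_per_page / 2))
        new_page = int((page - 1) * items_per_page / new_items_per_page)
        return new_page + 1, new_items_per_page

    # Pages 1 and 2 with 24 items each covered items 0..47; page 3 came back
    # empty. The retry uses 12 items per page starting at page 5, whose first
    # item is number 48, exactly where reading stopped.
    page, items_per_page = remap_after_empty_page(3, 24)
    assert (page, items_per_page) == (5, 12)
    assert (page - 1) * items_per_page == 48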