From ddc080a562cce984ac4a86969f511b1ae59421bf Mon Sep 17 00:00:00 2001 From: df Date: Mon, 18 Oct 2021 15:54:26 +0100 Subject: [PATCH] Add ArteTVCategoryIE to support category playlists --- youtube_dl/extractor/arte.py | 47 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 48 insertions(+) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 03abdbfaf..5bfe57b10 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, int_or_none, qualities, + strip_or_none, try_get, unified_strdate, url_or_none, @@ -252,3 +253,49 @@ class ArteTVPlaylistIE(ArteTVBaseIE): title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') return self.playlist_result(entries, playlist_id, title, description) + + +class ArteTVCategoryIE(ArteTVBaseIE): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P%s)/videos/(?P[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/politics-and-society/', + 'info_dict': { + 'id': 'politics-and-society', + 'title': 'Politics and society', + 'description': 'Investigative documentary series, geopolitical analysis, and international commentary', + }, + 'playlist_mincount': 13, + }, + ] + + @classmethod + def suitable(cls, url): + return ( + not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) + and super(ArteTVCategoryIE, cls).suitable(url)) + + def _real_extract(self, url): + lang, playlist_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, playlist_id) + + items = [] + for video in re.finditer( + r']*?href\s*=\s*(?P"|\'|\b)(?Phttps?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang, + webpage): + video = video.group('url') + if video == url: + continue + if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): + items.append(video) + + if items: + title = (self._og_search_title(webpage, default=None) + or self._html_search_regex(r']*>([^<]+)', default=None)) + title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + + result = self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title) + if result: + description = self._og_search_description(webpage, default=None) + if description: + result['description'] = description + return result diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e70daf2b1..50b7cb4a0 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -71,6 +71,7 @@ from .arte import ( ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, + ArteTVCategoryIE, ) from .arnes import ArnesIE from .asiancrush import (