diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a56a7c52f..b3e2ef7c7 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -969,6 +969,7 @@ from .pluralsight import ( PluralsightIE, PluralsightCourseIE, ) +from .podchaser import PodchaserIE from .podomatic import PodomaticIE from .pokemon import PokemonIE from .polskieradio import ( diff --git a/youtube_dl/extractor/podchaser.py b/youtube_dl/extractor/podchaser.py new file mode 100644 index 000000000..83fa2246d --- /dev/null +++ b/youtube_dl/extractor/podchaser.py @@ -0,0 +1,114 @@ +# coding: utf-8 +from __future__ import unicode_literals +import json +import re +from ..utils import float_or_none, try_get, str_to_int, unified_timestamp, merge_dicts +from ..compat import compat_str +from .common import InfoExtractor + + +class PodchaserIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?podchaser\.com/ + (?: + (?:podcasts/[\w-]+-(?P[\d]+))) + (?:/episodes/[\w\-]+- + (?P[\d]+))?''' + + _TESTS = [{ + 'url': 'https://www.podchaser.com/podcasts/cum-town-36924/episodes/ep-285-freeze-me-off-104365585', + 'info_dict': { + 'id': '104365585', + 'title': "Ep. 285 – freeze me off", + 'description': 'cam ahn', + 'thumbnail': r're:^https?://.*\.jpg$', + 'ext': 'mp3', + 'categories': ['Comedy'], + 'tags': ['comedy', 'dark humor'], + 'series': 'Cum Town', + 'duration': 3708, + 'timestamp': 1636531259, + 'upload_date': '20211110' + } + }, { + 'url': 'https://www.podchaser.com/podcasts/the-bone-zone-28853', + 'info_dict': { + 'id': '28853', + 'title': 'The Bone Zone', + 'description': 'Podcast by The Bone Zone', + }, + 'playlist_count': 275 + }, { + 'url': 'https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes', + 'info_dict': { + 'id': '699349', + 'title': "Sean Carroll's Mindscape: Science, Society, Philosophy, Culture, Arts, and Ideas", + 'description': 'md5:2cbd8f4749891a84dc8235342e0b5ff1' + }, + 'playlist_mincount': 199 + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + audio_id, podcast_id = mobj.group('id'), mobj.group('podcast_id') + + # If one episode + if audio_id: + episodes = [self._download_json("https://api.podchaser.com/episodes/%s" % audio_id, audio_id)] + + # Else get every episode available + else: + total_episode_count = self._download_json( + "https://api.podchaser.com/list/episode", podcast_id, + headers={'Content-Type': 'application/json;charset=utf-8'}, + data=json.dumps({ + "filters": {"podcast_id": podcast_id} + }).encode()).get('total') + episodes = [] + print(total_episode_count) + for i in range(total_episode_count // 100 + 1): + curr_episodes_data = self._download_json( + "https://api.podchaser.com/list/episode", podcast_id, + headers={'Content-Type': 'application/json;charset=utf-8'}, + data=json.dumps({ + "start": i * 100, + "count": (i + 1) * 100, + "sort_order": "SORT_ORDER_RECENT", + "filters": { + "podcast_id": podcast_id + }, "options": {} + }).encode()) + curr_episodes = curr_episodes_data.get('entities') or [] + if len(curr_episodes) + len(episodes) <= total_episode_count: + episodes.extend(curr_episodes) + + podcast_data = merge_dicts( + self._download_json("https://api.podchaser.com/podcasts/%s" % podcast_id, audio_id or podcast_id) or {}, + episodes[0].get('podcast') or {} if episodes else {}) + + entries = [{ + 'id': compat_str(episode.get('id')), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'url': episode.get('audio_url'), + 'thumbnail': episode.get('image_url'), + 'duration': str_to_int(episode.get('length')), + 'timestamp': unified_timestamp(episode.get('air_date')), + 'rating': float_or_none(episode.get('rating')), + 'categories': [ + x.get('text') for x in + podcast_data.get('categories') + or try_get(podcast_data, lambda x: x['summary']['categories'], list) or []], + 'tags': [tag.get('text') for tag in podcast_data.get('tags') or []], + 'series': podcast_data.get('title'), + } for episode in episodes] + + if len(entries) > 1: + # Return playlist + return self.playlist_result( + entries, playlist_id=compat_str(podcast_data.get('id')), + playlist_title=podcast_data.get('title'), + playlist_description=podcast_data.get('description')) + + # Return episode + return entries[0]