From 991ac059a4b4cef57fa3b2f9799ce21a933e5c84 Mon Sep 17 00:00:00 2001
From: Sepehr Kalanaki
Date: Thu, 15 Dec 2022 19:35:03 +0330
Subject: [PATCH 1/2] [invidious] Add new extractor

---
Review notes (ignored by git, not part of the commit message):
- Both _VALID_URL patterns contained `(?P.+)`, which does not compile; the
  `id` group that _match_id() needs is restored and now stops at `&`/`#` so
  extra query parameters are not taken as part of the ID.
- `global webpage` inside the nested helper pointed at a module global that
  never exists (NameError on first use); replaced by a one-element list cache.
- Removed Python-3-only constructs: the dict `|` operator (3.9+) and
  zero-argument super(). urlparse now comes from compat_urllib_parse_urlparse.
- channel_url no longer raises TypeError when the API omits `authorUrl`.
- _get_additional_format_data no longer relies on swallowed NameErrors, and
  the two identical loops in _get_formats are merged.
- `# coding: utf-8` added because 2/2 introduces non-ASCII test strings.
- The `index` blob hashes below are carried over from the original series and
  no longer match the revised content; they are informational only.
- The author e-mail address is missing from the From: headers of this copy
  and probably has to be restored before `git am` will accept the series.

 youtube_dl/extractor/extractors.py |   1 +
 youtube_dl/extractor/invidious.py  | 189 +++++++++++++++++++++++++++++
 2 files changed, 190 insertions(+)
 create mode 100644 youtube_dl/extractor/invidious.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 947cbe8fd..44a178bd0 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1667,3 +1667,4 @@ from .zingmp3 import (
 )
 from .zoom import ZoomIE
 from .zype import ZypeIE
+from .invidious import InvidiousIE, InvidiousPlaylistIE
diff --git a/youtube_dl/extractor/invidious.py b/youtube_dl/extractor/invidious.py
new file mode 100644
index 000000000..430f5a7af
--- /dev/null
+++ b/youtube_dl/extractor/invidious.py
@@ -0,0 +1,189 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
+
+INSTANCES = [
+    'y.com.sb',
+    'yt.artemislena.eu'
+]
+
+INSTANCES_HOST_REGEX = '(?:' + '|'.join([instance.replace('.', r'\.') for instance in INSTANCES]) + ')'
+
+
+class InvidiousIE(InfoExtractor):
+    _VALID_URL = r'(?:https?://)?(?:www\.)?' + INSTANCES_HOST_REGEX + r'/watch\?v=(?P<id>[^&#]+)'
+    _TEST = {
+        'url': 'https://y.com.sb/watch?v=xKTygGa6hg0',
+        'md5': 'fb95b1847f5a90af14fda5273fc84fc0',
+        'info_dict': {
+            'id': 'xKTygGa6hg0',
+            'ext': 'mp4',
+            'title': 'Coding in C++ - Creating a Player Controller - CRYENGINE Summer Academy S1E5 - [Tutorial]',
+            'uploader': 'CRYENGINE',
+            'uploader_id': 'UCtaXcIVFp8HEpthm7qwtKCQ',
+            'description': 'md5:7aa75816d40ffccdbf3e15a90b05fca3',
+        }
+    }
+
+    def __init__(self, downloader=None):
+        super(InvidiousIE, self).__init__(downloader)
+
+    @staticmethod
+    def _get_additional_format_data(format_type, bitrate, resolution, fps):
+        """Derive 'ext' and a display 'resolution' from one API format entry.
+
+        format_type is expected to look like '<type>/<ext>; codecs="..."',
+        where <type> is either 'video' or 'audio' and <ext> is the file
+        extension. Fields that are missing or malformed are tolerated: the
+        step that needs them is skipped instead of raising.
+        """
+        out = {}
+        # Stay None when format_type cannot be parsed, so later steps are skipped
+        type_and_ext = type_ = None
+
+        try:
+            type_and_ext, codecs = format_type.split(';')
+        except (AttributeError, ValueError):
+            pass
+
+        try:
+            type_, ext = type_and_ext.split('/')
+            out['ext'] = ext
+            # TODO: codecs.split('"')[1] could populate acodec/vcodec
+        except (AttributeError, ValueError):
+            pass
+
+        try:
+            bitrate = float(bitrate) / 1000
+            # TODO: this could populate abr/vbr/tbr
+        except (TypeError, ValueError, OverflowError):
+            pass
+
+        try:
+            if type_ == 'audio':
+                out['resolution'] = type_and_ext + ' @ ' + str(bitrate) + 'k - audio only'
+            elif type_ == 'video':
+                out['resolution'] = resolution + ' - ' + type_and_ext + ' @ ' + str(fps) + 'fps - video only'
+        except TypeError:
+            pass
+
+        return out
+
+    def _patch_url(self, url):
+        """Return url with its host replaced by the Invidious instance in use."""
+        return compat_urllib_parse_urlparse(url)._replace(netloc=self.url_netloc).geturl()
+
+    def _get_formats(self, api_response):
+        """Build the formats list from the API response's format entries."""
+        all_formats = []
+
+        # 'adaptiveFormats' are video/audio only, 'formatStreams' have both
+        for key in ('adaptiveFormats', 'formatStreams'):
+            for format_ in api_response.get(key) or []:
+                format_info = {
+                    'url': self._patch_url(format_['url']),
+                    'format_id': format_.get('itag'),
+                }
+                format_info.update(InvidiousIE._get_additional_format_data(
+                    format_.get('type'), format_.get('bitrate'),
+                    format_.get('resolution'), format_.get('fps')))
+                all_formats.append(format_info)
+
+        return all_formats
+
+    def _get_thumbnails(self, api_response):
+        """Build the thumbnails list; earlier API entries get a higher 'quality'."""
+        thumbnails = []
+        video_thumbnails = api_response.get('videoThumbnails') or []
+
+        for inversed_quality, thumbnail in enumerate(video_thumbnails):
+            thumbnails.append({
+                'id': thumbnail.get('quality'),
+                'url': thumbnail.get('url'),
+                'quality': len(video_thumbnails) - inversed_quality,
+                'width': thumbnail.get('width'),
+                'height': thumbnail.get('height')
+            })
+
+        return thumbnails
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        # One-element cache for the watch page, filled on first use; a list
+        # because Python 2 has no `nonlocal` to rebind a plain variable
+        webpage_cache = []
+
+        # host_url will contain `http[s]://example.com` where `example.com` is the used invidious instance.
+        url_parsed = compat_urllib_parse_urlparse(url)
+        self.url_netloc = url_parsed.netloc
+        host_url = url_parsed.scheme + '://' + url_parsed.netloc
+
+        api_response = self._download_json(host_url + '/api/v1/videos/' + video_id, video_id)
+
+        def download_webpage_and(fn, fatal=True):
+            # Fetch the watch page lazily, at most once, and pass it to fn
+            if not webpage_cache:
+                webpage_cache.append(self._download_webpage(url, video_id, fatal=fatal))
+            return fn(webpage_cache[0])
+
+        author_url = api_response.get('authorUrl')
+
+        out = {
+            'id': video_id,
+            'title': api_response.get('title') or download_webpage_and(self._og_search_title),
+            'description': api_response.get('description') or download_webpage_and(self._og_search_description),
+
+            'release_timestamp': api_response.get('published'),
+
+            'uploader': api_response.get('author'),
+            'uploader_id': api_response.get('authorId'),
+            'channel': api_response.get('author'),
+            'channel_id': api_response.get('authorId'),
+            # authorUrl is appended to host_url; leave unset when the API omits it
+            'channel_url': host_url + author_url if author_url else None,
+
+            'duration': api_response.get('lengthSeconds'),
+
+            'view_count': api_response.get('viewCount'),
+            'like_count': api_response.get('likeCount'),
+            'dislike_count': api_response.get('dislikeCount'),
+
+            'tags': api_response.get('keywords'),
+            'is_live': api_response.get('liveNow'),
+
+            'formats': self._get_formats(api_response),
+            'thumbnails': self._get_thumbnails(api_response)
+        }
+
+        # Videos the API explicitly marks as not family friendly are treated as 18+
+        if api_response.get('isFamilyFriendly') is False:
+            out['age_limit'] = 18
+
+        return out
+
+
+class InvidiousPlaylistIE(InfoExtractor):
+    _VALID_URL = r'(?:https?://)?(?:www\.)?' + INSTANCES_HOST_REGEX + r'/playlist\?list=(?P<id>[^&#]+)'
+
+    def _get_entries(self, api_response):
+        out = []
+        for video in api_response['videos']:
+            out.append(InvidiousIE(self._downloader)._real_extract(self.host_url + '/watch?v=' + video['videoId']))
+        return out
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        # host_url will contain `http[s]://example.com` where `example.com` is the used invidious instance.
+        url_parsed = compat_urllib_parse_urlparse(url)
+        self.host_url = url_parsed.scheme + '://' + url_parsed.netloc
+
+        api_response = self._download_json(self.host_url + '/api/v1/playlists/' + playlist_id, playlist_id)
+        return dict(
+            self.playlist_result(self._get_entries(api_response), playlist_id, api_response.get('title'), api_response.get('description')),
+            release_timestamp=api_response.get('updated'),
+            uploader=api_response.get('author'),
+            uploader_id=api_response.get('authorId'),
+        )

From a784be7395cbb0426b8111ca126e73fe4cf653cc Mon Sep 17 00:00:00 2001
From: Sepehr Kalanaki
Date: Thu, 15 Dec 2022 20:04:39 +0330
Subject: [PATCH 2/2] [invidious] Add more tests

---
Review notes (ignored by git, not part of the commit message):
- Hunk positions and context lines are updated to match the revised 1/2;
  the added and removed lines are unchanged.

 youtube_dl/extractor/invidious.py | 61 +++++++++++++++++++++++--------
 1 file changed, 45 insertions(+), 16 deletions(-)

diff --git a/youtube_dl/extractor/invidious.py b/youtube_dl/extractor/invidious.py
index 430f5a7af..e97c5b9a6 100644
--- a/youtube_dl/extractor/invidious.py
+++ b/youtube_dl/extractor/invidious.py
@@ -14,18 +14,37 @@ INSTANCES_HOST_REGEX = '(?:' + '|'.join([instance.replace('.', r'\.') for instan
 
 class InvidiousIE(InfoExtractor):
     _VALID_URL = r'(?:https?://)?(?:www\.)?' + INSTANCES_HOST_REGEX + r'/watch\?v=(?P<id>[^&#]+)'
-    _TEST = {
-        'url': 'https://y.com.sb/watch?v=xKTygGa6hg0',
-        'md5': 'fb95b1847f5a90af14fda5273fc84fc0',
-        'info_dict': {
-            'id': 'xKTygGa6hg0',
-            'ext': 'mp4',
-            'title': 'Coding in C++ - Creating a Player Controller - CRYENGINE Summer Academy S1E5 - [Tutorial]',
-            'uploader': 'CRYENGINE',
-            'uploader_id': 'UCtaXcIVFp8HEpthm7qwtKCQ',
-            'description': 'md5:7aa75816d40ffccdbf3e15a90b05fca3',
-        }
-    }
+    _TESTS = [
+        {
+            'url': 'https://y.com.sb/watch?v=xKTygGa6hg0',
+            'info_dict': {
+                'id': 'xKTygGa6hg0',
+                'ext': 'mp4',
+                'title': 'Coding in C++ - Creating a Player Controller - CRYENGINE Summer Academy S1E5 - [Tutorial]',
+                'uploader': 'CRYENGINE',
+                'uploader_id': 'UCtaXcIVFp8HEpthm7qwtKCQ',
+                'description': 'md5:7aa75816d40ffccdbf3e15a90b05fca3',
+            }
+        },
+        {
+            'url': 'https://yt.artemislena.eu/watch?v=BaW_jenozKc',
+            'md5': '5515885fed58607bfae88f7d2090bc93',
+            'info_dict': {
+                'id': 'BaW_jenozKc',
+                'ext': 'mp4',
+                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+                'uploader': 'Philipp Hagemeister',
+                'uploader_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
+                'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
+                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+                'tags': ['youtube-dl'],
+                'duration': 10,
+                'view_count': int,
+                'like_count': int,
+                'dislike_count': int,
+            }
+        },
+    ]
 
     def __init__(self, downloader=None):
         super(InvidiousIE, self).__init__(downloader)
@@ -166,12 +185,22 @@ class InvidiousIE(InfoExtractor):
 
 class InvidiousPlaylistIE(InfoExtractor):
     _VALID_URL = r'(?:https?://)?(?:www\.)?' + INSTANCES_HOST_REGEX + r'/playlist\?list=(?P<id>[^&#]+)'
+    _TEST = {
+        'url': 'https://yt.artemislena.eu/playlist?list=PLowKtXNTBypGqImE405J2565dvjafglHU',
+        'md5': 'de4a9175071169961fe7cf2b6740da12',
+        'info_dict': {
+            'id': 'HyznrdDSSGM',
+            'ext': 'mp4',
+            'title': '8-bit computer update',
+            'uploader': 'Ben Eater',
+            'uploader_id': 'UCS0N5baNlQWJCUrhCEo8WlA',
+            'description': 'An update on my plans to build another 8-bit computer from scratch and make videos of the whole process! Buy a kit and build your own! https://eater.net/8bit/kits\n\nSupport me on Patreon: https://www.patreon.com/beneater',
+        }
+    }
 
     def _get_entries(self, api_response):
-        out = []
-        for video in api_response['videos']:
-            out.append(InvidiousIE(self._downloader)._real_extract(self.host_url + '/watch?v=' + video['videoId']))
-        return out
+        return [InvidiousIE(self._downloader)._real_extract(self.host_url + '/watch?v=' + video['videoId'])
+                for video in api_response['videos']]
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)