From 910ef313e1c12f8c7fc8b84140ef8fc9dc98d8ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miha=20Frange=C5=BE?= Date: Wed, 10 Feb 2021 19:43:17 +0100 Subject: [PATCH 01/10] [RTV SLO 4D] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rtvslo.py | 61 ++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 youtube_dl/extractor/rtvslo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 84998316c..08cdd78d3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1006,6 +1006,7 @@ from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE from .rtvnh import RTVNHIE from .rtvs import RTVSIE +from .rtvslo import RTVSLO4DIE from .ruhd import RUHDIE from .rumble import RumbleEmbedIE from .rutube import ( diff --git a/youtube_dl/extractor/rtvslo.py b/youtube_dl/extractor/rtvslo.py new file mode 100644 index 000000000..5b6a3dbbd --- /dev/null +++ b/youtube_dl/extractor/rtvslo.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + unified_timestamp, + try_get, ExtractorError +) + + +class RTVSLO4DIE(InfoExtractor): + _VALID_URL = r'https?://(?:4d|www)\.rtvslo\.si/(?:arhiv/.+|embed|4d/arhiv)/(?P<id>\d+)' + _TEST = { + 'url': 'https://4d.rtvslo.si/arhiv/seje-odbora-za-kmetijstvo-gozdarstvo-in-prehrano/174595438', + 'md5': 'b87e5a65be2365f83eb0d24d44131d0f', + 'info_dict': { + 'id': '174595438', + 'ext': 'mp4', + 'title': 'Krajčič o tatvini sendviča', + 'thumbnail': r're:https://img.rtvslo.si/.+\.jpg', + 'timestamp': 1549999614, + 'upload_date': '20190212', + 'duration': 85 + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + embed_url = 'https://4d.rtvslo.si/embed/' + video_id + embed_html = self._download_webpage(embed_url, video_id) + + client_id = self._search_regex(r'\[\'client\'\] 
= "(.+?)";', embed_html, 'clientId') + + info_url = 'https://api.rtvslo.si/ava/getRecordingDrm/' + video_id + '?client_id=' + client_id + video_info = self._download_json(info_url, video_id)['response'] + + if video_info["mediaType"] != "video": + raise ExtractorError("Downloading audio is not implemented for this source yet") + + jwt = video_info['jwt'] + + media_info_url = 'https://api.rtvslo.si/ava/getMedia/' + video_id + '?client_id=' + client_id + '&jwt=' + jwt + media_info = self._download_json(media_info_url, video_id)['response'] + + # TODO: Support for audio-only links (like radio shows) + # Instead of HLS, an mp3 URL is provided for those in ".mediaFiles[0].streams.https" + + formats = self._extract_m3u8_formats( + media_info['addaptiveMedia']['hls'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + return { + 'id': video_id, + 'title': video_info['title'], + 'description': try_get(video_info, 'description'), + 'thumbnail': video_info.get('thumbnail_sec'), + 'timestamp': unified_timestamp(video_info['broadcastDate']), + 'duration': video_info.get('duration'), + 'formats': formats, + } From 8e6eca64326dc7a13138c210e10a69356790cd54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miha=20Frange=C5=BE?= Date: Thu, 11 Feb 2021 12:49:53 +0100 Subject: [PATCH 02/10] [RTV SLO 4D] Removed unnecessary requests, improved formatting --- youtube_dl/extractor/rtvslo.py | 37 +++++++++++++--------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/rtvslo.py b/youtube_dl/extractor/rtvslo.py index 5b6a3dbbd..12762b529 100644 --- a/youtube_dl/extractor/rtvslo.py +++ b/youtube_dl/extractor/rtvslo.py @@ -4,7 +4,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( unified_timestamp, - try_get, ExtractorError + try_get, + ExtractorError ) @@ -25,37 +26,27 @@ class RTVSLO4DIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + media_id = 
self._match_id(url) - embed_url = 'https://4d.rtvslo.si/embed/' + video_id - embed_html = self._download_webpage(embed_url, video_id) + info_url = 'https://api.rtvslo.si/ava/getRecording/' + media_id + '?client_id=19cc0556a5ee31d0d52a0e30b0696b26' + media_info = self._download_json(info_url, media_id)['response'] - client_id = self._search_regex(r'\[\'client\'\] = "(.+?)";', embed_html, 'clientId') - - info_url = 'https://api.rtvslo.si/ava/getRecordingDrm/' + video_id + '?client_id=' + client_id - video_info = self._download_json(info_url, video_id)['response'] - - if video_info["mediaType"] != "video": - raise ExtractorError("Downloading audio is not implemented for this source yet") - - jwt = video_info['jwt'] - - media_info_url = 'https://api.rtvslo.si/ava/getMedia/' + video_id + '?client_id=' + client_id + '&jwt=' + jwt - media_info = self._download_json(media_info_url, video_id)['response'] + if media_info['mediaType'] != 'video': + raise ExtractorError('Downloading audio is not implemented for this source yet') # TODO: Support for audio-only links (like radio shows) # Instead of HLS, an mp3 URL is provided for those in ".mediaFiles[0].streams.https" formats = self._extract_m3u8_formats( - media_info['addaptiveMedia']['hls'], video_id, 'mp4', + media_info['addaptiveMedia']['hls'], media_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') return { - 'id': video_id, - 'title': video_info['title'], - 'description': try_get(video_info, 'description'), - 'thumbnail': video_info.get('thumbnail_sec'), - 'timestamp': unified_timestamp(video_info['broadcastDate']), - 'duration': video_info.get('duration'), + 'id': media_id, + 'title': media_info['title'], + 'description': try_get(media_info, 'description'), + 'thumbnail': media_info.get('thumbnail_sec'), + 'timestamp': unified_timestamp(media_info['broadcastDate']), + 'duration': media_info.get('duration'), 'formats': formats, } From 2cf78de692395448066f7a1c6adad2d8c8877aa4 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Miha=20Frange=C5=BE?= Date: Thu, 11 Feb 2021 13:51:54 +0100 Subject: [PATCH 03/10] [RTV SLO 4D] Improved URL regex --- youtube_dl/extractor/rtvslo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtvslo.py b/youtube_dl/extractor/rtvslo.py index 12762b529..bb0dadf6f 100644 --- a/youtube_dl/extractor/rtvslo.py +++ b/youtube_dl/extractor/rtvslo.py @@ -10,7 +10,7 @@ from ..utils import ( class RTVSLO4DIE(InfoExtractor): - _VALID_URL = r'https?://(?:4d|www)\.rtvslo\.si/(?:arhiv/.+|embed|4d/arhiv)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:4d\.rtvslo\.si/(?:arhiv/.+|embed)|www\.rtvslo\.si/(?:4d/arhiv|mmr/prispevek))/(?P<id>\d+)' _TEST = { 'url': 'https://4d.rtvslo.si/arhiv/seje-odbora-za-kmetijstvo-gozdarstvo-in-prehrano/174595438', 'md5': 'b87e5a65be2365f83eb0d24d44131d0f', From 5fe9de5f7afe4a173807461285cd40991d163b17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miha=20Frange=C5=BE?= Date: Thu, 11 Feb 2021 13:52:52 +0100 Subject: [PATCH 04/10] [RTV SLO 4D] Added support for audio, more tests --- youtube_dl/extractor/rtvslo.py | 54 ++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/rtvslo.py b/youtube_dl/extractor/rtvslo.py index bb0dadf6f..d27dc963e 100644 --- a/youtube_dl/extractor/rtvslo.py +++ b/youtube_dl/extractor/rtvslo.py @@ -4,14 +4,13 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( unified_timestamp, - try_get, - ExtractorError + try_get ) class RTVSLO4DIE(InfoExtractor): _VALID_URL = r'https?://(?:4d\.rtvslo\.si/(?:arhiv/.+|embed)|www\.rtvslo\.si/(?:4d/arhiv|mmr/prispevek))/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://4d.rtvslo.si/arhiv/seje-odbora-za-kmetijstvo-gozdarstvo-in-prehrano/174595438', 'md5': 'b87e5a65be2365f83eb0d24d44131d0f', 'info_dict': { @@ -23,7 +22,31 @@ class RTVSLO4DIE(InfoExtractor): 'upload_date': '20190212', 'duration': 85 }, - } + }, { + 'url': 
'https://4d.rtvslo.si/arhiv/punto-e-a-capo/174752966', + 'md5': 'a1ce903ee0a4051e417c9357e3d51c71', + 'info_dict': { + 'id': '174752966', + 'ext': 'mp3', + 'title': 'Dante divulgatore della scienza, con Gian Italo Bischi. E un ricordo di Federico Roncoroni', + 'thumbnail': r're:https://img.rtvslo.si/.+\.jpg', + 'timestamp': 1613033635, + 'upload_date': '20210211', + 'duration': 1740 + }, + }, { + 'url': 'https://4d.rtvslo.si/arhiv/punto-e-a-capo/174752966', + 'only_matching': True, + }, { + 'url': 'https://4d.rtvslo.si/embed/174595438', + 'only_matching': True, + }, { + 'url': 'https://www.rtvslo.si/4d/arhiv/174752597?s=tv_ita', + 'only_matching': True, + }, { + 'url': 'https://www.rtvslo.si/mmr/prispevek/174752987', + 'only_matching': True, + }] def _real_extract(self, url): media_id = self._match_id(url) @@ -31,22 +54,21 @@ class RTVSLO4DIE(InfoExtractor): info_url = 'https://api.rtvslo.si/ava/getRecording/' + media_id + '?client_id=19cc0556a5ee31d0d52a0e30b0696b26' media_info = self._download_json(info_url, media_id)['response'] - if media_info['mediaType'] != 'video': - raise ExtractorError('Downloading audio is not implemented for this source yet') - - # TODO: Support for audio-only links (like radio shows) - # Instead of HLS, an mp3 URL is provided for those in ".mediaFiles[0].streams.https" - - formats = self._extract_m3u8_formats( - media_info['addaptiveMedia']['hls'], media_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') - - return { + extracted = { 'id': media_id, 'title': media_info['title'], 'description': try_get(media_info, 'description'), 'thumbnail': media_info.get('thumbnail_sec'), 'timestamp': unified_timestamp(media_info['broadcastDate']), 'duration': media_info.get('duration'), - 'formats': formats, } + + if media_info['mediaType'] == 'video': + extracted['formats'] = self._extract_m3u8_formats( + media_info['addaptiveMedia']['hls'], media_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + elif media_info['mediaType'] == 
'audio': + extracted['url'] = media_info['mediaFiles'][0]['streamers']['http'] + '/' + media_info['mediaFiles'][0]['filename'] + + return extracted From 2e75b8092f6ee303b6568de12d795fcf3a100392 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miha=20Frange=C5=BE?= Date: Sun, 14 Feb 2021 16:48:49 +0100 Subject: [PATCH 05/10] [RTV SLO 4D] Cleanup, switched to HTTPS --- youtube_dl/extractor/rtvslo.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/rtvslo.py b/youtube_dl/extractor/rtvslo.py index d27dc963e..bc1c6d4e2 100644 --- a/youtube_dl/extractor/rtvslo.py +++ b/youtube_dl/extractor/rtvslo.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - unified_timestamp, - try_get + try_get, + unified_timestamp ) class RTVSLO4DIE(InfoExtractor): - _VALID_URL = r'https?://(?:4d\.rtvslo\.si/(?:arhiv/.+|embed)|www\.rtvslo\.si/(?:4d/arhiv|mmr/prispevek))/(?P<id>\d+)' + _VALID_URL = r'https?://(?:4d\.rtvslo\.si/(?:arhiv/[^/]+|embed)|www\.rtvslo\.si/(?:4d/arhiv|mmr/prispevek))/(?P<id>\d+)' _TESTS = [{ 'url': 'https://4d.rtvslo.si/arhiv/seje-odbora-za-kmetijstvo-gozdarstvo-in-prehrano/174595438', 'md5': 'b87e5a65be2365f83eb0d24d44131d0f', @@ -51,22 +51,24 @@ class RTVSLO4DIE(InfoExtractor): def _real_extract(self, url): media_id = self._match_id(url) - info_url = 'https://api.rtvslo.si/ava/getRecording/' + media_id + '?client_id=19cc0556a5ee31d0d52a0e30b0696b26' - media_info = self._download_json(info_url, media_id)['response'] + media_info = self._download_json( + 'https://api.rtvslo.si/ava/getRecording/' + media_id, media_id, + query={'client_id': '19cc0556a5ee31d0d52a0e30b0696b26'})['response'] extracted = { 'id': media_id, 'title': media_info['title'], 'description': try_get(media_info, 'description'), 'thumbnail': media_info.get('thumbnail_sec'), - 'timestamp': unified_timestamp(media_info['broadcastDate']), + 'timestamp': unified_timestamp(media_info.get('broadcastDate')), 
'duration': media_info.get('duration'), } if media_info['mediaType'] == 'video': extracted['formats'] = self._extract_m3u8_formats( - media_info['addaptiveMedia']['hls'], media_id, 'mp4', + media_info['addaptiveMedia']['hls_sec'], media_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(extracted['formats']) elif media_info['mediaType'] == 'audio': extracted['url'] = media_info['mediaFiles'][0]['streamers']['http'] + '/' + media_info['mediaFiles'][0]['filename'] From 6435b66967ab4728ae957226e1acbf85bcff5b5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miha=20Frange=C5=BE?= Date: Mon, 15 Feb 2021 15:31:03 +0100 Subject: [PATCH 06/10] [RTV SLO 4D] Support for multiple audio formats --- youtube_dl/extractor/rtvslo.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtvslo.py b/youtube_dl/extractor/rtvslo.py index bc1c6d4e2..fd9bd02fa 100644 --- a/youtube_dl/extractor/rtvslo.py +++ b/youtube_dl/extractor/rtvslo.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + determine_ext, + int_or_none, try_get, unified_timestamp ) @@ -71,6 +73,15 @@ class RTVSLO4DIE(InfoExtractor): self._sort_formats(extracted['formats']) elif media_info['mediaType'] == 'audio': - extracted['url'] = media_info['mediaFiles'][0]['streamers']['http'] + '/' + media_info['mediaFiles'][0]['filename'] + extracted['formats'] = [{ + 'format_id': file['mediaType'], + 'url': file['streamers']['http'] + '/' + file['filename'], + 'ext': determine_ext(file['filename']), + 'tbr': int_or_none(file.get('bitrate')), + 'filesize': int_or_none(file.get('filesize')), + 'vcodec': 'none' + } for file in media_info['mediaFiles']] + + self._sort_formats(extracted['formats']) return extracted From 8fb3a99c34af2f983a56af54089de7b3567442ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miha=20Frange=C5=BE?= Date: Mon, 15 Feb 2021 15:33:21 +0100 Subject: [PATCH 07/10] [RTV SLO 4D] Fixed test 
that failed due to different sorting --- youtube_dl/extractor/rtvslo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtvslo.py b/youtube_dl/extractor/rtvslo.py index fd9bd02fa..96abd4438 100644 --- a/youtube_dl/extractor/rtvslo.py +++ b/youtube_dl/extractor/rtvslo.py @@ -14,7 +14,7 @@ class RTVSLO4DIE(InfoExtractor): _VALID_URL = r'https?://(?:4d\.rtvslo\.si/(?:arhiv/[^/]+|embed)|www\.rtvslo\.si/(?:4d/arhiv|mmr/prispevek))/(?P<id>\d+)' _TESTS = [{ 'url': 'https://4d.rtvslo.si/arhiv/seje-odbora-za-kmetijstvo-gozdarstvo-in-prehrano/174595438', - 'md5': 'b87e5a65be2365f83eb0d24d44131d0f', + 'md5': '37ab1181292a08e0d6b7952545e6ce8b', 'info_dict': { 'id': '174595438', 'ext': 'mp4', From 944f674c6d3039325658c4db7af7542afbe9e1a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miha=20Frange=C5=BE?= Date: Mon, 15 Feb 2021 16:38:15 +0100 Subject: [PATCH 08/10] [RTV SLO 4D] Reordered info extraction --- youtube_dl/extractor/rtvslo.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/rtvslo.py b/youtube_dl/extractor/rtvslo.py index 96abd4438..c7f33d138 100644 --- a/youtube_dl/extractor/rtvslo.py +++ b/youtube_dl/extractor/rtvslo.py @@ -57,23 +57,13 @@ class RTVSLO4DIE(InfoExtractor): 'https://api.rtvslo.si/ava/getRecording/' + media_id, media_id, query={'client_id': '19cc0556a5ee31d0d52a0e30b0696b26'})['response'] - extracted = { - 'id': media_id, - 'title': media_info['title'], - 'description': try_get(media_info, 'description'), - 'thumbnail': media_info.get('thumbnail_sec'), - 'timestamp': unified_timestamp(media_info.get('broadcastDate')), - 'duration': media_info.get('duration'), - } - if media_info['mediaType'] == 'video': - extracted['formats'] = self._extract_m3u8_formats( + formats = self._extract_m3u8_formats( media_info['addaptiveMedia']['hls_sec'], media_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(extracted['formats']) elif 
media_info['mediaType'] == 'audio': - extracted['formats'] = [{ + formats = [{ 'format_id': file['mediaType'], 'url': file['streamers']['http'] + '/' + file['filename'], 'ext': determine_ext(file['filename']), @@ -82,6 +72,14 @@ class RTVSLO4DIE(InfoExtractor): 'vcodec': 'none' } for file in media_info['mediaFiles']] - self._sort_formats(extracted['formats']) + self._sort_formats(formats) - return extracted + return { + 'id': media_id, + 'title': media_info['title'], + 'formats': formats, + 'description': try_get(media_info, 'description'), + 'thumbnail': media_info.get('thumbnail_sec'), + 'timestamp': unified_timestamp(media_info.get('broadcastDate')), + 'duration': media_info.get('duration'), + } From a302001725fceae0c6a7e754e3acfd442e17fc39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miha=20Frange=C5=BE?= Date: Thu, 25 Mar 2021 11:48:40 +0100 Subject: [PATCH 09/10] [RTV SLO 4D] Extract both HTTPS and HTTP HLS URLs --- youtube_dl/extractor/rtvslo.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rtvslo.py b/youtube_dl/extractor/rtvslo.py index c7f33d138..ff8e59e71 100644 --- a/youtube_dl/extractor/rtvslo.py +++ b/youtube_dl/extractor/rtvslo.py @@ -58,9 +58,11 @@ class RTVSLO4DIE(InfoExtractor): query={'client_id': '19cc0556a5ee31d0d52a0e30b0696b26'})['response'] if media_info['mediaType'] == 'video': - formats = self._extract_m3u8_formats( - media_info['addaptiveMedia']['hls_sec'], media_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') + formats = [] + for proto in ('hls_sec', 'hls',): + formats += self._extract_m3u8_formats( + media_info['addaptiveMedia'][proto], media_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') elif media_info['mediaType'] == 'audio': formats = [{ From d81793ea564e44219aa06f806caa8d46a242d2f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miha=20Frange=C5=BE?= Date: Thu, 25 Mar 2021 11:49:56 +0100 Subject: [PATCH 10/10] [RTV SLO 4D] Added support for subtitles --- 
youtube_dl/extractor/rtvslo.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/rtvslo.py b/youtube_dl/extractor/rtvslo.py index ff8e59e71..1264a8752 100644 --- a/youtube_dl/extractor/rtvslo.py +++ b/youtube_dl/extractor/rtvslo.py @@ -84,4 +84,15 @@ class RTVSLO4DIE(InfoExtractor): 'thumbnail': media_info.get('thumbnail_sec'), 'timestamp': unified_timestamp(media_info.get('broadcastDate')), 'duration': media_info.get('duration'), + 'subtitles': self.extract_subtitles(media_info) } + + def _get_subtitles(self, media_info): + subs = {} + for sub in media_info.get('subtitles', []): + subs[sub['language']] = [{ + 'ext': 'vtt', + 'url': sub['file'] + }] + + return subs