From b22afc8e94fc4a3f02144038f9053a3aca65d746 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 21 Mar 2020 19:35:56 -0400 Subject: [PATCH 01/18] almost works... --- youtube_dl/extractor/digitalconcerthall.py | 59 ++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 60 insertions(+) create mode 100644 youtube_dl/extractor/digitalconcerthall.py diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py new file mode 100644 index 000000000..586f6046f --- /dev/null +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class DigitalConcertHallIE(InfoExtractor): + IE_DESC = 'DigitalConcertHall extractor' + _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P[a-z]+)/concert/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://www.digitalconcerthall.com/en/concert/51841', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': '51841', + 'language': 'en', + 'ext': 'mp4', + 'title': 'Video title goes here', + 'thumbnail': r're:^https?://.*/images/core/Phil.*\.jpg$', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + },] + + def _real_extract(self, url): + #video_id = self._match_id(url) + language, video_id = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' + print("url: ", url, " video_id: ", video_id, " language: ", language, "\n") + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'(.+?)', webpage, 'title') + print("title: ", title, "\n") + + # this returns JSON, which contains the urls of the playlist + #video_data = self._download_webpage( + # 'https://www.digitalconcerthall.com/json_services/get_stream_urls?id=' + video_id + "&language=" + language, video_id) + playlist_dict = self._download_json( + 'https://www.digitalconcerthall.com/json_services/get_stream_urls?id=' + video_id + "&language=" + language, video_id)['urls'] + + entries = [] + for key in playlist_dict: + print("key: ", key, "\n") + print("key url: ", playlist_dict[key][0]['url'], "\n") + entries.append(playlist_dict[key][0]['url']) + +# for i in entries: +# print(i) + + return { + '_type': 'playlist', + 'id': video_id, + 'title': title, + 'entries': entries, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 64d1fa251..5496e5667 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1243,6 +1243,7 @@ from .ufctv import ( ) from .uktvplay import UKTVPlayIE from .digiteka import DigitekaIE +from .digitalconcerthall import DigitalConcertHallIE from .dlive import ( DLiveVODIE, DLiveStreamIE, From 61a4a0fda86eb5a84fea641d1c3196ce9f8c87f5 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 21 Mar 2020 19:52:33 -0400 Subject: [PATCH 02/18] Download now works, but video info is missing --- youtube_dl/extractor/digitalconcerthall.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index 586f6046f..ec8fd95d6 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -46,7 +46,11 @@ class DigitalConcertHallIE(InfoExtractor): for key in playlist_dict: print("key: ", key, "\n") print("key url: ", playlist_dict[key][0]['url'], "\n") - entries.append(playlist_dict[key][0]['url']) + entries.append({ + 'id': video_id, + 'title': title + "-" + key, + 'url': playlist_dict[key][0]['url'], + }) # for i in entries: # print(i) From 831695a576c6a5cb37a235c382b796cdf35c44de Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 21 Mar 2020 20:31:19 -0400 Subject: [PATCH 03/18] add m3u8 formats extractor so that user can use -f best --- youtube_dl/extractor/digitalconcerthall.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index ec8fd95d6..b010be267 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -45,11 +45,15 @@ class DigitalConcertHallIE(InfoExtractor): entries = [] for key in playlist_dict: print("key: ", key, "\n") - print("key url: ", playlist_dict[key][0]['url'], "\n") + m3u8_url = playlist_dict[key][0]['url'] + print("key url: ", m3u8_url, "\n") + formats = self._extract_m3u8_formats(m3u8_url, key, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + print("formats:\n", formats) entries.append({ - 'id': video_id, - 'title': title + "-" + key, - 'url': playlist_dict[key][0]['url'], + 'id': key, + 'title': title, + 'url': m3u8_url, + 'formats': formats, }) # for i in entries: From 2a11f19466515f42fdf035c62bb648179266076b Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 21 Mar 2020 21:39:44 -0400 Subject: [PATCH 04/18] get titles of playlist items and use debug instead of print --- youtube_dl/extractor/digitalconcerthall.py | 33 ++++++++++++++-------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index b010be267..d64ed3e24 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -4,6 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_id, + get_element_by_attribute, + get_element_by_class, +) class DigitalConcertHallIE(InfoExtractor): @@ -18,23 +25,23 @@ class DigitalConcertHallIE(InfoExtractor): 'ext': 'mp4', 'title': 'Video title goes here', 'thumbnail': r're:^https?://.*/images/core/Phil.*\.jpg$', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) } },] + def debug_out(self, args): + if not self._downloader.params.get('verbose', False): + return + + self.to_screen('[debug] %s' % args) + def _real_extract(self, url): - #video_id = self._match_id(url) language, video_id = re.match(self._VALID_URL, url).groups() if not language: language = 'en' - print("url: ", url, " video_id: ", video_id, " language: ", language, "\n") + self.debug_out("url: " + url + " video_id: " + video_id + " language: " + language) webpage = self._download_webpage(url, video_id) title = self._html_search_regex(r'(.+?)', webpage, 'title') - print("title: ", title, "\n") + self.to_screen("title: " + title) # this returns JSON, which contains the urls of the playlist #video_data = self._download_webpage( @@ -44,11 +51,15 @@ class DigitalConcertHallIE(InfoExtractor): entries = [] for key in playlist_dict: - print("key: ", key, "\n") + self.debug_out("key: " + key) m3u8_url = playlist_dict[key][0]['url'] - print("key url: ", m3u8_url, "\n") + self.debug_out("key url: " + m3u8_url) formats = self._extract_m3u8_formats(m3u8_url, key, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - print("formats:\n", formats) + self.debug_out(formats) + vid_info_div = clean_html(get_element_by_id(key, webpage)) + self.debug_out("vid_info_div:\n" + vid_info_div) + title = re.sub('\s+', ' ', vid_info_div) + self.to_screen("title: " + title ) entries.append({ 'id': key, 'title': title, From 5ef679391f246356e32f8016cb04866e383edbe5 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 21 Mar 2020 21:46:55 -0400 Subject: [PATCH 05/18] cleanup unused code --- youtube_dl/extractor/digitalconcerthall.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index d64ed3e24..4ff03804b 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -16,7 +16,7 @@ from ..utils import ( class DigitalConcertHallIE(InfoExtractor): IE_DESC = 'DigitalConcertHall extractor' _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P[a-z]+)/concert/(?P[0-9]+)' - _TESTS = [{ + _TEST = { 'url': 'https://www.digitalconcerthall.com/en/concert/51841', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { @@ -26,7 +26,7 @@ class DigitalConcertHallIE(InfoExtractor): 'title': 'Video title goes here', 'thumbnail': r're:^https?://.*/images/core/Phil.*\.jpg$', } - },] + } def debug_out(self, args): if not self._downloader.params.get('verbose', False): @@ -43,9 +43,7 @@ class DigitalConcertHallIE(InfoExtractor): title = self._html_search_regex(r'(.+?)', webpage, 'title') self.to_screen("title: " + title) - # this returns JSON, which contains the urls of the playlist - #video_data = self._download_webpage( - # 'https://www.digitalconcerthall.com/json_services/get_stream_urls?id=' + video_id + "&language=" + language, video_id) + # this returns JSON containing the urls of the playlist playlist_dict = self._download_json( 'https://www.digitalconcerthall.com/json_services/get_stream_urls?id=' + video_id + "&language=" + language, video_id)['urls'] @@ -56,6 +54,7 @@ class DigitalConcertHallIE(InfoExtractor): self.debug_out("key url: " + m3u8_url) formats = self._extract_m3u8_formats(m3u8_url, key, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) self.debug_out(formats) + # the div with id=key contains the video title vid_info_div = clean_html(get_element_by_id(key, webpage)) self.debug_out("vid_info_div:\n" + vid_info_div) title = re.sub('\s+', ' ', vid_info_div) @@ -67,9 +66,6 @@ class DigitalConcertHallIE(InfoExtractor): 'formats': formats, }) -# for i in entries: -# print(i) - return { '_type': 'playlist', 'id': video_id, From bedaa1c962fae9d4bff7672720aabe1f8e354da6 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 21 Mar 2020 21:55:21 -0400 Subject: [PATCH 06/18] flake8 fixes --- youtube_dl/extractor/digitalconcerthall.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index 4ff03804b..ab6a9ad9e 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -6,10 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, - extract_attributes, get_element_by_id, - get_element_by_attribute, - get_element_by_class, ) @@ -57,8 +54,8 @@ class DigitalConcertHallIE(InfoExtractor): # the div with id=key contains the video title vid_info_div = clean_html(get_element_by_id(key, webpage)) self.debug_out("vid_info_div:\n" + vid_info_div) - title = re.sub('\s+', ' ', vid_info_div) - self.to_screen("title: " + title ) + title = re.sub(r'\s+', ' ', vid_info_div) + self.to_screen("title: " + title) entries.append({ 'id': key, 'title': title, From b98ae0e5e9d4b03458fd7b6b85db95f2cceda373 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 21 Mar 2020 22:08:01 -0400 Subject: [PATCH 07/18] youtube-dl coding convention fixes --- youtube_dl/extractor/digitalconcerthall.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index ab6a9ad9e..b8f304c0b 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -42,12 +42,12 @@ class DigitalConcertHallIE(InfoExtractor): # this returns JSON containing the urls of the playlist playlist_dict = self._download_json( - 'https://www.digitalconcerthall.com/json_services/get_stream_urls?id=' + video_id + "&language=" + language, video_id)['urls'] + 'https://www.digitalconcerthall.com/json_services/get_stream_urls?id=' + video_id + "&language=" + language, video_id).get('urls') entries = [] for key in playlist_dict: self.debug_out("key: " + key) - m3u8_url = playlist_dict[key][0]['url'] + m3u8_url = playlist_dict.get(key)[0].get('url') self.debug_out("key url: " + m3u8_url) formats = self._extract_m3u8_formats(m3u8_url, key, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) self.debug_out(formats) From b7c149a5c85a1e93d8bad10bc8d845d63f272858 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 21 Mar 2020 22:20:21 -0400 Subject: [PATCH 08/18] more youtube-dl coding convention fixes --- youtube_dl/extractor/digitalconcerthall.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index b8f304c0b..d8e1a97a3 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -42,19 +42,22 @@ class DigitalConcertHallIE(InfoExtractor): # this returns JSON containing the urls of the playlist playlist_dict = self._download_json( - 'https://www.digitalconcerthall.com/json_services/get_stream_urls?id=' + video_id + "&language=" + language, video_id).get('urls') + 'https://www.digitalconcerthall.com/json_services/get_stream_urls?id=' + + video_id + "&language=" + language, video_id).get('urls') entries = [] for key in playlist_dict: self.debug_out("key: " + key) m3u8_url = playlist_dict.get(key)[0].get('url') self.debug_out("key url: " + m3u8_url) - formats = self._extract_m3u8_formats(m3u8_url, key, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + formats = self._extract_m3u8_formats(m3u8_url, key, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) self.debug_out(formats) # the div with id=key contains the video title vid_info_div = clean_html(get_element_by_id(key, webpage)) self.debug_out("vid_info_div:\n" + vid_info_div) - title = re.sub(r'\s+', ' ', vid_info_div) + title = re.sub(r'\s+', ' ', vid_info_div) \ + or self._og_search_title(webpage) self.to_screen("title: " + title) entries.append({ 'id': key, From 15f29e48c9218dda4b765332cb8f7ee44f5275b9 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 21 Mar 2020 22:28:33 -0400 Subject: [PATCH 09/18] re-run flake after convention changes --- youtube_dl/extractor/digitalconcerthall.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index d8e1a97a3..aa5414f9a 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -42,16 +42,16 @@ class DigitalConcertHallIE(InfoExtractor): # this returns JSON containing the urls of the playlist playlist_dict = self._download_json( - 'https://www.digitalconcerthall.com/json_services/get_stream_urls?id=' - + video_id + "&language=" + language, video_id).get('urls') + 'https://www.digitalconcerthall.com/json_services/get_stream_urls?id=' + + video_id + "&language=" + language, video_id).get('urls') entries = [] for key in playlist_dict: self.debug_out("key: " + key) m3u8_url = playlist_dict.get(key)[0].get('url') self.debug_out("key url: " + m3u8_url) - formats = self._extract_m3u8_formats(m3u8_url, key, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) + formats = self._extract_m3u8_formats(m3u8_url, key, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) self.debug_out(formats) # the div with id=key contains the video title vid_info_div = clean_html(get_element_by_id(key, webpage)) From 5c11300fbf4252d9cefb9caaebec7bef679e4534 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 21 Mar 2020 23:25:46 -0400 Subject: [PATCH 10/18] make flake happy --- youtube_dl/extractor/digitalconcerthall.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index aa5414f9a..f38d12d22 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -50,8 +50,8 @@ class DigitalConcertHallIE(InfoExtractor): self.debug_out("key: " + key) m3u8_url = playlist_dict.get(key)[0].get('url') self.debug_out("key url: " + m3u8_url) - formats = self._extract_m3u8_formats(m3u8_url, key, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False) + formats = self._extract_m3u8_formats( + m3u8_url, key, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) self.debug_out(formats) # the div with id=key contains the video title vid_info_div = clean_html(get_element_by_id(key, webpage)) From b09bb338018860f22cbdbb6271a40333238fa9ec Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sun, 22 Mar 2020 08:08:36 -0400 Subject: [PATCH 11/18] change to_screen output to debug_out --- youtube_dl/extractor/digitalconcerthall.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index f38d12d22..e63983e6d 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -38,7 +38,7 @@ class DigitalConcertHallIE(InfoExtractor): self.debug_out("url: " + url + " video_id: " + video_id + " language: " + language) webpage = self._download_webpage(url, video_id) title = self._html_search_regex(r'(.+?)', webpage, 'title') - self.to_screen("title: " + title) + self.debug_out("title: " + title) # this returns JSON containing the urls of the playlist playlist_dict = self._download_json( @@ -58,7 +58,7 @@ class DigitalConcertHallIE(InfoExtractor): self.debug_out("vid_info_div:\n" + vid_info_div) title = re.sub(r'\s+', ' ', vid_info_div) \ or self._og_search_title(webpage) - self.to_screen("title: " + title) + self.debug_out("title: " + title) entries.append({ 'id': key, 'title': title, From cddf55300dc38b203f72ac5d3600cad55d6c7188 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sun, 22 Mar 2020 08:40:14 -0400 Subject: [PATCH 12/18] separate playlist title from entry titles --- youtube_dl/extractor/digitalconcerthall.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index e63983e6d..4c089f8d8 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -37,8 +37,8 @@ class DigitalConcertHallIE(InfoExtractor): language = 'en' self.debug_out("url: " + url + " video_id: " + video_id + " language: " + language) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.+?)', webpage, 'title') - self.debug_out("title: " + title) + playlist_title = self._html_search_regex(r'(.+?)', webpage, 'title') + self.debug_out("playlist_title: " + playlist_title) # this returns JSON containing the urls of the playlist playlist_dict = self._download_json( @@ -69,6 +69,6 @@ class DigitalConcertHallIE(InfoExtractor): return { '_type': 'playlist', 'id': video_id, - 'title': title, + 'title': playlist_title, 'entries': entries, } From 32fdbd27fadc8a5155b8cb08911a34fbc10c94bc Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 4 Apr 2020 19:51:09 -0400 Subject: [PATCH 13/18] use _og_search_title for fallback playlist title ; use digitalconcerthall API for individual playlist item titles --- youtube_dl/extractor/digitalconcerthall.py | 25 ++++++++++++---------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index 4c089f8d8..646fe15e5 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -4,10 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - clean_html, - get_element_by_id, -) class DigitalConcertHallIE(InfoExtractor): @@ -32,18 +28,26 @@ class DigitalConcertHallIE(InfoExtractor): self.to_screen('[debug] %s' % args) def _real_extract(self, url): + MAX_TITLE_LENGTH = 128 language, video_id = re.match(self._VALID_URL, url).groups() if not language: language = 'en' self.debug_out("url: " + url + " video_id: " + video_id + " language: " + language) webpage = self._download_webpage(url, video_id) - playlist_title = self._html_search_regex(r'(.+?)', webpage, 'title') + playlist_title = self._html_search_regex(r'(.+?)', webpage, 'title') \ + or self._og_search_title(webpage) self.debug_out("playlist_title: " + playlist_title) # this returns JSON containing the urls of the playlist + # Note: you must be authenticated to get the stream info playlist_dict = self._download_json( 'https://www.digitalconcerthall.com/json_services/get_stream_urls?id=' - + video_id + "&language=" + language, video_id).get('urls') + + video_id + "&language=" + language, video_id, note='Downloading Stream JSON').get('urls') + # use the API to get other information about the concert + vid_info_dict = self._download_json( + 'https://api.digitalconcerthall.com/v2/concert/' + + video_id, video_id, headers={'Accept': 'application/json', + 'Accept-Language': language}).get('_embedded') entries = [] for key in playlist_dict: @@ -53,11 +57,10 @@ class DigitalConcertHallIE(InfoExtractor): formats = self._extract_m3u8_formats( m3u8_url, key, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) self.debug_out(formats) - # the div with id=key contains the video title - vid_info_div = clean_html(get_element_by_id(key, webpage)) - self.debug_out("vid_info_div:\n" + vid_info_div) - title = re.sub(r'\s+', ' ', vid_info_div) \ - or self._og_search_title(webpage) + title = [vid_info_dict.get(x)[0].get('title',"unknown title") for x in vid_info_dict + if vid_info_dict.get(x)[0].get('id') == key][0] + # avoid filenames that exceed filesystem limits + title = (title[:MAX_TITLE_LENGTH] + '..') if len(title) > MAX_TITLE_LENGTH else title self.debug_out("title: " + title) entries.append({ 'id': key, From 6073b451b215424f6fe8480a3066d73492a986e3 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sat, 4 Apr 2020 19:55:10 -0400 Subject: [PATCH 14/18] flake8 fixes --- youtube_dl/extractor/digitalconcerthall.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index 646fe15e5..272a12c79 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -34,8 +34,8 @@ class DigitalConcertHallIE(InfoExtractor): language = 'en' self.debug_out("url: " + url + " video_id: " + video_id + " language: " + language) webpage = self._download_webpage(url, video_id) - playlist_title = self._html_search_regex(r'(.+?)', webpage, 'title') \ - or self._og_search_title(webpage) + playlist_title = self._html_search_regex(r'(.+?)', webpage, 'title') or \ + self._og_search_title(webpage) self.debug_out("playlist_title: " + playlist_title) # this returns JSON containing the urls of the playlist @@ -57,7 +57,7 @@ class DigitalConcertHallIE(InfoExtractor): formats = self._extract_m3u8_formats( m3u8_url, key, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) self.debug_out(formats) - title = [vid_info_dict.get(x)[0].get('title',"unknown title") for x in vid_info_dict + title = [vid_info_dict.get(x)[0].get('title', "unknown title") for x in vid_info_dict if vid_info_dict.get(x)[0].get('id') == key][0] # avoid filenames that exceed filesystem limits title = (title[:MAX_TITLE_LENGTH] + '..') if len(title) > MAX_TITLE_LENGTH else title From 218365d9f5571fc56ae4a6e31c9676279b671ff6 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Sun, 5 Apr 2020 08:49:01 -0400 Subject: [PATCH 15/18] add video description, timestamp, and chapter information. add composer (or 'interview') and video duration to title --- youtube_dl/extractor/digitalconcerthall.py | 39 ++++++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index 272a12c79..1a32a8fa6 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -47,7 +47,8 @@ class DigitalConcertHallIE(InfoExtractor): vid_info_dict = self._download_json( 'https://api.digitalconcerthall.com/v2/concert/' + video_id, video_id, headers={'Accept': 'application/json', - 'Accept-Language': language}).get('_embedded') + 'Accept-Language': language}) + embedded = vid_info_dict.get('_embedded') entries = [] for key in playlist_dict: @@ -57,17 +58,49 @@ class DigitalConcertHallIE(InfoExtractor): formats = self._extract_m3u8_formats( m3u8_url, key, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) self.debug_out(formats) - title = [vid_info_dict.get(x)[0].get('title', "unknown title") for x in vid_info_dict - if vid_info_dict.get(x)[0].get('id') == key][0] + flat_list = [] + for embed_type in embedded: + for item in embedded.get(embed_type): + if embed_type == 'interview': + item['is_interview'] = 1 + else: + item['is_interview'] = 0 + flat_list.append(item) + vid_info = [x for x in flat_list if x.get('id') == key][0] + if vid_info.get('is_interview') == 1: + title = "Interview - " + vid_info.get('title', "unknown interview title") + else: + title = (vid_info.get('name_composer') if vid_info.get('name_composer') + else 'unknown composer') + ' - ' + vid_info.get('title', "unknown title") + duration = vid_info.get('duration_total') # avoid filenames that exceed filesystem limits title = (title[:MAX_TITLE_LENGTH] + '..') if len(title) > MAX_TITLE_LENGTH else title + # append the duration in minutes to the title + title = title + " (" + str(round(duration / 60)) + " min.)" self.debug_out("title: " + title) + timestamp = vid_info.get('date').get('published') entries.append({ 'id': key, 'title': title, 'url': m3u8_url, 'formats': formats, + 'duration': duration, + 'timestamp': timestamp, }) + if vid_info_dict.get('short_description'): + entries[-1]['description'] = vid_info_dict.get('short_description') + if vid_info.get('cuepoints'): + chapters = [] + for chapter in vid_info.get('cuepoints'): + start_time = chapter.get('time') + end_time = start_time + chapter.get('duration') + chapter_title = chapter.get('text') + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': chapter_title + }) + entries[-1]['chapters'] = chapters return { '_type': 'playlist', From 1487c0d8684077e0bdd23e50a2fb8be9aee3b414 Mon Sep 17 00:00:00 2001 From: Robert Jacobson Date: Fri, 10 Apr 2020 19:37:43 -0400 Subject: [PATCH 16/18] Only need to generate flat_list once per playlist --- youtube_dl/extractor/digitalconcerthall.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index 1a32a8fa6..b16394942 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -50,6 +50,15 @@ class DigitalConcertHallIE(InfoExtractor): 'Accept-Language': language}) embedded = vid_info_dict.get('_embedded') + flat_list = [] + for embed_type in embedded: + for item in embedded.get(embed_type): + if embed_type == 'interview': + item['is_interview'] = 1 + else: + item['is_interview'] = 0 + flat_list.append(item) + entries = [] for key in playlist_dict: self.debug_out("key: " + key) @@ -58,14 +67,6 @@ class DigitalConcertHallIE(InfoExtractor): formats = self._extract_m3u8_formats( m3u8_url, key, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) self.debug_out(formats) - flat_list = [] - for embed_type in embedded: - for item in embedded.get(embed_type): - if embed_type == 'interview': - item['is_interview'] = 1 - else: - item['is_interview'] = 0 - flat_list.append(item) vid_info = [x for x in flat_list if x.get('id') == key][0] if vid_info.get('is_interview') == 1: title = "Interview - " + vid_info.get('title', "unknown interview title") From 09014a88ca55fc26cdd342c7f61225b346c1a0c4 Mon Sep 17 00:00:00 2001 From: teridon Date: Mon, 13 Apr 2020 19:21:09 -0400 Subject: [PATCH 17/18] By default, use playlist description for video. If a short_description exists for a video (likely an interview), use that instead. --- youtube_dl/extractor/digitalconcerthall.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index b16394942..99911ffec 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -88,8 +88,12 @@ class DigitalConcertHallIE(InfoExtractor): 'duration': duration, 'timestamp': timestamp, }) + # use playlist description for video description by default + # but if the video has a description, use it if vid_info_dict.get('short_description'): - entries[-1]['description'] = vid_info_dict.get('short_description') + entries[-1]['description'] = vid_info_dict.get('short_description', "missing description") + if vid_info.get('short_description'): + entries[-1]['description'] = vid_info.get('short_description', "missing description") if vid_info.get('cuepoints'): chapters = [] for chapter in vid_info.get('cuepoints'): From 6bfc0d96d7918b2742a75288eb1c1b80afecc55e Mon Sep 17 00:00:00 2001 From: teridon Date: Fri, 7 Aug 2020 20:32:21 -0400 Subject: [PATCH 18/18] if first chapter starts at non-zero time, youtube-dl (or maybe ffmpeg?) ignores it and you end up with the first chapter starting at the wrong time. So, when first chapter does not start at time 0, insert an "Intro" chapter, so that the music starts at a chapter marker. --- youtube_dl/extractor/digitalconcerthall.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/extractor/digitalconcerthall.py b/youtube_dl/extractor/digitalconcerthall.py index 99911ffec..76ef327a1 100644 --- a/youtube_dl/extractor/digitalconcerthall.py +++ b/youtube_dl/extractor/digitalconcerthall.py @@ -96,8 +96,18 @@ class DigitalConcertHallIE(InfoExtractor): entries[-1]['description'] = vid_info.get('short_description', "missing description") if vid_info.get('cuepoints'): chapters = [] + first_chapter = 1 for chapter in vid_info.get('cuepoints'): start_time = chapter.get('time') + # Often, the first chapter does not start at zero. In this case, + # insert an intro chapter so that first chapter is the start of the music + if (first_chapter == 1) and (start_time != 0): + chapters.append({ + 'start_time': 0, + 'end_time': start_time, + 'title': '0. Intro' + }) + first_chapter = 0 end_time = start_time + chapter.get('duration') chapter_title = chapter.get('text') chapters.append({