diff --git a/test/test_subtitles.py b/test/test_subtitles.py index bcc69a778..fbc9eaf4d 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -27,15 +27,23 @@ class BaseTestSubtitles(unittest.TestCase): def setUp(self): self.DL = FakeYDL() - self.ie = self.IE(self.DL) + self.ie = self.IE() + self.DL.add_info_extractor(self.ie) def getInfoDict(self): - info_dict = self.ie.extract(self.url) + info_dict = self.DL.extract_info(self.url, download=False) return info_dict def getSubtitles(self): info_dict = self.getInfoDict() - return info_dict['subtitles'] + subtitles = info_dict['subtitles'] + if not subtitles: + return subtitles + for sub_info in subtitles.values(): + if sub_info.get('data') is None: + uf = self.DL.urlopen(sub_info['url']) + sub_info['data'] = uf.read().decode('utf-8') + return dict((l, sub_info['data']) for l, sub_info in subtitles.items()) class TestYoutubeSubtitles(BaseTestSubtitles): @@ -176,7 +184,7 @@ class TestTedSubtitles(BaseTestSubtitles): def test_no_writesubtitles(self): subtitles = self.getSubtitles() - self.assertEqual(subtitles, None) + self.assertFalse(subtitles) def test_subtitles(self): self.DL.params['writesubtitles'] = True @@ -196,18 +204,10 @@ class TestTedSubtitles(BaseTestSubtitles): self.assertTrue(len(subtitles.keys()) >= 28) def test_list_subtitles(self): - self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.params['listsubtitles'] = True info_dict = self.getInfoDict() self.assertEqual(info_dict, None) - def test_automatic_captions(self): - self.DL.expect_warning('Automatic Captions not supported by this server') - self.DL.params['writeautomaticsub'] = True - self.DL.params['subtitleslang'] = ['en'] - subtitles = self.getSubtitles() - self.assertTrue(len(subtitles.keys()) == 0) - def test_multiple_langs(self): self.DL.params['writesubtitles'] = True langs = ['es', 'fr', 'de'] diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 13d18e25e..e665e3d53 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -154,7 +154,7 @@ class YoutubeDL(object): allsubtitles: Downloads all the subtitles of the video (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video - subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) + subtitlesformat: The format code for subtitles subtitleslangs: List of languages of the subtitles to download keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. @@ -1019,6 +1019,11 @@ class YoutubeDL(object): info_dict['timestamp']) info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + if self.params.get('listsubtitles', False): + self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) + return + info_dict['subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) + # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: if download: @@ -1147,6 +1152,53 @@ class YoutubeDL(object): info_dict.update(formats_to_download[-1]) return info_dict + def process_subtitles(self, video_id, available_subs): + """Select the requested subtitles and their format""" + if not available_subs: + return available_subs + + if self.params.get('allsubtitles', False): + requested_langs = available_subs.keys() + else: + if self.params.get('subtitleslangs', False): + requested_langs = self.params.get('subtitleslangs') + elif 'en' in available_subs: + requested_langs = ['en'] + else: + requested_langs = [list(available_subs.keys())[0]] + + formats_query = self.params.get('subtitlesformat', 'best') + formats_preference = formats_query.split('/') if formats_query else [] + subs = {} + for lang in requested_langs: + formats = available_subs.get(lang) + if formats is None: + self.report_warning('%s subtitles not available for %s' % (lang, video_id)) + continue + if isinstance(formats, compat_str): + # TODO: convert all IE with subtitles support to the new format + # and remove this + subs[lang] = { + 'ext': formats_preference[0], + 'data': formats, + } + continue + for ext in formats_preference: + if ext == 'best': + f = formats[-1] + break + matches = list(filter(lambda f: f['ext'] == ext, formats)) + if matches: + f = matches[-1] + break + else: + f = formats[-1] + self.report_warning( + 'No subtitle format found matching "%s" for language %s, ' + 'using %s' % (formats_query, lang, f['ext'])) + subs[lang] = f + return subs + def process_info(self, info_dict): """Process a single resolved IE result.""" @@ -1253,11 +1305,18 @@ class YoutubeDL(object): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['subtitles'] - sub_format = self.params.get('subtitlesformat', 'srt') - for sub_lang in subtitles.keys(): - sub = subtitles[sub_lang] - if sub is None: - continue + for sub_lang, sub_info in subtitles.items(): + sub_format = sub_info['ext'] + if sub_info.get('data') is not None: + sub_data = sub_info['data'] + else: + try: + uf = self.urlopen(sub_info['url']) + sub_data = uf.read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, compat_str(err))) + continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): @@ -1265,7 +1324,7 @@ class YoutubeDL(object): else: self.to_screen('[info] Writing video subtitles to: ' + sub_filename) with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) + subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) return @@ -1586,6 +1645,18 @@ class YoutubeDL(object): ['ID', 'width', 'height', 'URL'], [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) + def list_subtitles(self, video_id, subtitles): + if not subtitles: + self.to_screen('%s has no subtitles' % video_id) + return + header_line = 'Language formats' + sub_lines = [ + '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) + for lang, formats in subtitles.items()] + self.to_screen( + 'Available subtitles for %s:\n%s\n%s' % + (video_id, header_line, '\n'.join(sub_lines))) + def urlopen(self, req): """ Start an HTTP download """ diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ed22f169f..5f2585003 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -226,7 +226,6 @@ def _real_main(argv=None): if opts.embedsubtitles: postprocessors.append({ 'key': 'FFmpegEmbedSubtitle', - 'subtitlesformat': opts.subtitlesformat, }) if opts.xattrs: postprocessors.append({'key': 'XAttrMetadata'}) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c784eedb9..161c623eb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -151,8 +151,14 @@ class InfoExtractor(object): If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location where the video was filmed. - subtitles: The subtitle file contents as a dictionary in the format - {language: subtitles}. + subtitles: The available subtitles as a dictionary in the format + {language: subformats}. "subformats" is a list sorted from + lower to higher preference, each element is a dictionary + with the "ext" entry and one of: + * "data": The subtitles file contents + * "url": A url pointing to the subtitles file + Note: YoutubeDL.extract_info will get the requested + format and replace the "subformats" list with it. duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video @@ -993,6 +999,16 @@ class InfoExtractor(object): any_restricted = any_restricted or is_restricted return not any_restricted + def extract_subtitles(self, *args, **kwargs): + subtitles = {} + list_subtitles = self._downloader.params.get('listsubtitles') + if self._downloader.params.get('writesubtitles', False) or list_subtitles: + subtitles.update(self._get_subtitles(*args, **kwargs)) + return subtitles + + def _get_subtitles(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 10b3b706a..1809eaae4 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals import json import re -from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor from ..compat import ( compat_str, ) -class TEDIE(SubtitlesInfoExtractor): +class TEDIE(InfoExtractor): _VALID_URL = r'''(?x) (?Phttps?://) (?Pwww|embed(?:-ssl)?)(?P\.ted\.com/ @@ -165,9 +165,6 @@ class TEDIE(SubtitlesInfoExtractor): video_id = compat_str(talk_info['id']) # subtitles video_subtitles = self.extract_subtitles(video_id, talk_info) - if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id, talk_info) - return thumbnail = talk_info['thumb'] if not thumbnail.startswith('http'): @@ -183,13 +180,18 @@ class TEDIE(SubtitlesInfoExtractor): 'duration': talk_info.get('duration'), } - def _get_available_subtitles(self, video_id, talk_info): + def _get_subtitles(self, video_id, talk_info): languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] if languages: sub_lang_list = {} for l in languages: - url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) - sub_lang_list[l] = url + sub_lang_list[l] = [ + { + 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext), + 'ext': ext, + } + for ext in ['ted', 'srt'] + ] return sub_lang_list else: self._downloader.report_warning('video doesn\'t have subtitles') diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 873432bee..4fcf8c83d 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -387,8 +387,8 @@ def parseOpts(overrideArguments=None): help='lists all available subtitles for the video') subtitles.add_option( '--sub-format', - action='store', dest='subtitlesformat', metavar='FORMAT', default='srt', - help='subtitle format (default=srt) ([sbv/vtt] youtube only)') + action='store', dest='subtitlesformat', metavar='FORMAT', default='best', + help='subtitle format, accepts formats preference, for example: "ass/srt/best"') subtitles.add_option( '--sub-lang', '--sub-langs', '--srt-lang', action='callback', dest='subtitleslangs', metavar='LANGS', type='str', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 5238ce534..d1bbfbfe3 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -453,10 +453,6 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): 'zu': 'zul', } - def __init__(self, downloader=None, subtitlesformat='srt'): - super(FFmpegEmbedSubtitlePP, self).__init__(downloader) - self._subformat = subtitlesformat - @classmethod def _conver_lang_code(cls, code): """Convert language code from ISO 639-1 to ISO 639-2/T""" @@ -472,7 +468,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): sub_langs = [key for key in information['subtitles']] filename = information['filepath'] - input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] + input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in information['subtitles'].items()] opts = [ '-map', '0',