Merge branch 'subtitles-rework'

(Closes PR #4964)
pull/5055/head
Jaime Marquínez Ferrándiz 9 years ago
commit bfc993cc91

@ -43,7 +43,7 @@ test:
ot: offlinetest ot: offlinetest
offlinetest: codetest offlinetest: codetest
nosetests --verbose test --exclude test_download --exclude test_age_restriction --exclude test_subtitles --exclude test_write_annotations --exclude test_youtube_lists nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py
tar: youtube-dl.tar.gz tar: youtube-dl.tar.gz

@ -28,7 +28,7 @@
"retries": 10, "retries": 10,
"simulate": false, "simulate": false,
"subtitleslang": null, "subtitleslang": null,
"subtitlesformat": "srt", "subtitlesformat": "best",
"test": true, "test": true,
"updatetime": true, "updatetime": true,
"usenetrc": false, "usenetrc": false,

@ -337,6 +337,65 @@ class TestFormatSelection(unittest.TestCase):
downloaded = ydl.downloaded_info_dicts[0] downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'G') self.assertEqual(downloaded['format_id'], 'G')
def test_subtitles(self):
    """Check which subtitles process_video_result() marks as requested.

    The info dict offers manual subtitles for en/fr/es and automatic
    captions for it/pt/es, each available as vtt, srt and ass (in that
    order).  The assertions cover the default language, format
    preference strings, explicit language lists and the interaction
    between 'writesubtitles' and 'writeautomaticsub'.
    """
    def s_formats(lang, autocaption=False):
        # One format entry per extension; '_auto' records whether the
        # entry came from the automatic captions.
        return [{
            'ext': ext,
            'url': 'http://localhost/video.%s.%s' % (lang, ext),
            '_auto': autocaption,
        } for ext in ['vtt', 'srt', 'ass']]
    subtitles = dict((l, s_formats(l)) for l in ['en', 'fr', 'es'])
    auto_captions = dict((l, s_formats(l, True)) for l in ['it', 'pt', 'es'])
    info_dict = {
        'id': 'test',
        'title': 'Test',
        'url': 'http://localhost/video.mp4',
        'subtitles': subtitles,
        'automatic_captions': auto_captions,
        'extractor': 'TEST',
    }

    def get_info(params=None):
        # Work on a private copy: a mutable default argument (or the
        # caller's dict) must not be modified by setdefault(), otherwise
        # state could leak from one call into the next.
        params = dict(params or {})
        params.setdefault('simulate', True)
        ydl = YDL(params)
        # Silence warnings about unavailable languages/formats.
        ydl.report_warning = lambda *args, **kargs: None
        return ydl.process_video_result(info_dict, download=False)

    # No subtitle option set: nothing is requested, inputs are kept.
    result = get_info()
    self.assertFalse(result.get('requested_subtitles'))
    self.assertEqual(result['subtitles'], subtitles)
    self.assertEqual(result['automatic_captions'], auto_captions)

    # Only 'writesubtitles': English, last listed format, not downloaded.
    result = get_info({'writesubtitles': True})
    subs = result['requested_subtitles']
    self.assertTrue(subs)
    self.assertEqual(set(subs.keys()), set(['en']))
    self.assertTrue(subs['en'].get('data') is None)
    self.assertEqual(subs['en']['ext'], 'ass')

    # Format preference list: unknown 'foo' is skipped, 'srt' is used.
    result = get_info({'writesubtitles': True, 'subtitlesformat': 'foo/srt'})
    subs = result['requested_subtitles']
    self.assertEqual(subs['en']['ext'], 'srt')

    # 'it' only exists as automatic caption, so it is not returned.
    result = get_info({'writesubtitles': True, 'subtitleslangs': ['es', 'fr', 'it']})
    subs = result['requested_subtitles']
    self.assertTrue(subs)
    self.assertEqual(set(subs.keys()), set(['es', 'fr']))

    # Both options: manual subtitles win for 'es', captions fill in 'pt'.
    result = get_info({'writesubtitles': True, 'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']})
    subs = result['requested_subtitles']
    self.assertTrue(subs)
    self.assertEqual(set(subs.keys()), set(['es', 'pt']))
    self.assertFalse(subs['es']['_auto'])
    self.assertTrue(subs['pt']['_auto'])

    # Only automatic captions requested: both languages come from them.
    result = get_info({'writeautomaticsub': True, 'subtitleslangs': ['es', 'pt']})
    subs = result['requested_subtitles']
    self.assertTrue(subs)
    self.assertEqual(set(subs.keys()), set(['es', 'pt']))
    self.assertTrue(subs['es']['_auto'])
    self.assertTrue(subs['pt']['_auto'])
def test_add_extra_info(self): def test_add_extra_info(self):
test_dict = { test_dict = {
'extractor': 'Foo', 'extractor': 'Foo',

@ -18,6 +18,13 @@ from youtube_dl.extractor import (
VimeoIE, VimeoIE,
WallaIE, WallaIE,
CeskaTelevizeIE, CeskaTelevizeIE,
LyndaIE,
NPOIE,
ComedyCentralIE,
NRKTVIE,
RaiIE,
VikiIE,
ThePlatformIE,
) )
@ -27,42 +34,38 @@ class BaseTestSubtitles(unittest.TestCase):
def setUp(self): def setUp(self):
self.DL = FakeYDL() self.DL = FakeYDL()
self.ie = self.IE(self.DL) self.ie = self.IE()
self.DL.add_info_extractor(self.ie)
def getInfoDict(self): def getInfoDict(self):
info_dict = self.ie.extract(self.url) info_dict = self.DL.extract_info(self.url, download=False)
return info_dict return info_dict
def getSubtitles(self): def getSubtitles(self):
info_dict = self.getInfoDict() info_dict = self.getInfoDict()
return info_dict['subtitles'] subtitles = info_dict['requested_subtitles']
if not subtitles:
return subtitles
for sub_info in subtitles.values():
if sub_info.get('data') is None:
uf = self.DL.urlopen(sub_info['url'])
sub_info['data'] = uf.read().decode('utf-8')
return dict((l, sub_info['data']) for l, sub_info in subtitles.items())
class TestYoutubeSubtitles(BaseTestSubtitles): class TestYoutubeSubtitles(BaseTestSubtitles):
url = 'QRS8MkLhQmM' url = 'QRS8MkLhQmM'
IE = YoutubeIE IE = YoutubeIE
def test_youtube_no_writesubtitles(self):
self.DL.params['writesubtitles'] = False
subtitles = self.getSubtitles()
self.assertEqual(subtitles, None)
def test_youtube_subtitles(self):
self.DL.params['writesubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260')
def test_youtube_subtitles_lang(self):
self.DL.params['writesubtitles'] = True
self.DL.params['subtitleslangs'] = ['it']
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
def test_youtube_allsubtitles(self): def test_youtube_allsubtitles(self):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(len(subtitles.keys()), 13) self.assertEqual(len(subtitles.keys()), 13)
self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260')
self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
for lang in ['it', 'fr', 'de']:
self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
def test_youtube_subtitles_sbv_format(self): def test_youtube_subtitles_sbv_format(self):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
@ -76,12 +79,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06')
def test_youtube_list_subtitles(self):
self.DL.expect_warning('Video doesn\'t have automatic captions')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_youtube_automatic_captions(self): def test_youtube_automatic_captions(self):
self.url = '8YoUxe5ncPo' self.url = '8YoUxe5ncPo'
self.DL.params['writeautomaticsub'] = True self.DL.params['writeautomaticsub'] = True
@ -103,55 +100,22 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(len(subtitles), 0) self.assertFalse(subtitles)
def test_youtube_multiple_langs(self):
self.url = 'QRS8MkLhQmM'
self.DL.params['writesubtitles'] = True
langs = ['it', 'fr', 'de']
self.DL.params['subtitleslangs'] = langs
subtitles = self.getSubtitles()
for lang in langs:
self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
class TestDailymotionSubtitles(BaseTestSubtitles): class TestDailymotionSubtitles(BaseTestSubtitles):
url = 'http://www.dailymotion.com/video/xczg00' url = 'http://www.dailymotion.com/video/xczg00'
IE = DailymotionIE IE = DailymotionIE
def test_no_writesubtitles(self):
subtitles = self.getSubtitles()
self.assertEqual(subtitles, None)
def test_subtitles(self):
self.DL.params['writesubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f')
def test_subtitles_lang(self):
self.DL.params['writesubtitles'] = True
self.DL.params['subtitleslangs'] = ['fr']
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792')
def test_allsubtitles(self): def test_allsubtitles(self):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertTrue(len(subtitles.keys()) >= 6) self.assertTrue(len(subtitles.keys()) >= 6)
self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f')
def test_list_subtitles(self): self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792')
self.DL.expect_warning('Automatic Captions not supported by this server') for lang in ['es', 'fr', 'de']:
self.DL.params['listsubtitles'] = True self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_automatic_captions(self):
self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writeautomaticsub'] = True
self.DL.params['subtitleslang'] = ['en']
subtitles = self.getSubtitles()
self.assertTrue(len(subtitles.keys()) == 0)
def test_nosubtitles(self): def test_nosubtitles(self):
self.DL.expect_warning('video doesn\'t have subtitles') self.DL.expect_warning('video doesn\'t have subtitles')
@ -159,61 +123,21 @@ class TestDailymotionSubtitles(BaseTestSubtitles):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(len(subtitles), 0) self.assertFalse(subtitles)
def test_multiple_langs(self):
self.DL.params['writesubtitles'] = True
langs = ['es', 'fr', 'de']
self.DL.params['subtitleslangs'] = langs
subtitles = self.getSubtitles()
for lang in langs:
self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
class TestTedSubtitles(BaseTestSubtitles): class TestTedSubtitles(BaseTestSubtitles):
url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html'
IE = TEDIE IE = TEDIE
def test_no_writesubtitles(self):
subtitles = self.getSubtitles()
self.assertEqual(subtitles, None)
def test_subtitles(self):
self.DL.params['writesubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14')
def test_subtitles_lang(self):
self.DL.params['writesubtitles'] = True
self.DL.params['subtitleslangs'] = ['fr']
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5')
def test_allsubtitles(self): def test_allsubtitles(self):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertTrue(len(subtitles.keys()) >= 28) self.assertTrue(len(subtitles.keys()) >= 28)
self.assertEqual(md5(subtitles['en']), '4262c1665ff928a2dada178f62cb8d14')
def test_list_subtitles(self): self.assertEqual(md5(subtitles['fr']), '66a63f7f42c97a50f8c0e90bc7797bb5')
self.DL.expect_warning('Automatic Captions not supported by this server') for lang in ['es', 'fr', 'de']:
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_automatic_captions(self):
self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writeautomaticsub'] = True
self.DL.params['subtitleslang'] = ['en']
subtitles = self.getSubtitles()
self.assertTrue(len(subtitles.keys()) == 0)
def test_multiple_langs(self):
self.DL.params['writesubtitles'] = True
langs = ['es', 'fr', 'de']
self.DL.params['subtitleslangs'] = langs
subtitles = self.getSubtitles()
for lang in langs:
self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
@ -221,14 +145,7 @@ class TestBlipTVSubtitles(BaseTestSubtitles):
url = 'http://blip.tv/a/a-6603250' url = 'http://blip.tv/a/a-6603250'
IE = BlipTVIE IE = BlipTVIE
def test_list_subtitles(self):
self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_allsubtitles(self): def test_allsubtitles(self):
self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
@ -240,39 +157,13 @@ class TestVimeoSubtitles(BaseTestSubtitles):
url = 'http://vimeo.com/76979871' url = 'http://vimeo.com/76979871'
IE = VimeoIE IE = VimeoIE
def test_no_writesubtitles(self):
subtitles = self.getSubtitles()
self.assertEqual(subtitles, None)
def test_subtitles(self):
self.DL.params['writesubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888')
def test_subtitles_lang(self):
self.DL.params['writesubtitles'] = True
self.DL.params['subtitleslangs'] = ['fr']
subtitles = self.getSubtitles()
self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8')
def test_allsubtitles(self): def test_allsubtitles(self):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr'])) self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr']))
self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888')
def test_list_subtitles(self): self.assertEqual(md5(subtitles['fr']), 'b6191146a6c5d3a452244d853fde6dc8')
self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_automatic_captions(self):
self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writeautomaticsub'] = True
self.DL.params['subtitleslang'] = ['en']
subtitles = self.getSubtitles()
self.assertTrue(len(subtitles.keys()) == 0)
def test_nosubtitles(self): def test_nosubtitles(self):
self.DL.expect_warning('video doesn\'t have subtitles') self.DL.expect_warning('video doesn\'t have subtitles')
@ -280,27 +171,13 @@ class TestVimeoSubtitles(BaseTestSubtitles):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(len(subtitles), 0) self.assertFalse(subtitles)
def test_multiple_langs(self):
self.DL.params['writesubtitles'] = True
langs = ['es', 'fr', 'de']
self.DL.params['subtitleslangs'] = langs
subtitles = self.getSubtitles()
for lang in langs:
self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
class TestWallaSubtitles(BaseTestSubtitles): class TestWallaSubtitles(BaseTestSubtitles):
url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' url = 'http://vod.walla.co.il/movie/2705958/the-yes-men'
IE = WallaIE IE = WallaIE
def test_list_subtitles(self):
self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_allsubtitles(self): def test_allsubtitles(self):
self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
@ -315,19 +192,13 @@ class TestWallaSubtitles(BaseTestSubtitles):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(len(subtitles), 0) self.assertFalse(subtitles)
class TestCeskaTelevizeSubtitles(BaseTestSubtitles): class TestCeskaTelevizeSubtitles(BaseTestSubtitles):
url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky'
IE = CeskaTelevizeIE IE = CeskaTelevizeIE
def test_list_subtitles(self):
self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_allsubtitles(self): def test_allsubtitles(self):
self.DL.expect_warning('Automatic Captions not supported by this server') self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
@ -342,7 +213,96 @@ class TestCeskaTelevizeSubtitles(BaseTestSubtitles):
self.DL.params['writesubtitles'] = True self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles() subtitles = self.getSubtitles()
self.assertEqual(len(subtitles), 0) self.assertFalse(subtitles)
class TestLyndaSubtitles(BaseTestSubtitles):
    """Subtitle test case using LyndaIE on a lynda.com tutorial URL."""

    url = 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html'
    IE = LyndaIE

    def test_allsubtitles(self):
        """Requesting all subtitles yields exactly 'en' with the expected md5."""
        for option in ('writesubtitles', 'allsubtitles'):
            self.DL.params[option] = True
        subs = self.getSubtitles()
        self.assertEqual(set(subs.keys()), set(['en']))
        self.assertEqual(md5(subs['en']), '09bbe67222259bed60deaa26997d73a7')
class TestNPOSubtitles(BaseTestSubtitles):
    """Subtitle test case using NPOIE on an npo.nl broadcast URL."""

    url = 'http://www.npo.nl/nos-journaal/28-08-2014/POW_00722860'
    IE = NPOIE

    def test_allsubtitles(self):
        """Requesting all subtitles yields exactly 'nl' with the expected md5."""
        for option in ('writesubtitles', 'allsubtitles'):
            self.DL.params[option] = True
        subs = self.getSubtitles()
        self.assertEqual(set(subs.keys()), set(['nl']))
        self.assertEqual(md5(subs['nl']), 'fc6435027572b63fb4ab143abd5ad3f4')
class TestMTVSubtitles(BaseTestSubtitles):
    """Subtitle test case using ComedyCentralIE on a cc.com clip URL."""

    url = 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother'
    IE = ComedyCentralIE

    def getInfoDict(self):
        """Return the first entry of the result produced by the base class."""
        result = super(TestMTVSubtitles, self).getInfoDict()
        return result['entries'][0]

    def test_allsubtitles(self):
        """Requesting all subtitles yields exactly 'en' with the expected md5."""
        for option in ('writesubtitles', 'allsubtitles'):
            self.DL.params[option] = True
        subs = self.getSubtitles()
        self.assertEqual(set(subs.keys()), set(['en']))
        self.assertEqual(md5(subs['en']), 'b9f6ca22a6acf597ec76f61749765e65')
class TestNRKSubtitles(BaseTestSubtitles):
    """Subtitle test case using NRKTVIE on a tv.nrk.no episode URL."""

    url = 'http://tv.nrk.no/serie/ikke-gjoer-dette-hjemme/DMPV73000411/sesong-2/episode-1'
    IE = NRKTVIE

    def test_allsubtitles(self):
        """Requesting all subtitles yields exactly 'no' with the expected md5."""
        for option in ('writesubtitles', 'allsubtitles'):
            self.DL.params[option] = True
        subs = self.getSubtitles()
        self.assertEqual(set(subs.keys()), set(['no']))
        self.assertEqual(md5(subs['no']), '1d221e6458c95c5494dcd38e6a1f129a')
class TestRaiSubtitles(BaseTestSubtitles):
    """Subtitle test case using RaiIE on a rai.tv content item URL."""

    url = 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html'
    IE = RaiIE

    def test_allsubtitles(self):
        """Requesting all subtitles yields exactly 'it' with the expected md5."""
        for option in ('writesubtitles', 'allsubtitles'):
            self.DL.params[option] = True
        subs = self.getSubtitles()
        self.assertEqual(set(subs.keys()), set(['it']))
        self.assertEqual(md5(subs['it']), 'b1d90a98755126b61e667567a1f6680a')
class TestVikiSubtitles(BaseTestSubtitles):
    """Subtitle test case using VikiIE on a viki.com episode URL."""

    url = 'http://www.viki.com/videos/1060846v-punch-episode-18'
    IE = VikiIE

    def test_allsubtitles(self):
        """Requesting all subtitles yields exactly 'en' with the expected md5."""
        for option in ('writesubtitles', 'allsubtitles'):
            self.DL.params[option] = True
        subs = self.getSubtitles()
        self.assertEqual(set(subs.keys()), set(['en']))
        self.assertEqual(md5(subs['en']), '53cb083a5914b2d84ef1ab67b880d18a')
class TestThePlatformSubtitles(BaseTestSubtitles):
    """Subtitle test case using ThePlatformIE on a 'theplatform:' id URL."""

    # from http://www.3playmedia.com/services-features/tools/integrations/theplatform/
    # (see http://theplatform.com/about/partners/type/subtitles-closed-captioning/)
    url = 'theplatform:JFUjUE1_ehvq'
    IE = ThePlatformIE

    def test_allsubtitles(self):
        """Requesting all subtitles yields exactly 'en' with the expected md5."""
        for option in ('writesubtitles', 'allsubtitles'):
            self.DL.params[option] = True
        subs = self.getSubtitles()
        self.assertEqual(set(subs.keys()), set(['en']))
        self.assertEqual(md5(subs['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b')
if __name__ == '__main__': if __name__ == '__main__':

@ -154,7 +154,7 @@ class YoutubeDL(object):
allsubtitles: Downloads all the subtitles of the video allsubtitles: Downloads all the subtitles of the video
(requires writesubtitles or writeautomaticsub) (requires writesubtitles or writeautomaticsub)
listsubtitles: Lists all available subtitles for the video listsubtitles: Lists all available subtitles for the video
subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt) subtitlesformat: The format code for subtitles
subtitleslangs: List of languages of the subtitles to download subtitleslangs: List of languages of the subtitles to download
keepvideo: Keep the video file after post-processing keepvideo: Keep the video file after post-processing
daterange: A DateRange object, download only if the upload_date is in the range. daterange: A DateRange object, download only if the upload_date is in the range.
@ -1008,6 +1008,15 @@ class YoutubeDL(object):
info_dict['timestamp']) info_dict['timestamp'])
info_dict['upload_date'] = upload_date.strftime('%Y%m%d') info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
if self.params.get('listsubtitles', False):
if 'automatic_captions' in info_dict:
self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
return
info_dict['requested_subtitles'] = self.process_subtitles(
info_dict['id'], info_dict.get('subtitles'),
info_dict.get('automatic_captions'))
# This extractors handle format selection themselves # This extractors handle format selection themselves
if info_dict['extractor'] in ['Youku']: if info_dict['extractor'] in ['Youku']:
if download: if download:
@ -1136,6 +1145,55 @@ class YoutubeDL(object):
info_dict.update(formats_to_download[-1]) info_dict.update(formats_to_download[-1])
return info_dict return info_dict
def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
    """Select the requested subtitles and their format.

    video_id is only used in warning messages.  normal_subtitles and
    automatic_captions each map a language code to a list of format
    dicts (every dict needs an 'ext' key); either may be None or empty.
    The last entry of a format list is the one picked for 'best'.

    Manual subtitles are considered when the 'writesubtitles' param is
    set, automatic captions when 'writeautomaticsub' is set; for a
    language present in both, the manual subtitles win.

    Returns a dict mapping each selected language to the chosen format
    dict, or None when nothing is requested or nothing is available.
    Languages that are unavailable (or have an empty format list) are
    reported with a warning and left out of the result.
    """
    available_subs = {}
    if normal_subtitles and self.params.get('writesubtitles'):
        available_subs.update(normal_subtitles)
    if automatic_captions and self.params.get('writeautomaticsub'):
        for lang, cap_info in automatic_captions.items():
            if lang not in available_subs:
                available_subs[lang] = cap_info

    # available_subs is only filled when one of the write options is set,
    # so an empty dict covers both "not requested" and "nothing available".
    if not available_subs:
        return None

    if self.params.get('allsubtitles', False):
        requested_langs = available_subs.keys()
    elif self.params.get('subtitleslangs', False):
        requested_langs = self.params.get('subtitleslangs')
    elif 'en' in available_subs:
        requested_langs = ['en']
    else:
        # No explicit choice and no English: fall back to the first
        # language the dict yields.
        requested_langs = [next(iter(available_subs))]

    formats_query = self.params.get('subtitlesformat', 'best')
    # A query such as 'ass/srt/best' lists extensions in preference order.
    formats_preference = formats_query.split('/') if formats_query else []
    subs = {}
    for lang in requested_langs:
        formats = available_subs.get(lang)
        # An empty list is handled like a missing language; indexing it
        # with [-1] below would raise IndexError.
        if not formats:
            self.report_warning('%s subtitles not available for %s' % (lang, video_id))
            continue
        for ext in formats_preference:
            if ext == 'best':
                chosen = formats[-1]
                break
            matches = [fmt for fmt in formats if fmt['ext'] == ext]
            if matches:
                chosen = matches[-1]
                break
        else:
            # No preference matched: use the last format but tell the user.
            chosen = formats[-1]
            self.report_warning(
                'No subtitle format found matching "%s" for language %s, '
                'using %s' % (formats_query, lang, chosen['ext']))
        subs[lang] = chosen
    return subs
def process_info(self, info_dict): def process_info(self, info_dict):
"""Process a single resolved IE result.""" """Process a single resolved IE result."""
@ -1238,15 +1296,22 @@ class YoutubeDL(object):
subtitles_are_requested = any([self.params.get('writesubtitles', False), subtitles_are_requested = any([self.params.get('writesubtitles', False),
self.params.get('writeautomaticsub')]) self.params.get('writeautomaticsub')])
if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']: if subtitles_are_requested and info_dict.get('requested_subtitles'):
# subtitles download errors are already managed as troubles in relevant IE # subtitles download errors are already managed as troubles in relevant IE
# that way it will silently go on when used with unsupporting IE # that way it will silently go on when used with unsupporting IE
subtitles = info_dict['subtitles'] subtitles = info_dict['requested_subtitles']
sub_format = self.params.get('subtitlesformat', 'srt') for sub_lang, sub_info in subtitles.items():
for sub_lang in subtitles.keys(): sub_format = sub_info['ext']
sub = subtitles[sub_lang] if sub_info.get('data') is not None:
if sub is None: sub_data = sub_info['data']
continue else:
try:
uf = self.urlopen(sub_info['url'])
sub_data = uf.read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self.report_warning('Unable to download subtitle for "%s": %s' %
(sub_lang, compat_str(err)))
continue
try: try:
sub_filename = subtitles_filename(filename, sub_lang, sub_format) sub_filename = subtitles_filename(filename, sub_lang, sub_format)
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
@ -1254,7 +1319,7 @@ class YoutubeDL(object):
else: else:
self.to_screen('[info] Writing video subtitles to: ' + sub_filename) self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
subfile.write(sub) subfile.write(sub_data)
except (OSError, IOError): except (OSError, IOError):
self.report_error('Cannot write subtitles file ' + sub_filename) self.report_error('Cannot write subtitles file ' + sub_filename)
return return
@ -1564,6 +1629,17 @@ class YoutubeDL(object):
['ID', 'width', 'height', 'URL'], ['ID', 'width', 'height', 'URL'],
[[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
def list_subtitles(self, video_id, subtitles, name='subtitles'):
    """Print the available formats of every subtitle language.

    subtitles maps a language code to a list of format dicts with an
    'ext' key; name is the label used in the printed messages.  When
    subtitles is empty or None only a "has no ..." notice is printed.
    """
    if subtitles:
        self.to_screen('Available %s for %s:' % (name, video_id))
        rows = []
        for lang, formats in subtitles.items():
            # Extensions are shown in reverse list order.
            extensions = [f['ext'] for f in reversed(formats)]
            rows.append([lang, ', '.join(extensions)])
        self.to_screen(render_table(['Language', 'formats'], rows))
        return
    self.to_screen('%s has no %s' % (video_id, name))
def urlopen(self, req): def urlopen(self, req):
""" Start an HTTP download """ """ Start an HTTP download """

@ -226,7 +226,6 @@ def _real_main(argv=None):
if opts.embedsubtitles: if opts.embedsubtitles:
postprocessors.append({ postprocessors.append({
'key': 'FFmpegEmbedSubtitle', 'key': 'FFmpegEmbedSubtitle',
'subtitlesformat': opts.subtitlesformat,
}) })
if opts.xattrs: if opts.xattrs:
postprocessors.append({'key': 'XAttrMetadata'}) postprocessors.append({'key': 'XAttrMetadata'})

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import time import time
import hmac import hmac
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
compat_urllib_parse, compat_urllib_parse,
@ -17,7 +17,7 @@ from ..utils import (
) )
class AtresPlayerIE(SubtitlesInfoExtractor): class AtresPlayerIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html' _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html'
_TESTS = [ _TESTS = [
{ {
@ -144,13 +144,12 @@ class AtresPlayerIE(SubtitlesInfoExtractor):
thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail') thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail')
subtitles = {} subtitles = {}
subtitle = xpath_text(episode, './media/asset/files/subtitle', 'subtitle') subtitle_url = xpath_text(episode, './media/asset/files/subtitle', 'subtitle')
if subtitle: if subtitle_url:
subtitles['es'] = subtitle subtitles['es'] = [{
'ext': 'srt',
if self._downloader.params.get('listsubtitles', False): 'url': subtitle_url,
self._list_available_subtitles(video_id, subtitles) }]
return
return { return {
'id': video_id, 'id': video_id,
@ -159,5 +158,5 @@ class AtresPlayerIE(SubtitlesInfoExtractor):
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'duration': duration, 'duration': duration,
'formats': formats, 'formats': formats,
'subtitles': self.extract_subtitles(video_id, subtitles), 'subtitles': subtitles,
} }

@ -2,12 +2,12 @@ from __future__ import unicode_literals
import xml.etree.ElementTree import xml.etree.ElementTree
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor
from ..utils import ExtractorError from ..utils import ExtractorError
from ..compat import compat_HTTPError from ..compat import compat_HTTPError
class BBCCoUkIE(SubtitlesInfoExtractor): class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk' IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer' IE_DESC = 'BBC iPlayer'
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
@ -215,17 +215,32 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
formats.extend(conn_formats) formats.extend(conn_formats)
return formats return formats
def _extract_captions(self, media, programme_id): def _get_subtitles(self, media, programme_id):
subtitles = {} subtitles = {}
for connection in self._extract_connections(media): for connection in self._extract_connections(media):
captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}')) ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
srt = '' srt = ''
def _extract_text(p):
    """Return the caption text of a TTML <p> element.

    The element's own stripped text is used when it is non-empty;
    otherwise the stripped texts of its direct ttaf1 <span> children are
    joined with single spaces.
    """
    if p.text is not None:
        stripped_text = p.text.strip()
        if stripped_text:
            return stripped_text
    # A <span> without character data (e.g. <span/>) has text None;
    # skip it instead of failing on None.strip().
    return ' '.join(
        span.text.strip()
        for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span')
        if span.text is not None)
for pos, p in enumerate(ps): for pos, p in enumerate(ps):
srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
p.text.strip() if p.text is not None else '') subtitles[lang] = [
subtitles[lang] = srt {
'url': connection.get('href'),
'ext': 'ttml',
},
{
'data': srt,
'ext': 'srt',
},
]
return subtitles return subtitles
def _download_media_selector(self, programme_id): def _download_media_selector(self, programme_id):
@ -249,7 +264,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
elif kind == 'video': elif kind == 'video':
formats.extend(self._extract_video(media, programme_id)) formats.extend(self._extract_video(media, programme_id))
elif kind == 'captions': elif kind == 'captions':
subtitles = self._extract_captions(media, programme_id) subtitles = self.extract_subtitles(media, programme_id)
return formats, subtitles return formats, subtitles
@ -324,10 +339,6 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
else: else:
programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(programme_id, subtitles)
return
self._sort_formats(formats) self._sort_formats(formats)
return { return {

@ -3,7 +3,6 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
@ -18,7 +17,7 @@ from ..utils import (
) )
class BlipTVIE(SubtitlesInfoExtractor): class BlipTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_]+)))' _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_]+)))'
_TESTS = [ _TESTS = [
@ -143,7 +142,7 @@ class BlipTVIE(SubtitlesInfoExtractor):
categories = [category.text for category in item.findall('category')] categories = [category.text for category in item.findall('category')]
formats = [] formats = []
subtitles = {} subtitles_urls = {}
media_group = item.find(media('group')) media_group = item.find(media('group'))
for media_content in media_group.findall(media('content')): for media_content in media_group.findall(media('content')):
@ -161,7 +160,7 @@ class BlipTVIE(SubtitlesInfoExtractor):
} }
lang = role.rpartition('-')[-1].strip().lower() lang = role.rpartition('-')[-1].strip().lower()
langcode = LANGS.get(lang, lang) langcode = LANGS.get(lang, lang)
subtitles[langcode] = url subtitles_urls[langcode] = url
elif media_type.startswith('video/'): elif media_type.startswith('video/'):
formats.append({ formats.append({
'url': real_url, 'url': real_url,
@ -175,11 +174,7 @@ class BlipTVIE(SubtitlesInfoExtractor):
}) })
self._sort_formats(formats) self._sort_formats(formats)
# subtitles subtitles = self.extract_subtitles(video_id, subtitles_urls)
video_subtitles = self.extract_subtitles(video_id, subtitles)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, subtitles)
return
return { return {
'id': video_id, 'id': video_id,
@ -192,15 +187,22 @@ class BlipTVIE(SubtitlesInfoExtractor):
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'categories': categories, 'categories': categories,
'formats': formats, 'formats': formats,
'subtitles': video_subtitles, 'subtitles': subtitles,
} }
def _get_subtitles(self, video_id, subtitles_urls):
    """Download every subtitle track listed in ``subtitles_urls``.

    ``subtitles_urls`` maps a language code to the track URL. The result
    maps each language to a one-element list holding the downloaded
    contents as inline 'data'.
    """
    def fetch(sub_url):
        # For some weird reason, blip.tv serves a video instead of subtitles
        # when we request with a common UA
        request = compat_urllib_request.Request(sub_url)
        request.add_header('User-Agent', 'youtube-dl')
        return self._download_webpage(request, None, note=False)

    return dict(
        # The extension is 'srt' but it's actually an 'ass' file
        (language, [{'ext': 'ass', 'data': fetch(sub_url)}])
        for language, sub_url in subtitles_urls.items())
class BlipTVUserIE(InfoExtractor): class BlipTVUserIE(InfoExtractor):

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re import re
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_urllib_request, compat_urllib_request,
compat_urllib_parse, compat_urllib_parse,
@ -15,7 +15,7 @@ from ..utils import (
) )
class CeskaTelevizeIE(SubtitlesInfoExtractor): class CeskaTelevizeIE(InfoExtractor):
_VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)' _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'
_TESTS = [ _TESTS = [
@ -107,13 +107,7 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor):
subtitles = {} subtitles = {}
subs = item.get('subtitles') subs = item.get('subtitles')
if subs: if subs:
subtitles['cs'] = subs[0]['url'] subtitles = self.extract_subtitles(episode_id, subs)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, subtitles)
return
subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles))
return { return {
'id': episode_id, 'id': episode_id,
@ -125,11 +119,20 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor):
'subtitles': subtitles, 'subtitles': subtitles,
} }
def _get_subtitles(self, episode_id, subs):
    """Fetch the first subtitle track in ``subs`` and expose it as Czech SRT.

    Only ``subs[0]['url']`` is downloaded; the text is passed through
    ``_fix_subtitles`` before being returned as inline 'data'.
    """
    first_track_url = subs[0]['url']
    raw_text = self._download_webpage(
        first_track_url, episode_id, 'Downloading subtitles')
    czech_track = {
        'ext': 'srt',
        'data': self._fix_subtitles(raw_text),
    }
    return {'cs': [czech_track]}
@staticmethod @staticmethod
def _fix_subtitles(subtitles): def _fix_subtitles(subtitles):
""" Convert millisecond-based subtitles to SRT """ """ Convert millisecond-based subtitles to SRT """
if subtitles is None:
return subtitles # subtitles not requested
def _msectotimecode(msec): def _msectotimecode(msec):
""" Helper utility to convert milliseconds to timecode """ """ Helper utility to convert milliseconds to timecode """
@ -149,7 +152,4 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor):
else: else:
yield line yield line
fixed_subtitles = {} return "\r\n".join(_fix_subtitle(subtitles))
for k, v in subtitles.items():
fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v))
return fixed_subtitles

@ -150,8 +150,14 @@ class InfoExtractor(object):
If not explicitly set, calculated from timestamp. If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader. uploader_id: Nickname or id of the video uploader.
location: Physical location where the video was filmed. location: Physical location where the video was filmed.
subtitles: The subtitle file contents as a dictionary in the format subtitles: The available subtitles as a dictionary in the format
{language: subtitles}. {language: subformats}. "subformats" is a list sorted from
lower to higher preference, each element is a dictionary
with the "ext" entry and one of:
* "data": The subtitles file contents
* "url": A url pointing to the subtitles file
automatic_captions: Like 'subtitles', used by the YoutubeIE for
automatically generated captions
duration: Length of the video in seconds, as an integer. duration: Length of the video in seconds, as an integer.
view_count: How many users have watched the video on the platform. view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video like_count: Number of positive ratings of the video
@ -1011,6 +1017,24 @@ class InfoExtractor(object):
any_restricted = any_restricted or is_restricted any_restricted = any_restricted or is_restricted
return not any_restricted return not any_restricted
def extract_subtitles(self, *args, **kwargs):
    """Return the subtitles from ``_get_subtitles`` if they were requested.

    ``_get_subtitles`` is called (with all arguments forwarded) only when
    the 'writesubtitles' or 'listsubtitles' downloader option is set;
    otherwise an empty dict is returned without calling it.
    """
    options = self._downloader.params
    requested = options.get('writesubtitles', False) or options.get('listsubtitles')
    if not requested:
        return {}
    return self._get_subtitles(*args, **kwargs)
def _get_subtitles(self, *args, **kwargs):
    """Hook called by extract_subtitles with the arguments it received.

    This base implementation always raises NotImplementedError.
    """
    raise NotImplementedError("This method must be implemented by subclasses")
def extract_automatic_captions(self, *args, **kwargs):
    """Return the captions from ``_get_automatic_captions`` if requested.

    ``_get_automatic_captions`` is called (with all arguments forwarded)
    only when the 'writeautomaticsub' or 'listsubtitles' downloader option
    is set; otherwise an empty dict is returned without calling it.
    """
    options = self._downloader.params
    requested = options.get('writeautomaticsub', False) or options.get('listsubtitles')
    if not requested:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
def _get_automatic_captions(self, *args, **kwargs):
    """Hook called by extract_automatic_captions with the arguments it received.

    This base implementation always raises NotImplementedError.
    """
    raise NotImplementedError("This method must be implemented by subclasses")
class SearchInfoExtractor(InfoExtractor): class SearchInfoExtractor(InfoExtractor):
""" """

@ -9,7 +9,7 @@ import xml.etree.ElementTree
from hashlib import sha1 from hashlib import sha1
from math import pow, sqrt, floor from math import pow, sqrt, floor
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_urllib_parse, compat_urllib_parse,
compat_urllib_request, compat_urllib_request,
@ -25,10 +25,9 @@ from ..aes import (
aes_cbc_decrypt, aes_cbc_decrypt,
inc, inc,
) )
from .common import InfoExtractor
class CrunchyrollIE(SubtitlesInfoExtractor): class CrunchyrollIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)' _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
_TESTS = [{ _TESTS = [{
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
@ -187,6 +186,38 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
return output return output
def _get_subtitles(self, video_id, webpage):
    """Download, decrypt and convert every subtitle script linked in ``webpage``.

    Each ``?ssid=<id>" title="<name>`` link found in the page is fetched
    from the subtitle RPC endpoint. Scripts missing an id, iv or data
    field, or whose decrypted text has no ``lang_code``, are skipped. Every
    remaining language maps to an 'srt' and an 'ass' rendering, both as
    inline data.
    """
    result = {}
    script_links = re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage)
    for sub_id, sub_name in script_links:
        sub_page = self._download_webpage(
            'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id,
            video_id, note='Downloading subtitles for ' + sub_name)
        raw_id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False)
        raw_iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False)
        raw_data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
        if not (raw_id and raw_iv and raw_data):
            continue
        decrypted = self._decrypt_subtitles(
            base64.b64decode(raw_data), base64.b64decode(raw_iv), int(raw_id))
        subtitle = decrypted.decode('utf-8')
        lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
        if not lang_code:
            continue
        sub_root = xml.etree.ElementTree.fromstring(subtitle)
        srt_track = {
            'ext': 'srt',
            'data': self._convert_subtitles_to_srt(sub_root),
        }
        ass_track = {
            'ext': 'ass',
            'data': self._convert_subtitles_to_ass(sub_root),
        }
        result[lang_code] = [srt_track, ass_track]
    return result
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id') video_id = mobj.group('video_id')
@ -249,34 +280,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'format_id': video_format, 'format_id': video_format,
}) })
subtitles = {} subtitles = self.extract_subtitles(video_id, webpage)
sub_format = self._downloader.params.get('subtitlesformat', 'srt')
for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
sub_page = self._download_webpage(
'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id,
video_id, note='Downloading subtitles for ' + sub_name)
id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False)
iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False)
data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
if not id or not iv or not data:
continue
id = int(id)
iv = base64.b64decode(iv)
data = base64.b64decode(data)
subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
if not lang_code:
continue
sub_root = xml.etree.ElementTree.fromstring(subtitle)
if sub_format == 'ass':
subtitles[lang_code] = self._convert_subtitles_to_ass(sub_root)
else:
subtitles[lang_code] = self._convert_subtitles_to_srt(sub_root)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, subtitles)
return
return { return {
'id': video_id, 'id': video_id,

@ -6,7 +6,6 @@ import json
import itertools import itertools
from .common import InfoExtractor from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
@ -31,7 +30,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
return request return request
class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor):
"""Information Extractor for Dailymotion""" """Information Extractor for Dailymotion"""
_VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)' _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
@ -143,9 +142,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
# subtitles # subtitles
video_subtitles = self.extract_subtitles(video_id, webpage) video_subtitles = self.extract_subtitles(video_id, webpage)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, webpage)
return
view_count = str_to_int(self._search_regex( view_count = str_to_int(self._search_regex(
r'video_views_count[^>]+>\s+([\d\.,]+)', r'video_views_count[^>]+>\s+([\d\.,]+)',
@ -169,7 +165,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
'view_count': view_count, 'view_count': view_count,
} }
def _get_available_subtitles(self, video_id, webpage): def _get_subtitles(self, video_id, webpage):
try: try:
sub_list = self._download_webpage( sub_list = self._download_webpage(
'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
@ -179,7 +175,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
return {} return {}
info = json.loads(sub_list) info = json.loads(sub_list)
if (info['total'] > 0): if (info['total'] > 0):
sub_lang_list = dict((l['language'], l['url']) for l in info['list']) sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list'])
return sub_lang_list return sub_lang_list
self._downloader.report_warning('video doesn\'t have subtitles') self._downloader.report_warning('video doesn\'t have subtitles')
return {} return {}

@ -1,11 +1,10 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor, ExtractorError
from .common import ExtractorError
from ..utils import parse_iso8601 from ..utils import parse_iso8601
class DRTVIE(SubtitlesInfoExtractor): class DRTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)' _VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)'
_TEST = { _TEST = {
@ -76,7 +75,7 @@ class DRTVIE(SubtitlesInfoExtractor):
} }
for subs in subtitles_list: for subs in subtitles_list:
lang = subs['Language'] lang = subs['Language']
subtitles[LANGS.get(lang, lang)] = subs['Uri'] subtitles[LANGS.get(lang, lang)] = [{'url': subs['Uri'], 'ext': 'vtt'}]
if not formats and restricted_to_denmark: if not formats and restricted_to_denmark:
raise ExtractorError( raise ExtractorError(
@ -84,10 +83,6 @@ class DRTVIE(SubtitlesInfoExtractor):
self._sort_formats(formats) self._sort_formats(formats)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, subtitles)
return
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
@ -96,5 +91,5 @@ class DRTVIE(SubtitlesInfoExtractor):
'timestamp': timestamp, 'timestamp': timestamp,
'duration': duration, 'duration': duration,
'formats': formats, 'formats': formats,
'subtitles': self.extract_subtitles(video_id, subtitles), 'subtitles': subtitles,
} }

@ -3,7 +3,6 @@ from __future__ import unicode_literals
import re import re
import json import json
from .subtitles import SubtitlesInfoExtractor
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
@ -16,7 +15,7 @@ from ..utils import (
) )
class LyndaIE(SubtitlesInfoExtractor): class LyndaIE(InfoExtractor):
IE_NAME = 'lynda' IE_NAME = 'lynda'
IE_DESC = 'lynda.com videos' IE_DESC = 'lynda.com videos'
_VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html' _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html'
@ -88,11 +87,7 @@ class LyndaIE(SubtitlesInfoExtractor):
self._check_formats(formats, video_id) self._check_formats(formats, video_id)
self._sort_formats(formats) self._sort_formats(formats)
if self._downloader.params.get('listsubtitles', False): subtitles = self.extract_subtitles(video_id, page)
self._list_available_subtitles(video_id, page)
return
subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page))
return { return {
'id': video_id, 'id': video_id,
@ -144,38 +139,31 @@ class LyndaIE(SubtitlesInfoExtractor):
if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
raise ExtractorError('Unable to log in') raise ExtractorError('Unable to log in')
def _fix_subtitles(self, subs):
    """Convert a Lynda transcript (list of cue dicts) to SRT text.

    Each entry provides a 'Timecode' and a 'Caption'. A cue is shown from
    its own timecode until the timecode of the following entry, so the last
    entry only serves as an end marker. Entries whose timecode (or whose
    successor's timecode) does not match ``_TIMECODE_REGEX``, and entries
    with an empty caption, are skipped.

    Returns the SRT text, or None when no cue could be produced.
    """
    cues = []
    for pos, (seq_current, seq_next) in enumerate(zip(subs, subs[1:])):
        m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
        if m_current is None:
            continue
        m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
        if m_next is None:
            continue
        text = seq_current['Caption'].strip()
        if not text:
            continue
        # Every SRT cue must end with a blank line, otherwise consecutive
        # cues run together.
        cues.append('%s\r\n%s --> %s\r\n%s\r\n\r\n' % (
            pos, m_current.group('timecode'), m_next.group('timecode'), text))
    return ''.join(cues) or None


def _get_subtitles(self, video_id, webpage):
    """Download the transcript of ``video_id`` and return it as English SRT.

    Returns ``{'en': [{'ext': 'srt', 'data': ...}]}``, or an empty dict when
    the transcript is empty or yields no usable cue. ``webpage`` is unused.
    """
    url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
    subs = self._download_json(url, None, False)
    if not subs:
        return {}
    srt = self._fix_subtitles(subs)
    # Do not advertise a subtitle track that has no contents.
    if not srt:
        return {}
    return {'en': [{'ext': 'srt', 'data': srt}]}
class LyndaCourseIE(InfoExtractor): class LyndaCourseIE(InfoExtractor):

@ -5,9 +5,6 @@ import json
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeIE from .youtube import YoutubeIE
from ..compat import (
compat_urlparse,
)
from ..utils import ( from ..utils import (
clean_html, clean_html,
ExtractorError, ExtractorError,
@ -108,7 +105,6 @@ class OCWMITIE(InfoExtractor):
'upload_date': '20121109', 'upload_date': '20121109',
'uploader_id': 'MIT', 'uploader_id': 'MIT',
'uploader': 'MIT OpenCourseWare', 'uploader': 'MIT OpenCourseWare',
# 'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
} }
}, },
{ {
@ -121,7 +117,6 @@ class OCWMITIE(InfoExtractor):
'uploader_id': 'MIT', 'uploader_id': 'MIT',
'uploader': 'MIT OpenCourseWare', 'uploader': 'MIT OpenCourseWare',
'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.', 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
# 'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
} }
} }
] ]
@ -140,7 +135,6 @@ class OCWMITIE(InfoExtractor):
metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1)) metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
metadata = re.split(r', ?', metadata) metadata = re.split(r', ?', metadata)
yt = metadata[1] yt = metadata[1]
subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7])
else: else:
# search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file) # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file)
embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage) embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
@ -148,7 +142,6 @@ class OCWMITIE(InfoExtractor):
metadata = re.sub(r'[\'"]', '', embed_media.group(1)) metadata = re.sub(r'[\'"]', '', embed_media.group(1))
metadata = re.split(r', ?', metadata) metadata = re.split(r', ?', metadata)
yt = metadata[1] yt = metadata[1]
subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5])
else: else:
raise ExtractorError('Unable to find embedded YouTube video.') raise ExtractorError('Unable to find embedded YouTube video.')
video_id = YoutubeIE.extract_id(yt) video_id = YoutubeIE.extract_id(yt)
@ -159,7 +152,5 @@ class OCWMITIE(InfoExtractor):
'title': title, 'title': title,
'description': description, 'description': description,
'url': yt, 'url': yt,
'url_transparent'
'subtitles': subs,
'ie_key': 'Youtube', 'ie_key': 'Youtube',
} }

@ -2,7 +2,7 @@ from __future__ import unicode_literals
import re import re
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_urllib_parse, compat_urllib_parse,
compat_urllib_request, compat_urllib_request,
@ -23,7 +23,7 @@ def _media_xml_tag(tag):
return '{http://search.yahoo.com/mrss/}%s' % tag return '{http://search.yahoo.com/mrss/}%s' % tag
class MTVServicesInfoExtractor(SubtitlesInfoExtractor): class MTVServicesInfoExtractor(InfoExtractor):
_MOBILE_TEMPLATE = None _MOBILE_TEMPLATE = None
@staticmethod @staticmethod
@ -95,25 +95,15 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor):
def _extract_subtitles(self, mdoc, mtvn_id):
    """List the caption tracks declared in ``mdoc``.

    Every ``<transcript>`` element whose ``kind`` is 'captions' contributes
    one language (its ``srclang``), mapped to one subformat per
    ``<typographic>`` child: 'url' from its ``src`` attribute and 'ext'
    from its ``format`` attribute. ``mtvn_id`` is unused.
    """
    def tracks_of(transcript):
        return [{
            'url': compat_str(typographic.get('src')),
            'ext': typographic.get('format')
        } for typographic in transcript.findall('./typographic')]

    return dict(
        (transcript.get('srclang'), tracks_of(transcript))
        for transcript in mdoc.findall('.//transcript')
        if transcript.get('kind') == 'captions')
def _get_video_info(self, itemdoc): def _get_video_info(self, itemdoc):
uri = itemdoc.find('guid').text uri = itemdoc.find('guid').text
@ -196,8 +186,6 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor):
webpage, 'mgid') webpage, 'mgid')
videos_info = self._get_videos_info(mgid) videos_info = self._get_videos_info(mgid)
if self._downloader.params.get('listsubtitles', False):
return
return videos_info return videos_info

@ -1,6 +1,5 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .subtitles import SubtitlesInfoExtractor
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
fix_xml_ampersands, fix_xml_ampersands,
@ -12,7 +11,7 @@ from ..utils import (
) )
class NPOBaseIE(SubtitlesInfoExtractor): class NPOBaseIE(InfoExtractor):
def _get_token(self, video_id): def _get_token(self, video_id):
token_page = self._download_webpage( token_page = self._download_webpage(
'http://ida.omroep.nl/npoplayer/i.js', 'http://ida.omroep.nl/npoplayer/i.js',
@ -164,13 +163,10 @@ class NPOIE(NPOBaseIE):
subtitles = {} subtitles = {}
if metadata.get('tt888') == 'ja': if metadata.get('tt888') == 'ja':
subtitles['nl'] = 'http://e.omroep.nl/tt888/%s' % video_id subtitles['nl'] = [{
'ext': 'vtt',
if self._downloader.params.get('listsubtitles', False): 'url': 'http://e.omroep.nl/tt888/%s' % video_id,
self._list_available_subtitles(video_id, subtitles) }]
return
subtitles = self.extract_subtitles(video_id, subtitles)
return { return {
'id': video_id, 'id': video_id,

@ -10,7 +10,6 @@ from ..utils import (
parse_duration, parse_duration,
unified_strdate, unified_strdate,
) )
from .subtitles import SubtitlesInfoExtractor
class NRKIE(InfoExtractor): class NRKIE(InfoExtractor):
@ -73,7 +72,7 @@ class NRKIE(InfoExtractor):
} }
class NRKTVIE(SubtitlesInfoExtractor): class NRKTVIE(InfoExtractor):
_VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
_TESTS = [ _TESTS = [
@ -156,7 +155,7 @@ class NRKTVIE(SubtitlesInfoExtractor):
if self._downloader.params.get('verbose', False): if self._downloader.params.get('verbose', False):
self.to_screen('[debug] %s' % txt) self.to_screen('[debug] %s' % txt)
def _extract_captions(self, subtitlesurl, video_id, baseurl): def _get_subtitles(self, subtitlesurl, video_id, baseurl):
url = "%s%s" % (baseurl, subtitlesurl) url = "%s%s" % (baseurl, subtitlesurl)
self._debug_print('%s: Subtitle url: %s' % (video_id, url)) self._debug_print('%s: Subtitle url: %s' % (video_id, url))
captions = self._download_xml(url, video_id, 'Downloading subtitles') captions = self._download_xml(url, video_id, 'Downloading subtitles')
@ -170,7 +169,10 @@ class NRKTVIE(SubtitlesInfoExtractor):
endtime = self._seconds2str(begin + duration) endtime = self._seconds2str(begin + duration)
text = '\n'.join(p.itertext()) text = '\n'.join(p.itertext())
srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), starttime, endtime, text) srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), starttime, endtime, text)
return {lang: srt} return {lang: [
{'ext': 'ttml', 'url': url},
{'ext': 'srt', 'data': srt},
]}
def _extract_f4m(self, manifest_url, video_id): def _extract_f4m(self, manifest_url, video_id):
return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id) return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id)
@ -243,10 +245,7 @@ class NRKTVIE(SubtitlesInfoExtractor):
webpage, 'subtitle URL', default=None) webpage, 'subtitle URL', default=None)
subtitles = None subtitles = None
if subtitles_url: if subtitles_url:
subtitles = self._extract_captions(subtitles_url, video_id, baseurl) subtitles = self.extract_subtitles(subtitles_url, video_id, baseurl)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, subtitles)
return
return { return {
'id': video_id, 'id': video_id,

@ -2,7 +2,7 @@ from __future__ import unicode_literals
import re import re
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_urllib_parse, compat_urllib_parse,
) )
@ -12,7 +12,7 @@ from ..utils import (
) )
class RaiIE(SubtitlesInfoExtractor): class RaiIE(InfoExtractor):
_VALID_URL = r'(?P<url>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' _VALID_URL = r'(?P<url>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)'
_TESTS = [ _TESTS = [
{ {
@ -89,15 +89,7 @@ class RaiIE(SubtitlesInfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
}) })
if self._downloader.params.get('listsubtitles', False): subtitles = self.extract_subtitles(video_id, url)
page = self._download_webpage(url, video_id)
self._list_available_subtitles(video_id, page)
return
subtitles = {}
if self._have_to_download_any_subtitles:
page = self._download_webpage(url, video_id)
subtitles = self.extract_subtitles(video_id, page)
return { return {
'id': video_id, 'id': video_id,
@ -111,7 +103,8 @@ class RaiIE(SubtitlesInfoExtractor):
'subtitles': subtitles, 'subtitles': subtitles,
} }
def _get_available_subtitles(self, video_id, webpage): def _get_subtitles(self, video_id, url):
webpage = self._download_webpage(url, video_id)
subtitles = {} subtitles = {}
m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage) m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage)
if m: if m:
@ -120,5 +113,8 @@ class RaiIE(SubtitlesInfoExtractor):
SRT_EXT = '.srt' SRT_EXT = '.srt'
if captions.endswith(STL_EXT): if captions.endswith(STL_EXT):
captions = captions[:-len(STL_EXT)] + SRT_EXT captions = captions[:-len(STL_EXT)] + SRT_EXT
subtitles['it'] = 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions) subtitles['it'] = [{
'ext': 'srt',
'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions),
}]
return subtitles return subtitles

@ -1,99 +0,0 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
)
class SubtitlesInfoExtractor(InfoExtractor):
    """InfoExtractor base class that adds subtitle listing and downloading.

    Subclasses describe what is available through _get_available_subtitles
    (and optionally _get_available_automatic_caption); extract_subtitles
    then selects the requested languages and downloads them.
    """

    @property
    def _have_to_download_any_subtitles(self):
        """True when 'writesubtitles' or 'writeautomaticsub' is set."""
        return any([self._downloader.params.get('writesubtitles', False),
                    self._downloader.params.get('writeautomaticsub')])

    def _list_available_subtitles(self, video_id, webpage):
        """ outputs the available subtitles for the video """
        sub_lang_list = self._get_available_subtitles(video_id, webpage)
        auto_captions_list = self._get_available_automatic_caption(video_id, webpage)
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen('%s: Available subtitles for video: %s' %
                       (video_id, sub_lang))
        auto_lang = ",".join(auto_captions_list.keys())
        self.to_screen('%s: Available automatic captions for video: %s' %
                       (video_id, auto_lang))

    def extract_subtitles(self, video_id, webpage):
        """
        returns {sub_lang: sub} ,{} if subtitles not found or None if the
        subtitles aren't requested.
        """
        if not self._have_to_download_any_subtitles:
            return None
        available_subs_list = {}
        # Regular subtitles are merged last, so they win over automatic
        # captions available for the same language.
        if self._downloader.params.get('writeautomaticsub', False):
            available_subs_list.update(self._get_available_automatic_caption(video_id, webpage))
        if self._downloader.params.get('writesubtitles', False):
            available_subs_list.update(self._get_available_subtitles(video_id, webpage))

        if not available_subs_list:  # error, it didn't get the available subtitles
            return {}
        if self._downloader.params.get('allsubtitles', False):
            sub_lang_list = available_subs_list
        else:
            # Language choice: the explicit 'subtitleslangs' option, else
            # English if available, else the first available language.
            if self._downloader.params.get('subtitleslangs', False):
                requested_langs = self._downloader.params.get('subtitleslangs')
            elif 'en' in available_subs_list:
                requested_langs = ['en']
            else:
                requested_langs = [list(available_subs_list.keys())[0]]

            sub_lang_list = {}
            for sub_lang in requested_langs:
                if sub_lang not in available_subs_list:
                    self._downloader.report_warning('no closed captions found in the specified language "%s"' % sub_lang)
                    continue
                sub_lang_list[sub_lang] = available_subs_list[sub_lang]

        subtitles = {}
        for sub_lang, url in sub_lang_list.items():
            subtitle = self._request_subtitle_url(sub_lang, url)
            # Languages whose download failed or came back empty are left out.
            if subtitle:
                subtitles[sub_lang] = subtitle
        return subtitles

    def _download_subtitle_url(self, sub_lang, url):
        """Download and return the subtitle contents at ``url``."""
        return self._download_webpage(url, None, note=False)

    def _request_subtitle_url(self, sub_lang, url):
        """ makes the http request for the subtitle """
        try:
            sub = self._download_subtitle_url(sub_lang, url)
        except ExtractorError as err:
            # A failed download is reported as a warning, not raised.
            self._downloader.report_warning('unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err)))
            return
        if not sub:
            self._downloader.report_warning('Did not fetch video subtitles')
            return
        return sub

    def _get_available_subtitles(self, video_id, webpage):
        """
        returns {sub_lang: url} or {} if not available
        Must be redefined by the subclasses
        """

        # By default, allow implementations to simply pass in the result
        assert isinstance(webpage, dict), \
            '_get_available_subtitles not implemented'
        return webpage

    def _get_available_automatic_caption(self, video_id, webpage):
        """
        returns {sub_lang: url} or {} if not available
        Must be redefined by the subclasses that support automatic captions,
        otherwise it will return {}
        """
        self._downloader.report_warning('Automatic Captions not supported by this server')
        return {}

@ -3,14 +3,14 @@ from __future__ import unicode_literals
import json import json
import re import re
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
) )
class TEDIE(SubtitlesInfoExtractor): class TEDIE(InfoExtractor):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
(?P<proto>https?://) (?P<proto>https?://)
(?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
@ -184,11 +184,6 @@ class TEDIE(SubtitlesInfoExtractor):
self._sort_formats(formats) self._sort_formats(formats)
video_id = compat_str(talk_info['id']) video_id = compat_str(talk_info['id'])
# subtitles
video_subtitles = self.extract_subtitles(video_id, talk_info)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, talk_info)
return
thumbnail = talk_info['thumb'] thumbnail = talk_info['thumb']
if not thumbnail.startswith('http'): if not thumbnail.startswith('http'):
@ -199,21 +194,25 @@ class TEDIE(SubtitlesInfoExtractor):
'uploader': talk_info['speaker'], 'uploader': talk_info['speaker'],
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'description': self._og_search_description(webpage), 'description': self._og_search_description(webpage),
'subtitles': video_subtitles, 'subtitles': self._get_subtitles(video_id, talk_info),
'formats': formats, 'formats': formats,
'duration': talk_info.get('duration'), 'duration': talk_info.get('duration'),
} }
def _get_available_subtitles(self, video_id, talk_info): def _get_subtitles(self, video_id, talk_info):
languages = [lang['languageCode'] for lang in talk_info.get('languages', [])] languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
if languages: if languages:
sub_lang_list = {} sub_lang_list = {}
for l in languages: for l in languages:
url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) sub_lang_list[l] = [
sub_lang_list[l] = url {
'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
'ext': ext,
}
for ext in ['ted', 'srt']
]
return sub_lang_list return sub_lang_list
else: else:
self._downloader.report_warning('video doesn\'t have subtitles')
return {} return {}
def _watch_info(self, url, name): def _watch_info(self, url, name):

@ -8,7 +8,7 @@ import binascii
import hashlib import hashlib
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_str, compat_str,
) )
@ -22,7 +22,7 @@ from ..utils import (
_x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'})
class ThePlatformIE(SubtitlesInfoExtractor): class ThePlatformIE(InfoExtractor):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)? (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
@ -106,15 +106,11 @@ class ThePlatformIE(SubtitlesInfoExtractor):
captions = info.get('captions') captions = info.get('captions')
if isinstance(captions, list): if isinstance(captions, list):
for caption in captions: for caption in captions:
lang, src = caption.get('lang'), caption.get('src') lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
if lang and src: subtitles[lang] = [{
subtitles[lang] = src 'ext': 'srt' if mime == 'text/srt' else 'ttml',
'url': src,
if self._downloader.params.get('listsubtitles', False): }]
self._list_available_subtitles(video_id, subtitles)
return
subtitles = self.extract_subtitles(video_id, subtitles)
head = meta.find(_x('smil:head')) head = meta.find(_x('smil:head'))
body = meta.find(_x('smil:body')) body = meta.find(_x('smil:body'))

@ -2,16 +2,17 @@ from __future__ import unicode_literals
import re import re
from ..compat import compat_urlparse
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
unescapeHTML, unescapeHTML,
unified_strdate, unified_strdate,
US_RATINGS, US_RATINGS,
) )
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor
class VikiIE(SubtitlesInfoExtractor): class VikiIE(InfoExtractor):
IE_NAME = 'viki' IE_NAME = 'viki'
_VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
@ -69,9 +70,6 @@ class VikiIE(SubtitlesInfoExtractor):
# subtitles # subtitles
video_subtitles = self.extract_subtitles(video_id, info_webpage) video_subtitles = self.extract_subtitles(video_id, info_webpage)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, info_webpage)
return
return { return {
'id': video_id, 'id': video_id,
@ -85,12 +83,15 @@ class VikiIE(SubtitlesInfoExtractor):
'upload_date': upload_date, 'upload_date': upload_date,
} }
def _get_available_subtitles(self, video_id, info_webpage): def _get_subtitles(self, video_id, info_webpage):
res = {} res = {}
for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage): for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage):
sturl = unescapeHTML(sturl_html) sturl = unescapeHTML(sturl_html)
m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
if not m: if not m:
continue continue
res[m.group('lang')] = sturl res[m.group('lang')] = [{
'url': compat_urlparse.urljoin('http://www.viki.com', sturl),
'ext': 'vtt',
}]
return res return res

@ -7,7 +7,6 @@ import itertools
import hashlib import hashlib
from .common import InfoExtractor from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..compat import ( from ..compat import (
compat_HTTPError, compat_HTTPError,
compat_urllib_parse, compat_urllib_parse,
@ -53,7 +52,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
self._download_webpage(login_request, None, False, 'Wrong login info') self._download_webpage(login_request, None, False, 'Wrong login info')
class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): class VimeoIE(VimeoBaseInfoExtractor):
"""Information extractor for vimeo.com.""" """Information extractor for vimeo.com."""
# _VALID_URL matches Vimeo URLs # _VALID_URL matches Vimeo URLs
@ -378,12 +377,10 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
text_tracks = config['request'].get('text_tracks') text_tracks = config['request'].get('text_tracks')
if text_tracks: if text_tracks:
for tt in text_tracks: for tt in text_tracks:
subtitles[tt['lang']] = 'http://vimeo.com' + tt['url'] subtitles[tt['lang']] = [{
'ext': 'vtt',
video_subtitles = self.extract_subtitles(video_id, subtitles) 'url': 'http://vimeo.com' + tt['url'],
if self._downloader.params.get('listsubtitles', False): }]
self._list_available_subtitles(video_id, subtitles)
return
return { return {
'id': video_id, 'id': video_id,
@ -399,7 +396,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
'view_count': view_count, 'view_count': view_count,
'like_count': like_count, 'like_count': like_count,
'comment_count': comment_count, 'comment_count': comment_count,
'subtitles': video_subtitles, 'subtitles': subtitles,
} }

@ -3,14 +3,14 @@ from __future__ import unicode_literals
import re import re
from .subtitles import SubtitlesInfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
xpath_text, xpath_text,
int_or_none, int_or_none,
) )
class WallaIE(SubtitlesInfoExtractor): class WallaIE(InfoExtractor):
_VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)'
_TEST = { _TEST = {
'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one',
@ -52,13 +52,10 @@ class WallaIE(SubtitlesInfoExtractor):
subtitles = {} subtitles = {}
for subtitle in item.findall('./subtitles/subtitle'): for subtitle in item.findall('./subtitles/subtitle'):
lang = xpath_text(subtitle, './title') lang = xpath_text(subtitle, './title')
subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = xpath_text(subtitle, './src') subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{
'ext': 'srt',
if self._downloader.params.get('listsubtitles', False): 'url': xpath_text(subtitle, './src'),
self._list_available_subtitles(video_id, subtitles) }]
return
subtitles = self.extract_subtitles(video_id, subtitles)
formats = [] formats = []
for quality in item.findall('./qualities/quality'): for quality in item.findall('./qualities/quality'):

@ -11,7 +11,6 @@ import time
import traceback import traceback
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..jsinterp import JSInterpreter from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter from ..swfinterp import SWFInterpreter
from ..compat import ( from ..compat import (
@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return return
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com' IE_DESC = 'YouTube.com'
_VALID_URL = r"""(?x)^ _VALID_URL = r"""(?x)^
( (
@ -648,7 +647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
raise ExtractorError( raise ExtractorError(
'Signature extraction failed: ' + tb, cause=e) 'Signature extraction failed: ' + tb, cause=e)
def _get_available_subtitles(self, video_id, webpage): def _get_subtitles(self, video_id, webpage):
try: try:
subs_doc = self._download_xml( subs_doc = self._download_xml(
'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
@ -662,23 +661,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
lang = track.attrib['lang_code'] lang = track.attrib['lang_code']
if lang in sub_lang_list: if lang in sub_lang_list:
continue continue
params = compat_urllib_parse.urlencode({ sub_formats = []
'lang': lang, for ext in ['sbv', 'vtt', 'srt']:
'v': video_id, params = compat_urllib_parse.urlencode({
'fmt': self._downloader.params.get('subtitlesformat', 'srt'), 'lang': lang,
'name': track.attrib['name'].encode('utf-8'), 'v': video_id,
}) 'fmt': ext,
url = 'https://www.youtube.com/api/timedtext?' + params 'name': track.attrib['name'].encode('utf-8'),
sub_lang_list[lang] = url })
sub_formats.append({
'url': 'https://www.youtube.com/api/timedtext?' + params,
'ext': ext,
})
sub_lang_list[lang] = sub_formats
if not sub_lang_list: if not sub_lang_list:
self._downloader.report_warning('video doesn\'t have subtitles') self._downloader.report_warning('video doesn\'t have subtitles')
return {} return {}
return sub_lang_list return sub_lang_list
def _get_available_automatic_caption(self, video_id, webpage): def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an """We need the webpage for getting the captions url, pass it as an
argument to speed up the process.""" argument to speed up the process."""
sub_format = self._downloader.params.get('subtitlesformat', 'srt')
self.to_screen('%s: Looking for automatic captions' % video_id) self.to_screen('%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage) mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
err_msg = 'Couldn\'t find automatic captions for %s' % video_id err_msg = 'Couldn\'t find automatic captions for %s' % video_id
@ -708,14 +711,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
sub_lang_list = {} sub_lang_list = {}
for lang_node in caption_list.findall('target'): for lang_node in caption_list.findall('target'):
sub_lang = lang_node.attrib['lang_code'] sub_lang = lang_node.attrib['lang_code']
params = compat_urllib_parse.urlencode({ sub_formats = []
'lang': original_lang, for ext in ['sbv', 'vtt', 'srt']:
'tlang': sub_lang, params = compat_urllib_parse.urlencode({
'fmt': sub_format, 'lang': original_lang,
'ts': timestamp, 'tlang': sub_lang,
'kind': caption_kind, 'fmt': ext,
}) 'ts': timestamp,
sub_lang_list[sub_lang] = caption_url + '&' + params 'kind': caption_kind,
})
sub_formats.append({
'url': caption_url + '&' + params,
'ext': ext,
})
sub_lang_list[sub_lang] = sub_formats
return sub_lang_list return sub_lang_list
# An extractor error can be raised by the download process if there are # An extractor error can be raised by the download process if there are
# no automatic captions but there are subtitles # no automatic captions but there are subtitles
@ -970,10 +979,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# subtitles # subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage) video_subtitles = self.extract_subtitles(video_id, video_webpage)
automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, video_webpage)
return
if 'length_seconds' not in video_info: if 'length_seconds' not in video_info:
self._downloader.report_warning('unable to extract video duration') self._downloader.report_warning('unable to extract video duration')
@ -1122,6 +1128,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'description': video_description, 'description': video_description,
'categories': video_categories, 'categories': video_categories,
'subtitles': video_subtitles, 'subtitles': video_subtitles,
'automatic_captions': automatic_captions,
'duration': video_duration, 'duration': video_duration,
'age_limit': 18 if age_gate else 0, 'age_limit': 18 if age_gate else 0,
'annotations': video_annotations, 'annotations': video_annotations,

@ -387,8 +387,8 @@ def parseOpts(overrideArguments=None):
help='lists all available subtitles for the video') help='lists all available subtitles for the video')
subtitles.add_option( subtitles.add_option(
'--sub-format', '--sub-format',
action='store', dest='subtitlesformat', metavar='FORMAT', default='srt', action='store', dest='subtitlesformat', metavar='FORMAT', default='best',
help='subtitle format (default=srt) ([sbv/vtt] youtube only)') help='subtitle format, accepts formats preference, for example: "ass/srt/best"')
subtitles.add_option( subtitles.add_option(
'--sub-lang', '--sub-langs', '--srt-lang', '--sub-lang', '--sub-langs', '--srt-lang',
action='callback', dest='subtitleslangs', metavar='LANGS', type='str', action='callback', dest='subtitleslangs', metavar='LANGS', type='str',

@ -496,10 +496,6 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
'zu': 'zul', 'zu': 'zul',
} }
def __init__(self, downloader=None, subtitlesformat='srt'):
super(FFmpegEmbedSubtitlePP, self).__init__(downloader)
self._subformat = subtitlesformat
@classmethod @classmethod
def _conver_lang_code(cls, code): def _conver_lang_code(cls, code):
"""Convert language code from ISO 639-1 to ISO 639-2/T""" """Convert language code from ISO 639-1 to ISO 639-2/T"""
@ -509,13 +505,14 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
if information['ext'] != 'mp4': if information['ext'] != 'mp4':
self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 files') self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 files')
return True, information return True, information
if not information.get('subtitles'): subtitles = information.get('requested_subtitles')
if not subtitles:
self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed')
return True, information return True, information
sub_langs = [key for key in information['subtitles']] sub_langs = list(subtitles.keys())
filename = information['filepath'] filename = information['filepath']
input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()]
opts = [ opts = [
'-map', '0', '-map', '0',

Loading…
Cancel
Save