From ca1fee34f22e3fac9cc7a55c55c7aa7519f788b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 5 Mar 2014 13:22:10 +0100 Subject: [PATCH] [ted] Fix playlist extraction and add a test --- test/test_playlists.py | 10 ++++++++++ youtube_dl/extractor/ted.py | 37 ++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index 07c85b322..4bd815a0e 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -36,6 +36,7 @@ from youtube_dl.extractor import ( RutubeChannelIE, GoogleSearchIE, GenericIE, + TEDIE, ) @@ -259,5 +260,14 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], 'Zero Punctuation') self.assertTrue(len(result['entries']) > 10) + def test_ted_playlist(self): + dl = FakeYDL() + ie = TEDIE(dl) + result = ie.extract('http://www.ted.com/playlists/who_are_the_hackers') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], '10') + self.assertEqual(result['title'], 'Who are the hackers?') + self.assertTrue(len(result['entries']) >= 6) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 10cb1e4be..f3cb85ab0 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -6,6 +6,7 @@ import re from .subtitles import SubtitlesInfoExtractor from ..utils import ( + compat_str, RegexNotFoundError, ) @@ -13,7 +14,7 @@ from ..utils import ( class TEDIE(SubtitlesInfoExtractor): _VALID_URL=r'''(?x)http://www\.ted\.com/ ( - ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist + (?P<type_playlist>playlists(?:/\d+)?) 
# We have a playlist | ((?P<type_talk>talks)) # We have a simple talk ) @@ -37,35 +38,35 @@ class TEDIE(SubtitlesInfoExtractor): 'high': 3, } + def _extract_info(self, webpage): + info_json = self._search_regex(r'q\("\w+.init",({.+})\)', webpage, 'info json') + return json.loads(info_json) + def _real_extract(self, url): m=re.match(self._VALID_URL, url, re.VERBOSE) if m.group('type_talk'): return self._talk_info(url) else : - playlist_id=m.group('playlist_id') name=m.group('name') - self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) - return [self._playlist_videos_info(url,name,playlist_id)] + return self._playlist_videos_info(url, name) - def _playlist_videos_info(self, url, name, playlist_id): + def _playlist_videos_info(self, url, name): '''Returns the videos of the playlist''' - webpage = self._download_webpage( - url, playlist_id, 'Downloading playlist webpage') - matches = re.finditer( - r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>

', - webpage) - - playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>', - webpage, 'playlist title') + webpage = self._download_webpage(url, name, + 'Downloading playlist webpage') + info = self._extract_info(webpage) + playlist_info = info['playlist'] playlist_entries = [ - self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED') - for m in matches + self.url_result(u'http://www.ted.com/talks/' + talk['slug'], self.ie_key()) + for talk in info['talks'] ] return self.playlist_result( - playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title) + playlist_entries, + playlist_id=compat_str(playlist_info['id']), + playlist_title=playlist_info['title']) def _talk_info(self, url, video_id=0): """Return the video for the talk in the url""" @@ -74,9 +75,7 @@ class TEDIE(SubtitlesInfoExtractor): webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name) self.report_extraction(video_name) - info_json = self._search_regex(r'"talkPage.init",({.+})\)', webpage, 'info json') - info = json.loads(info_json) - talk_info = info['talks'][0] + talk_info = self._extract_info(webpage)['talks'][0] formats = [{ 'ext': 'mp4',