From 5aba6ea4fe6ad227d64a7e8b487d7cd7c3ad1f11 Mon Sep 17 00:00:00 2001 From: Ricardo Garcia Date: Sat, 29 Jan 2011 11:55:20 +0100 Subject: [PATCH 1/5] =?UTF-8?q?Add=20YoutubeUserIE=20(code=20courtesy=20of?= =?UTF-8?q?=20Pawe=C5=82=20Paprota)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- youtube-dl | 79 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/youtube-dl b/youtube-dl index 8dd03daf3..c8c2ea88e 100755 --- a/youtube-dl +++ b/youtube-dl @@ -5,6 +5,7 @@ # Author: Benjamin Johnson # Author: Vasyl' Vavrychuk # Author: Witold Baryluk +# Author: Paweł Paprota # License: Public domain code import cookielib import ctypes @@ -2159,9 +2160,11 @@ class YoutubePlaylistIE(InfoExtractor): class YoutubeUserIE(InfoExtractor): """Information Extractor for YouTube users.""" - _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)' + _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)' _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' - _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this. + _GDATA_PAGE_SIZE = 50 + _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' + _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _youtube_ie = None def __init__(self, youtube_ie, downloader=None): @@ -2172,9 +2175,10 @@ class YoutubeUserIE(InfoExtractor): def suitable(url): return (re.match(YoutubeUserIE._VALID_URL, url) is not None) - def report_download_page(self, username): + def report_download_page(self, username, start_index): """Report attempt to download user page.""" - self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username)) + self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' % + (username, start_index, start_index + self._GDATA_PAGE_SIZE)) def _real_initialize(self): self._youtube_ie.initialize() @@ -2186,34 +2190,63 @@ class YoutubeUserIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid url: %s' % url) return - # Download user page username = mobj.group(1) + + # Download video ids using YouTube Data API. Result size per + # query is limited (currently to 50 videos) so we need to query + # page by page until there are no video ids - it means we got + # all of them. + video_ids = [] - pagenum = 1 + pagenum = 0 - self.report_download_page(username) - request = urllib2.Request(self._TEMPLATE_URL % (username)) - try: - page = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) - return + while True: + start_index = pagenum * self._GDATA_PAGE_SIZE + 1 + self.report_download_page(username, start_index) - # Extract video identifiers - ids_in_page = [] + request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)) - for mobj in re.finditer(self._VIDEO_INDICATOR, page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - video_ids.extend(ids_in_page) + try: + page = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) + return + # Extract video identifiers + ids_in_page = [] + + for mobj in re.finditer(self._VIDEO_INDICATOR, page): + if mobj.group(1) not in ids_in_page: + ids_in_page.append(mobj.group(1)) + + video_ids.extend(ids_in_page) + + # A little optimization - if current page is not + # "full", ie. does not contain PAGE_SIZE video ids then + # we can assume that this page is the last one - there + # are no more ids on further pages - no need to query + # again. + + if len(ids_in_page) < self._GDATA_PAGE_SIZE: + break + + pagenum += 1 + + all_ids_count = len(video_ids) playliststart = self._downloader.params.get('playliststart', 1) - 1 playlistend = self._downloader.params.get('playlistend', -1) - video_ids = video_ids[playliststart:playlistend] - for id in video_ids: - self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) - return + if playlistend == -1: + video_ids = video_ids[playliststart:] + else: + video_ids = video_ids[playliststart:playlistend] + + self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" % + (username, all_ids_count, len(video_ids))) + + for video_id in video_ids: + self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id) + class DepositFilesIE(InfoExtractor): """Information extractor for depositfiles.com""" From 9e0dd8692ea16d62564c6b05c6fdc3f2e0b2f02d Mon Sep 17 00:00:00 2001 From: Ricardo Garcia Date: Sun, 30 Jan 2011 12:58:01 +0100 Subject: [PATCH 2/5] Bump version number --- LATEST_VERSION | 2 +- youtube-dl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/LATEST_VERSION b/LATEST_VERSION index a1c4173c8..4ab209346 100644 --- a/LATEST_VERSION +++ b/LATEST_VERSION @@ -1 +1 @@ -2010.12.09 +2011.01.30 diff --git a/youtube-dl b/youtube-dl index c8c2ea88e..e980f41f8 100755 --- a/youtube-dl +++ b/youtube-dl @@ -2405,7 +2405,7 @@ if __name__ == '__main__': # Parse command line parser = optparse.OptionParser( usage='Usage: %prog [options] url...', - version='2010.12.09', + version='2011.01.30', conflict_handler='resolve', ) From 5776c3295b085b9177b24152e33049a8c4b0c90b Mon Sep 17 00:00:00 2001 From: Ricardo Garcia Date: Sun, 30 Jan 2011 12:59:18 +0100 Subject: [PATCH 3/5] Update User-Agent string --- youtube-dl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index e980f41f8..4d1e942ff 100755 --- a/youtube-dl +++ b/youtube-dl @@ -37,7 +37,7 @@ except ImportError: from cgi import parse_qs std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b10) Gecko/20100101 Firefox/4.0b10' 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', From 16c73c2e513829197c4af5ee62bde88b2b2272e4 Mon Sep 17 00:00:00 2001 From: Ricardo Garcia Date: Sun, 30 Jan 2011 13:03:15 +0100 Subject: [PATCH 4/5] Fix SyntaxError problem (oops) --- youtube-dl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index 4d1e942ff..a4c8f2494 100755 --- a/youtube-dl +++ b/youtube-dl @@ -37,7 +37,7 @@ except ImportError: from cgi import parse_qs std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b10) Gecko/20100101 Firefox/4.0b10' + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b10) Gecko/20100101 Firefox/4.0b10', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', From f74e22ae280ac1680251350c4672abfeb2a047fe Mon Sep 17 00:00:00 2001 From: Gergely Imreh Date: Mon, 31 Jan 2011 18:54:47 +0800 Subject: [PATCH 5/5] Enable artist playlists in YoutubePlaylistIE Artist playlist pages have different format compared to user playlists, thus more format checking is needed to construct the correct URL. From the artist playlist this method downloads all listed below the "Videos by [Artist Name]" header, plus usually there's one more video on the side, titled "Youtube Mix for [Artist Name]", which has a link format that currently cannot be distinguished from the other videos in the list. --- youtube-dl | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube-dl b/youtube-dl index a4c8f2494..dd875a38e 100755 --- a/youtube-dl +++ b/youtube-dl @@ -2096,8 +2096,8 @@ class YahooSearchIE(InfoExtractor): class YoutubePlaylistIE(InfoExtractor): """Information Extractor for YouTube playlists.""" - _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*' - _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en' + _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/)([^&]+).*' + _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' _youtube_ie = None @@ -2125,13 +2125,19 @@ class YoutubePlaylistIE(InfoExtractor): return # Download playlist pages - playlist_id = mobj.group(1) + # prefix is 'p' as default for playlists but there are other types that need extra care + playlist_prefix = mobj.group(1) + if playlist_prefix == 'a': + playlist_access = 'artist' + else: + playlist_access = 'view_play_list' + playlist_id = mobj.group(2) video_ids = [] pagenum = 1 while True: self.report_download_page(playlist_id, pagenum) - request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum)) + request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)) try: page = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: