From 92743d423a7dfaf0f803deab14475e6343091f20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Thu, 25 Nov 2010 04:24:45 -0200 Subject: [PATCH 1/9] Preliminary downloading from vimeo --- youtube-dl | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/youtube-dl b/youtube-dl index 8dd03daf3..edd1d3f29 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1718,6 +1718,118 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'\nERROR: unable to download video') +class VimeoIE(InfoExtractor): + """Information extractor for vimeo.com.""" + + # _VALID_URL matches Vimeo URLs + _VALID_URL = r'(?:http://)?vimeo\.com/([0-9]+)' + + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) + + @staticmethod + def suitable(url): + return (re.match(VimeoIE._VALID_URL, url) is not None) + + def report_download_webpage(self, video_id): + """Report webpage download.""" + self._downloader.to_screen(u'[video.vimeo] %s: Downloading webpage' % video_id) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[video.vimeo] %s: Extracting information' % video_id) + + def _real_initialize(self): + return + + def _real_extract(self, url, new_video=True): + # Extract ID from URL + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + return + + # At this point we have a new video + self._downloader.increment_downloads() + video_id = mobj.group(1) + video_extension = 'flv' # FIXME + + # Retrieve video webpage to extract further information + request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers) + try: + self.report_download_webpage(video_id) + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % 
str(err)) + return + + # Extract uploader and title from webpage + self.report_extraction(video_id) + mobj = re.search(r'(.*)', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video title') + return + video_title = mobj.group(1).decode('utf-8') + simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + + mobj = re.search(r'http://vimeo.com/(.*)', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video uploader') + return + video_uploader = mobj.group(1).decode('utf-8') + + # Extract video thumbnail + mobj = re.search(r'(.*)', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract video thumbnail') + return + video_thumbnail = mobj.group(1).decode('utf-8') + + # # Extract video description + # mobj = re.search(r'', webpage) + # if mobj is None: + # self._downloader.trouble(u'ERROR: unable to extract video description') + # return + # video_description = mobj.group(1).decode('utf-8') + # if not video_description: video_description = 'No description available.' + video_description = 'Foo.' 
+ + # Extract request signature + mobj = re.search(r'(.*)', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract request signature') + return + sig = mobj.group(1).decode('utf-8') + + # Extract request signature expiration + mobj = re.search(r'(.*)', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract request signature expiration') + return + sig_exp = mobj.group(1).decode('utf-8') + + video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp) + + try: + # Process video information + self._downloader.process_info({ + 'id': video_id.decode('utf-8'), + 'url': video_url, + 'uploader': video_uploader, + 'upload_date': u'NA', + 'title': video_title, + 'stitle': simple_title, + 'ext': video_extension.decode('utf-8'), + 'thumbnail': video_thumbnail.decode('utf-8'), + 'description': video_description, + 'thumbnail': video_thumbnail, + 'description': video_description, + 'player_url': None, + }) + except UnavailableVideoError: + self._downloader.trouble(u'ERROR: unable to download video') + + class GenericIE(InfoExtractor): """Generic last-resort information extractor.""" @@ -2537,6 +2649,7 @@ if __name__ == '__main__': parser.error(u'invalid playlist end number specified') # Information extractors + vimeo_ie = VimeoIE() youtube_ie = YoutubeIE() metacafe_ie = MetacafeIE(youtube_ie) dailymotion_ie = DailymotionIE() @@ -2588,6 +2701,7 @@ if __name__ == '__main__': 'nopart': opts.nopart, 'updatetime': opts.updatetime, }) + fd.add_info_extractor(vimeo_ie) fd.add_info_extractor(youtube_search_ie) fd.add_info_extractor(youtube_pl_ie) fd.add_info_extractor(youtube_user_ie) From c5a088d341e3aeaf65fbca02523c02ff3bccee6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Sat, 29 Jan 2011 04:13:54 -0200 Subject: [PATCH 2/9] Use non-greedy regexps, for safety. Since I was very lazy when I coded this, I took the fastest route. 
Luckily, Vasyl' Vavrychuk pointed this out and I went (after many months) and just did some minor changes. --- youtube-dl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube-dl b/youtube-dl index edd1d3f29..e7459062d 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1765,21 +1765,21 @@ class VimeoIE(InfoExtractor): # Extract uploader and title from webpage self.report_extraction(video_id) - mobj = re.search(r'(.*)', webpage) + mobj = re.search(r'(.*?)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract video title') return video_title = mobj.group(1).decode('utf-8') simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) - mobj = re.search(r'http://vimeo.com/(.*)', webpage) + mobj = re.search(r'http://vimeo.com/(.*?)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract video uploader') return video_uploader = mobj.group(1).decode('utf-8') # Extract video thumbnail - mobj = re.search(r'(.*)', webpage) + mobj = re.search(r'(.*?)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract video thumbnail') return @@ -1795,14 +1795,14 @@ class VimeoIE(InfoExtractor): video_description = 'Foo.' # Extract request signature - mobj = re.search(r'(.*)', webpage) + mobj = re.search(r'(.*?)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract request signature') return sig = mobj.group(1).decode('utf-8') # Extract request signature expiration - mobj = re.search(r'(.*)', webpage) + mobj = re.search(r'(.*?)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract request signature expiration') return From f24c674b048003d878a1d6436c1b2af47693f2ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Fri, 4 Feb 2011 04:02:29 -0200 Subject: [PATCH 3/9] Make some of the comments more descriptive. 
--- youtube-dl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube-dl b/youtube-dl index b96156be7..a925c9783 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1764,8 +1764,12 @@ class VimeoIE(InfoExtractor): self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) return - # Extract uploader and title from webpage + # Now we begin extracting as much information as we can from what we + # retrieved. First we extract the information common to all extractors, + # and later we extract those that are Vimeo specific. self.report_extraction(video_id) + + # Extract title mobj = re.search(r'(.*?)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract video title') @@ -1773,6 +1777,7 @@ class VimeoIE(InfoExtractor): video_title = mobj.group(1).decode('utf-8') simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) + # Extract uploader mobj = re.search(r'http://vimeo.com/(.*?)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract video uploader') @@ -1795,14 +1800,14 @@ class VimeoIE(InfoExtractor): # if not video_description: video_description = 'No description available.' video_description = 'Foo.' - # Extract request signature + # Vimeo specific: extract request signature mobj = re.search(r'(.*?)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract request signature') return sig = mobj.group(1).decode('utf-8') - # Extract request signature expiration + # Vimeo specific: Extract request signature expiration mobj = re.search(r'(.*?)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract request signature expiration') From 8cc98b2358fb4554c7af9dcd38fd4c96262e5ac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Fri, 4 Feb 2011 06:15:27 -0200 Subject: [PATCH 4/9] vimeo: Also accept URLs prefixed by www. I hope that this doesn't break anything. 
`:)` --- youtube-dl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index a925c9783..16d234ebf 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1723,7 +1723,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?:http://)?vimeo\.com/([0-9]+)' + _VALID_URL = r'(?:http://)?(?:www.)?vimeo\.com/([0-9]+)' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) From a7e5259c33851725243b13f01929e75bb40e0ea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Thu, 17 Feb 2011 08:25:45 -0200 Subject: [PATCH 5/9] vimeo: Make regexp more robust. This change makes the VimeoIE work with http://player.vimeo.com/video/19267888 --- youtube-dl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index 16d234ebf..780a6d9a2 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1723,7 +1723,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?:http://)?(?:www.)?vimeo\.com/([0-9]+)' + _VALID_URL = r'(?:http://)?(?:(?:www|player).)?vimeo\.com/(?:video/)?([0-9]+)' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) From 0ecedbdb036120849c2a7eb992ec8a993221e5f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Wed, 20 Apr 2011 21:07:57 -0300 Subject: [PATCH 6/9] vimeo: Remove clutter in some messages. We should make a unified way of printing messages, but let's follow suit and do what the main YoutubeIE does here. 
--- youtube-dl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube-dl b/youtube-dl index 240b2bc7b..080490ded 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1735,11 +1735,11 @@ class VimeoIE(InfoExtractor): def report_download_webpage(self, video_id): """Report webpage download.""" - self._downloader.to_screen(u'[video.vimeo] %s: Downloading webpage' % video_id) + self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id) def report_extraction(self, video_id): """Report information extraction.""" - self._downloader.to_screen(u'[video.vimeo] %s: Extracting information' % video_id) + self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id) def _real_initialize(self): return From 1e055db69ccffbacad5765887f14879bbe350ce2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Wed, 20 Apr 2011 21:15:57 -0300 Subject: [PATCH 7/9] vimeo: Ignore if we are using HTTP/S or not. --- youtube-dl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index 080490ded..17fb82da7 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1724,7 +1724,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?:http://)?(?:(?:www|player).)?vimeo\.com/(?:video/)?([0-9]+)' + _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:video/)?([0-9]+)' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) From 44c636df8966a1ace617b276f19b5887aa66d612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Wed, 20 Apr 2011 21:20:55 -0300 Subject: [PATCH 8/9] vimeo: Tweak the regexp to allow some extended URLs from vimeo. 
This, in particular, lets me grab the videos from the beginners channel with URLs like: http://vimeo.com/groups/fivebyfive/videos/22648611 Note that the regexp *will* break for other URLs that we don't know about and that's on purpose: we don't want to accidentally grab videos that would be passed on to other information extractors. --- youtube-dl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube-dl b/youtube-dl index 17fb82da7..f3d7a3f61 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1724,7 +1724,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:video/)?([0-9]+)' + _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) From 2fc31a48723fd4f84c20cf97f810f0171419bcf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Wed, 20 Apr 2011 21:29:29 -0300 Subject: [PATCH 9/9] vimeo: Apparently, all videos in vimeo are served in ISO containers. --- youtube-dl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube-dl b/youtube-dl index f3d7a3f61..b734c997c 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1754,7 +1754,6 @@ class VimeoIE(InfoExtractor): # At this point we have a new video self._downloader.increment_downloads() video_id = mobj.group(1) - video_extension = 'flv' # FIXME # Retrieve video webpage to extract further information request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers) @@ -1826,7 +1825,7 @@ class VimeoIE(InfoExtractor): 'upload_date': u'NA', 'title': video_title, 'stitle': simple_title, - 'ext': video_extension.decode('utf-8'), + 'ext': u'mp4', 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, 'thumbnail': video_thumbnail,