From f843300fe56ffbfc8e3005fd0f7a8237e5deaaae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Wed, 24 Jun 2015 23:12:13 +0600
Subject: [PATCH] [onionstudios] Add extractor

---
 youtube_dl/extractor/__init__.py     |  1 +
 youtube_dl/extractor/onionstudios.py | 79 ++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)
 create mode 100644 youtube_dl/extractor/onionstudios.py

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index dc1a302e6..46cc4cd06 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -388,6 +388,7 @@ from .nytimes import (
 from .nuvid import NuvidIE
 from .odnoklassniki import OdnoklassnikiIE
 from .oktoberfesttv import OktoberfestTVIE
+from .onionstudios import OnionStudiosIE
 from .ooyala import (
     OoyalaIE,
     OoyalaExternalIE,
diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py
new file mode 100644
index 000000000..d5d03fd44
--- /dev/null
+++ b/youtube_dl/extractor/onionstudios.py
@@ -0,0 +1,79 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+
+class OnionStudiosIE(InfoExtractor):
+    """Extractor for videos hosted on onionstudios.com."""
+
+    # Accepts both canonical video pages (.../videos/<slug>-<id>) and embed
+    # URLs (.../embed?...id=<id>); the numeric video id is the "id" group.
+    _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:videos/[^/]+-|embed\?.*\bid=)(?P<id>\d+)(?!-)'
+
+    _TESTS = [{
+        'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
+        'md5': 'd4851405d31adfadf71cd7a487b765bb',
+        'info_dict': {
+            'id': '2937',
+            'ext': 'mp4',
+            'title': 'Hannibal charges forward, stops for a cocktail',
+            'description': 'md5:545299bda6abf87e5ec666548c6a9448',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'The A.V. Club',
+            'uploader_id': 'TheAVClub',
+        },
+    }, {
+        'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Scrape the embed page of a video and return its info dict.
+
+        The embed page is fetched for every accepted URL form. The result
+        carries 'id', 'title', 'description', 'thumbnail', 'uploader',
+        'uploader_id' and 'formats'.
+        """
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.onionstudios.com/embed?id=%s' % video_id, video_id)
+
+        formats = []
+        for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage):
+            if determine_ext(src) != 'm3u8':  # m3u8 always results in 403
+                formats.append({
+                    'url': src,
+                })
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            r'share_title\s*=\s*"([^"]+)"', webpage, 'title')
+        # Optional metadata: description, thumbnail and uploader default to
+        # None when absent; uploader_id is looked up with fatal=False.
+        description = self._search_regex(
+            r'share_description\s*=\s*"([^"]+)"', webpage,
+            'description', default=None)
+        thumbnail = self._search_regex(
+            r'poster="([^"]+)"', webpage, 'thumbnail', default=None)
+
+        uploader_id = self._search_regex(
+            r'twitter_handle\s*=\s*"([^"]+)"',
+            webpage, 'uploader id', fatal=False)
+        uploader = self._search_regex(
+            r'window\.channelName\s*=\s*"Embedded:([^"]+)"',
+            webpage, 'uploader', default=None)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'formats': formats,
+        }