diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index d9289e5bf..292c39c42 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1619,7 +1619,13 @@ from .xnxx import XNXXIE
 from .xstream import XstreamIE
 from .xtube import XTubeUserIE, XTubeIE
 from .xuite import XuiteIE
-from .xvideos import XVideosIE
+from .xvideos import (
+    XVideosIE,
+    XVideosChannelIE,
+    XVideosPlaylistIE,
+    XVideosRelatedIE,
+    XVideosSearchIE,
+)
 from .xxxymovies import XXXYMoviesIE
 from .yahoo import (
     YahooIE,
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index e63d4690d..1b2c0a5bc 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -1,15 +1,26 @@
 from __future__ import unicode_literals

 import re
+import itertools

 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+    compat_parse_qs,
+    compat_str,
+    compat_urlparse,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_urlencode,
+)
 from ..utils import (
     clean_html,
     determine_ext,
+    extract_attributes,
     ExtractorError,
     int_or_none,
     parse_duration,
+    try_get,
+    url_basename,
+    urljoin,
 )


@@ -17,13 +28,13 @@ class XVideosIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
-                            (?:[^/]+\.)?xvideos2?\.com/video|
+                            (?:[^/]+\.)?xvideos2?\.com/(?:video|prof-video-click/model/[^/]+/)|
                             (?:www\.)?xvideos\.es/video|
-                            flashservice\.xvideos\.com/embedframe/|
+                            (?:www|flashservice)\.xvideos\.com/embedframe/|
                             static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video=
                         )
-                        (?P<id>[0-9]+)
-                    '''
+                        (?P<id>\d+)
+                    '''
     _TESTS = [{
         'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl',
         'md5': '14cea69fcb84db54293b1e971466c2e1',
@@ -34,6 +45,18 @@ class XVideosIE(InfoExtractor):
         'info_dict': {
             'id': '4588838',
             'ext': 'mp4',
             'title': 'Biker Takes his Girl',
             'duration': 108,
             'age_limit': 18,
         }
+    }, {
+        # Broken HLS formats
+        'url': 'https://www.xvideos.com/video65982001/what_s_her_name',
+        'md5': 'b82d7d7ef7d65a84b1fa6965f81f95a5',
+        'info_dict': {
+            'id': '65982001',
+            'ext': 'mp4',
+            'title': 'what\'s her name?',
+            'duration': 120,
+            'age_limit': 18,
+            'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg',
+        }
     }, {
         'url': 'https://flashservice.xvideos.com/embedframe/4588838',
         'only_matching': True,
@@ -78,6 +101,12 @@ class XVideosIE(InfoExtractor):
         'only_matching': True
     }]

+    @classmethod
+    def suitable(cls, url):
+        EXCLUDE_IE = (XVideosRelatedIE, )
+        return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
+                else super(XVideosIE, cls).suitable(url))
+
     def _real_extract(self, url):
         video_id = self._match_id(url)

@@ -125,9 +154,11 @@ class XVideosIE(InfoExtractor):
                 r'setVideo([^(]+)\((["\'])(http.+?)\2\)', webpage):
             format_id = kind.lower()
             if format_id == 'hls':
-                formats.extend(self._extract_m3u8_formats(
+                hls_formats = self._extract_m3u8_formats(
                     format_url, video_id, 'mp4',
-                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+                self._check_formats(hls_formats, video_id)
+                formats.extend(hls_formats)
             elif format_id in ('urllow', 'urlhigh'):
                 formats.append({
                     'url': format_url,
@@ -145,3 +176,215 @@ class XVideosIE(InfoExtractor):
             'thumbnails': thumbnails,
             'age_limit': 18,
         }
+
+
+class XVideosPlaylistIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[^/]+\.)?xvideos2?\.com/
+                        (?:c(?:/[sm]:[^/]+)*|
+                           profiles|
+                           favorite)/
+                        (?P<id>[^#?/]+)
+                    '''
+    _TESTS = []
+
+    def _extract_videos_from_json_list(self, json_list, path='video'):
+        return (
+            'https://www.xvideos.com/%s%d' % (path, x.get('id'), )
+            for x in json_list if isinstance(x, dict))
+
+    def _get_playlist_url(self, url, playlist_id):
+        """URL of first playlist page"""
+        id_match = re.match(self._VALID_URL, url).groupdict()
+        video_sort = id_match.get('sort')
+        if video_sort:
+            url, _ = compat_urlparse.urldefrag(url)
+            if url.endswith('/'):
+                url = url[:-1]
+            url = '%s/%s' % (url, video_sort.replace('-', '/'))
+        return url
+
+    def _get_next_page(self, url, num, page):
+        """URL of num'th continuation page of url"""
+        if page.startswith('{'):
+            # JSON continuation: bump the numeric path component of the URL
+            url, sub = re.subn(r'(/)(\d{1,7})($|[#?/])', r'\g<1>%d\3' % (num, ), url)
+            if sub == 0:
+                url += '/%d' % (num, )
+            return url
+        next_page = self._search_regex(
+            r'''(?s)(<a\s[^>]*?\bclass\s*=\s*(?P<q>'|").*?\bnext-page\b.*?(?P=q)[^>]*?>)''',
+            page, 'next page', default=None)
+        if next_page:
+            next_page = extract_attributes(next_page)
+            next_page = next_page.get('href')
+            if next_page:
+                return urljoin(url, next_page)
+        return False
+
+    def _extract_videos(self, url, playlist_id, num, page):
+        """Get iterable videos plus stop flag"""
+        return ((
+            'https://www.xvideos.com/video' + x.group('video_id')
+            for x in re.finditer(r'''<div\s[^>]*?id\s*=\s*(\'|")video_(?P<video_id>[0-9]+)\1''', page)),
+            None)
+
+    def _real_extract(self, url):
+        id_match = re.match(self._VALID_URL, url).groupdict()
+
+        playlist_id = id_match['id']
+        if url.endswith(playlist_id):
+            url += '/0'
+        next_page = self._get_playlist_url(url, playlist_id)
+        matches = []
+        for count in itertools.count(0):
+            webpage = self._download_webpage(
+                next_page,
+                '%s (+%d)' % (playlist_id, count) if count > 0 else playlist_id)
+
+            vids, stop = self._extract_videos(next_page, playlist_id, count, webpage)
+
+            if vids:
+                matches.append(vids)
+
+            if stop:
+                break
+            next_page = self._get_next_page(next_page, count + 1, webpage)
+            if not next_page:
+                break
+
+        return self.playlist_from_matches(
+            itertools.chain.from_iterable(matches), playlist_id)
+
+
+class XVideosRelatedIE(XVideosPlaylistIE):
+    _VALID_URL = XVideosIE._VALID_URL + r'(?:/[^/]+)*?\#_related-(?P<related>videos|playlists)'
+
+    _TESTS = []
+
+    def _extract_videos(self, url, playlist_id, num, page):
+        id_match = re.match(self._VALID_URL, url).groupdict()
+        related = id_match.get('related')
+        if not related:
+            return super(XVideosRelatedIE, self)._extract_videos(url, playlist_id, num, page)
+
+        if related == 'videos':
+            related_json = self._search_regex(
+                r'(?s)videos_related\s*=\s*(\[.*?])\s*;',
+                page, 'related', default='[]')
+            related_json = self._parse_json(related_json, playlist_id, fatal=False) or []
+            return (self._extract_videos_from_json_list(related_json), True)
+        # playlists
+        related_json = self._download_json(
+            'https://www.xvideos.com/video-playlists/' + playlist_id, playlist_id, fatal=False)
+        return (
+            self._extract_videos_from_json_list(
+                try_get(related_json, lambda x: x['playlists'], list) or [],
+                path='favorite/'),
+            True)
+
+
+class XVideosChannelIE(XVideosPlaylistIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[^/]+\.)?xvideos2?\.com/
+                        (?:
+                            (?:amateur-|pornstar-|model-)?channel|
+                            pornstar
+                        )s/
+                        (?P<id>[^#?/]+)
+                        (?:\#_tab(?P<tab>Videos|Favorites|Playlists|AboutMe)(?:,(?P<sort>[^,]+))?)?
+                    '''
+    _TESTS = [{
+        'url': 'https://www.xvideos.com/pornstar-channels/sienna-west',
+        'playlist_mincount': 5,
+    }, ]
+
+    def _get_playlist_url(self, url, playlist_id):
+        webpage = self._download_webpage(url, playlist_id)
+        id_match = re.match(self._VALID_URL, url).groupdict()
+        tab = (id_match.get('tab') or '').lower()
+        if tab:
+            if tab in ('videos', 'favorites'):
+                url, frag = compat_urlparse.urldefrag(url)
+                if not url.endswith('/'):
+                    url += '/'
+                frag = frag.split(',')
+                url += tab
+                if tab == 'videos':
+                    url += '/' + (frag[1] if len(frag) > 1 else 'best')
+                url += '/0'
+            return url
+
+        # activity
+        conf = self._search_regex(
+            r'(?s)\.\s*xv\s*\.\s*conf\s*=\s*(\{.*?})[\s;]*</script>',
+            webpage, 'xv.conf', default='{}')
+        conf = self._parse_json(conf, playlist_id, fatal=False) or {}
+        act_cat = try_get(conf, lambda x: x['dyn']['page_main_cat'], compat_str) or 'straight'
+        url, _ = compat_urlparse.urldefrag(url)
+        if url.endswith('/'):
+            url = url[:-1]
+        return '%s/activity/%s/0' % (url, act_cat)
+
+    def _extract_videos(self, url, playlist_id, num, page):
+        id_match = re.match(self._VALID_URL, url).groupdict()
+        tab = (id_match.get('tab') or '').lower()
+        if not tab and page.startswith('{'):
+            # activity continuations are JSON documents: collect any video ids
+            # found in the embedded markup and stop once a page comes back empty
+            vids = [
+                'https://www.xvideos.com/video%s' % x
+                for x in re.findall(r'\bvideo_(\d+)', page)]
+            return (vids, len(vids) == 0)
+
+        if tab == 'favorites':
+            return ((
+                'https://www.xvideos.com' + x.group('playlist')
+                for x in re.finditer(r'''<a\s[^>]*?href\s*=\s*('|")(?P<playlist>/favorite/\d+/[^#?]+?)\1''', page)),
+                None)
+
+        return super(XVideosChannelIE, self)._extract_videos(url, playlist_id, num, page)
+
+
+class XVideosSearchIE(XVideosPlaylistIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[^/]+\.)?xvideos2?\.com/
+                        \?k=(?P<id>[^#?/&]+)
+                    '''
+    _TESTS = [{
+        # uninteresting search with probably at least two pages of results,
+        # but not too many more
+        'url': 'http://www.xvideos.com/?k=libya&sort=length',
+        'playlist_mincount': 30,
+    }, ]
+
+    def _get_next_page(self, url, num, page):
+        # search results are paged with a `p` query parameter
+        parsed_url = compat_urlparse.urlparse(url)
+        qs = compat_parse_qs(parsed_url.query)
+        qs['p'] = [num]
+        parsed_url = (
+            list(parsed_url[:4])
+            + [compat_urllib_parse_urlencode(qs, True), None])
+        return compat_urlparse.urlunparse(parsed_url)
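
For a quick manual check of the new playlist extractors, a sketch along these lines can be used. It assumes youtube-dl is installed from this branch; the URL is the one from the XVideosChannelIE test above, and extract_flat keeps the run cheap by not resolving every entry to a full video page.

    import youtube_dl

    opts = {
        'quiet': True,
        # only collect flat playlist entries, do not extract each video
        'extract_flat': 'in_playlist',
    }
    with youtube_dl.YoutubeDL(opts) as ydl:
        info = ydl.extract_info(
            'https://www.xvideos.com/pornstar-channels/sienna-west',
            download=False)

    print(info.get('extractor_key'))             # expected: XVideosChannel
    print(len(list(info.get('entries') or [])))  # expected >= 5 (playlist_mincount)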