From b04d78068d43a00cb9e5bb4f9ff07698d7d86b39 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 16 Oct 2023 03:34:50 +0100 Subject: [PATCH] [XVideos] Update XVideosIE including features from PR #30689 * add uploader, tag, performer and view_count extraction (closes #30689) * add dis/like_count extraction --- youtube_dl/extractor/xvideos.py | 139 +++++++++++++++++++++++++++----- 1 file changed, 117 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index b736e18b5..536e9ae47 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -4,22 +4,39 @@ from __future__ import unicode_literals import re import itertools -from .common import InfoExtractor +from math import isinf + +from .common import ( + InfoExtractor, + SearchInfoExtractor, +) from ..compat import ( - compat_parse_qs, + compat_kwargs, compat_str, compat_urlparse, compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, + compat_urllib_request, ) from ..utils import ( clean_html, determine_ext, extract_attributes, ExtractorError, + get_element_by_class, + get_element_by_id, + get_elements_by_class, int_or_none, + join_nonempty, + LazyList, + merge_dicts, + parse_count, parse_duration, - try_get, + remove_end, + remove_start, + T, + traverse_obj, + try_call, + txt_or_none, url_basename, urljoin, ) @@ -27,14 +44,18 @@ from ..utils import ( class XVideosIE(InfoExtractor): _VALID_URL = r'''(?x) - https?:// - (?: - (?:[^/]+\.)?xvideos2?\.com/(?:video|prof-video-click/model/[^/]+/)| - (?:www\.)?xvideos\.es/video| - (?:www|flashservice)\.xvideos\.com/embedframe/| - static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= - ) - (?P\d+) + (?: + https?:// + (?: + # xvideos\d+\.com redirects to xvideos.com + # (?P[a-z]{2})\.xvideos.com too: catch it anyway + (?:[^/]+\.)?xvideos\.com/(?:video|prof-video-click/model/[^/]+/)| + (?:www\.)?xvideos\.es/video| + (?:www|flashservice)\.xvideos\.com/embedframe/| + static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= + )| + xvideos: + )(?P\d+) ''' _TESTS = [{ 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', @@ -45,19 +66,51 @@ class XVideosIE(InfoExtractor): 'title': 'Biker Takes his Girl', 'duration': 108, 'age_limit': 18, - } + }, + 'skip': 'Sorry, this video has been deleted', + }, { + 'url': 'https://www.xvideos.com/video78250973/hot_blonde_gets_excited_in_the_middle_of_the_club.', + 'md5': '0bc6e46ef55907533ffa0542e45958b6', + 'info_dict': { + 'id': '78250973', + 'ext': 'mp4', + 'title': 'Hot blonde gets excited in the middle of the club.', + 'uploader': 'Deny Barbie Official', + 'age_limit': 18, + 'duration': 302, + }, }, { # Broken HLS formats 'url': 'https://www.xvideos.com/video65982001/what_s_her_name', - 'md5': 'b82d7d7ef7d65a84b1fa6965f81f95a5', + 'md5': '18ff7d57d4edc3c908fc5b06166dd63d', 'info_dict': { 'id': '65982001', 'ext': 'mp4', 'title': 'what\'s her name?', - 'duration': 120, + 'uploader': 'Skakdjskdk', 'age_limit': 18, - 'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg', + 'duration': 120, + 'thumbnail': r're:^https://img-[a-z]+.xvideos-cdn.com/.+\.jpg', } + }, { + # from PR #30689 + 'url': 'https://www.xvideos.com/video50011247/when_girls_play_-_adriana_chechik_abella_danger_-_tradimento_-_twistys', + 'md5': 'aa54f96311768b3a8bfe54b8c8fda070', + 'info_dict': { + 'id': '50011247', + 'ext': 'mp4', + 'title': 'When Girls Play - (Adriana Chechik, Abella Danger) - Betrayal - Twistys', + 'duration': 720, + 'age_limit': 18, + 'tags': ['lesbian', 'teen', 'hardcore', 'latina', 'rough', 'squirt', 'big-ass', 'cheater', 'twistys', 'cheat', 'ass-play', 'when-girls-play'], + 'creator': 'Twistys', + 'uploader': 'Twistys', + 'uploader_url': 'https://www.xvideos.com/channels/twistys1', + 'cast': [{'given_name': 'Adriana Chechik', 'url': 'https://www.xvideos.com/pornstars/adriana-chechik'}, {'given_name': 'Abella Danger', 'url': 'https://www.xvideos.com/pornstars/abella-danger'}], + 'view_count': 'lambda c: c >= 4038715', + 'like_count': 'lambda c: c >= 8800', + 'dislike_count': 'lambda c: c >= 3100', + }, }, { 'url': 'https://flashservice.xvideos.com/embedframe/4588838', 'only_matching': True, @@ -138,7 +191,7 @@ class XVideosIE(InfoExtractor): duration = int_or_none(self._og_search_property( 'duration', webpage, default=None)) or parse_duration( self._search_regex( - r']+class=["\']duration["\'][^>]*>.*?(\d[^<]+)', + r''']*\bclass\s*=\s*["']duration\b[^>]+>.*?(\d[^<]+)''', webpage, 'duration', fatal=False)) formats = [] @@ -169,15 +222,57 @@ class XVideosIE(InfoExtractor): self._sort_formats(formats) - return { + # adapted from PR #30689 + ignore_tags = set(('xvideos', 'xvideos.com', 'x videos', 'x video', 'porn', 'video', 'videos')) + tags = self._html_search_meta('keywords', webpage) or '' + tags = [t for t in re.split(r'\s*,\s*', tags) if t not in ignore_tags] + + mobj = re.search( + r'''(?sx) + (?P
    ]+\bclass\s*=\s*["'](?:[\w-]+\s+)*uploader-tag(?:\s+[\w-]+)*[^>]+>) + \s*]+>\s*(?P.+?)\s*< + ''', webpage) + creator = None + uploader_url = None + if mobj: + uploader_url = urljoin(url, extract_attributes(mobj.group('ul')).get('href')) + creator = mobj.group('name') + + def get_actor_data(mobj): + ul_url = extract_attributes(mobj.group('ul')).get('href') + if '/pornstars/' in ul_url: + return { + 'given_name': mobj.group('name'), + 'url': urljoin(url, ul_url), + } + + actors = traverse_obj(re.finditer( + r'''(?sx) + (?P
      ]+\bclass\s*=\s*["'](?:[\w-]+\s+)*profile(?:\s+[\w-]+)*[^>]+>) + \s*]+>\s*(?P.+?)\s*< + ''', webpage), (Ellipsis, T(get_actor_data))) + + return merge_dicts({ 'id': video_id, 'formats': formats, 'title': title, - 'duration': duration, - 'thumbnails': thumbnails, 'age_limit': 18, - } - + }, { + 'duration': duration, + 'thumbnails': thumbnails or None, + 'tags': tags or None, + 'creator': creator, + 'uploader': creator, + 'uploader_url': uploader_url, + 'cast': actors or None, + 'view_count': parse_count(get_element_by_class( + 'mobile-hide', get_element_by_id('v-views', webpage))), + 'like_count': parse_count(get_element_by_class('rating-good-nbr', webpage)), + 'dislike_count': parse_count(get_element_by_class('rating-bad-nbr', webpage)), + }, { + 'channel': creator, + 'channel_url': uploader_url, + } if '/channels/' in (uploader_url or '') else {}) class XVideosPlaylistIE(InfoExtractor): _VALID_URL = r'''(?x)