[XVideos] Update XVideosIE including features from PR #30689

* add uploader, tag, performer and view_count extraction (closes #30689)
* add dis/like_count extraction
pull/30774/head
dirkf 7 months ago
parent 35230a2b5a
commit b04d78068d

@ -4,22 +4,39 @@ from __future__ import unicode_literals
import re
import itertools
from .common import InfoExtractor
from math import isinf
from .common import (
InfoExtractor,
SearchInfoExtractor,
)
from ..compat import (
compat_parse_qs,
compat_kwargs,
compat_str,
compat_urlparse,
compat_urllib_parse_unquote,
compat_urllib_parse_urlencode,
compat_urllib_request,
)
from ..utils import (
clean_html,
determine_ext,
extract_attributes,
ExtractorError,
get_element_by_class,
get_element_by_id,
get_elements_by_class,
int_or_none,
join_nonempty,
LazyList,
merge_dicts,
parse_count,
parse_duration,
try_get,
remove_end,
remove_start,
T,
traverse_obj,
try_call,
txt_or_none,
url_basename,
urljoin,
)
@ -27,14 +44,18 @@ from ..utils import (
class XVideosIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
(?:[^/]+\.)?xvideos2?\.com/(?:video|prof-video-click/model/[^/]+/)|
(?:www\.)?xvideos\.es/video|
(?:www|flashservice)\.xvideos\.com/embedframe/|
static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video=
)
(?P<id>\d+)
(?:
https?://
(?:
# xvideos\d+\.com redirects to xvideos.com
# (?P<country>[a-z]{2})\.xvideos.com too: catch it anyway
(?:[^/]+\.)?xvideos\.com/(?:video|prof-video-click/model/[^/]+/)|
(?:www\.)?xvideos\.es/video|
(?:www|flashservice)\.xvideos\.com/embedframe/|
static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video=
)|
xvideos:
)(?P<id>\d+)
'''
_TESTS = [{
'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl',
@ -45,19 +66,51 @@ class XVideosIE(InfoExtractor):
'title': 'Biker Takes his Girl',
'duration': 108,
'age_limit': 18,
}
},
'skip': 'Sorry, this video has been deleted',
}, {
'url': 'https://www.xvideos.com/video78250973/hot_blonde_gets_excited_in_the_middle_of_the_club.',
'md5': '0bc6e46ef55907533ffa0542e45958b6',
'info_dict': {
'id': '78250973',
'ext': 'mp4',
'title': 'Hot blonde gets excited in the middle of the club.',
'uploader': 'Deny Barbie Official',
'age_limit': 18,
'duration': 302,
},
}, {
# Broken HLS formats
'url': 'https://www.xvideos.com/video65982001/what_s_her_name',
'md5': 'b82d7d7ef7d65a84b1fa6965f81f95a5',
'md5': '18ff7d57d4edc3c908fc5b06166dd63d',
'info_dict': {
'id': '65982001',
'ext': 'mp4',
'title': 'what\'s her name?',
'duration': 120,
'uploader': 'Skakdjskdk',
'age_limit': 18,
'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg',
'duration': 120,
'thumbnail': r're:^https://img-[a-z]+.xvideos-cdn.com/.+\.jpg',
}
}, {
# from PR #30689
'url': 'https://www.xvideos.com/video50011247/when_girls_play_-_adriana_chechik_abella_danger_-_tradimento_-_twistys',
'md5': 'aa54f96311768b3a8bfe54b8c8fda070',
'info_dict': {
'id': '50011247',
'ext': 'mp4',
'title': 'When Girls Play - (Adriana Chechik, Abella Danger) - Betrayal - Twistys',
'duration': 720,
'age_limit': 18,
'tags': ['lesbian', 'teen', 'hardcore', 'latina', 'rough', 'squirt', 'big-ass', 'cheater', 'twistys', 'cheat', 'ass-play', 'when-girls-play'],
'creator': 'Twistys',
'uploader': 'Twistys',
'uploader_url': 'https://www.xvideos.com/channels/twistys1',
'cast': [{'given_name': 'Adriana Chechik', 'url': 'https://www.xvideos.com/pornstars/adriana-chechik'}, {'given_name': 'Abella Danger', 'url': 'https://www.xvideos.com/pornstars/abella-danger'}],
'view_count': 'lambda c: c >= 4038715',
'like_count': 'lambda c: c >= 8800',
'dislike_count': 'lambda c: c >= 3100',
},
}, {
'url': 'https://flashservice.xvideos.com/embedframe/4588838',
'only_matching': True,
@ -138,7 +191,7 @@ class XVideosIE(InfoExtractor):
duration = int_or_none(self._og_search_property(
'duration', webpage, default=None)) or parse_duration(
self._search_regex(
r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)',
r'''<span [^>]*\bclass\s*=\s*["']duration\b[^>]+>.*?(\d[^<]+)''',
webpage, 'duration', fatal=False))
formats = []
@ -169,15 +222,57 @@ class XVideosIE(InfoExtractor):
self._sort_formats(formats)
return {
# adapted from PR #30689
ignore_tags = set(('xvideos', 'xvideos.com', 'x videos', 'x video', 'porn', 'video', 'videos'))
tags = self._html_search_meta('keywords', webpage) or ''
tags = [t for t in re.split(r'\s*,\s*', tags) if t not in ignore_tags]
mobj = re.search(
r'''(?sx)
(?P<ul><a\b[^>]+\bclass\s*=\s*["'](?:[\w-]+\s+)*uploader-tag(?:\s+[\w-]+)*[^>]+>)
\s*<span\s+class\s*=\s*["']name\b[^>]+>\s*(?P<name>.+?)\s*<
''', webpage)
creator = None
uploader_url = None
if mobj:
uploader_url = urljoin(url, extract_attributes(mobj.group('ul')).get('href'))
creator = mobj.group('name')
def get_actor_data(mobj):
ul_url = extract_attributes(mobj.group('ul')).get('href')
if '/pornstars/' in ul_url:
return {
'given_name': mobj.group('name'),
'url': urljoin(url, ul_url),
}
actors = traverse_obj(re.finditer(
r'''(?sx)
(?P<ul><a\b[^>]+\bclass\s*=\s*["'](?:[\w-]+\s+)*profile(?:\s+[\w-]+)*[^>]+>)
\s*<span\s+class\s*=\s*["']name\b[^>]+>\s*(?P<name>.+?)\s*<
''', webpage), (Ellipsis, T(get_actor_data)))
return merge_dicts({
'id': video_id,
'formats': formats,
'title': title,
'duration': duration,
'thumbnails': thumbnails,
'age_limit': 18,
}
}, {
'duration': duration,
'thumbnails': thumbnails or None,
'tags': tags or None,
'creator': creator,
'uploader': creator,
'uploader_url': uploader_url,
'cast': actors or None,
'view_count': parse_count(get_element_by_class(
'mobile-hide', get_element_by_id('v-views', webpage))),
'like_count': parse_count(get_element_by_class('rating-good-nbr', webpage)),
'dislike_count': parse_count(get_element_by_class('rating-bad-nbr', webpage)),
}, {
'channel': creator,
'channel_url': uploader_url,
} if '/channels/' in (uploader_url or '') else {})
class XVideosPlaylistIE(InfoExtractor):
_VALID_URL = r'''(?x)

Loading…
Cancel
Save