From 8bc4c6350e0a9c54e79ac3bd28ce97054d8c000d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 Dec 2020 21:58:26 +0700 Subject: [PATCH] [spangbang:playlist] Fix extraction (closes #24087) --- youtube_dl/extractor/spankbang.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index e3ec8602d..8a7102d0c 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -13,6 +13,7 @@ from ..utils import ( str_to_int, url_or_none, urlencode_postdata, + urljoin, ) @@ -166,30 +167,33 @@ class SpankBangIE(InfoExtractor): class SpankBangPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/playlist/[^/]+' + _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/playlist/(?P[^/]+)' _TEST = { 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', 'info_dict': { 'id': 'ug0k', 'title': 'Big Ass Titties', }, - 'playlist_mincount': 50, + 'playlist_mincount': 40, } def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + display_id = mobj.group('display_id') webpage = self._download_webpage( url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) entries = [self.url_result( - 'https://spankbang.com/%s/video' % video_id, - ie=SpankBangIE.ie_key(), video_id=video_id) - for video_id in orderedSet(re.findall( - r']+\bhref=["\']/?([\da-z]+)/play/', webpage))] + urljoin(url, mobj.group('path')), + ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) + for mobj in re.finditer( + r']+\bhref=(["\'])(?P/?[\da-z]+-(?P[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' + % re.escape(display_id), webpage)] title = self._html_search_regex( - r'

([^<]+)\s+playlist

', webpage, 'playlist title', + r'

([^<]+)\s+playlist\s*<', webpage, 'playlist title', fatal=False) return self.playlist_result(entries, playlist_id, title)