From 836463013c363d38a2e24dabe40b000df76079f1 Mon Sep 17 00:00:00 2001 From: dirkf Date: Mon, 6 Jun 2022 14:23:54 +0100 Subject: [PATCH] [SpankBang] Rework SpankBangPlaylistIE with pagination --- youtube_dl/extractor/spankbang.py | 57 +++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index e33d753c5..b3bff8ba1 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -1,11 +1,16 @@ +# coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor from ..utils import ( determine_ext, + extract_attributes, ExtractorError, + get_element_by_class, + get_element_by_id, merge_dicts, parse_duration, parse_resolution, @@ -173,32 +178,56 @@ class SpankBangIE(InfoExtractor): class SpankBangPlaylistIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/playlist/(?P[^/]+)' - _TEST = { + _TESTS = [{ 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', 'info_dict': { 'id': 'ug0k', 'title': 'Big Ass Titties', }, - 'playlist_mincount': 40, - } + 'playlist_mincount': 35, + }, { + # pagination required + 'url': 'https://spankbang.com/51wxk/playlist/dance', + 'info_dict': { + 'id': '51wxk', + 'title': 'Dance', + }, + 'playlist_mincount': 60, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) playlist_id = mobj.group('id') display_id = mobj.group('display_id') - webpage = self._download_webpage( - url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) - - entries = [self.url_result( - urljoin(url, mobj.group('path')), - ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) - for mobj in re.finditer( - r']+\bhref=(["\'])(?P/?[\da-z]+-(?P[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' - % re.escape(display_id), webpage)] + webpage = self._download_webpage(url, playlist_id) + + def _entries(url, webpage=None): + for ii in itertools.count(1): + if not webpage: + webpage = self._download_webpage( + url, playlist_id, + note='Downloading playlist page %d' % (ii, ), + fatal=False) + if not webpage: + break + # search
...
.innerHTML + for mobj in re.finditer( + r''']*?\bclass\s*=\s*('|")(?:(?:(?!\1).)+?\s)?\s*thumb\b[^>]*>''', + get_element_by_id('container', webpage) or webpage): + item_url = extract_attributes(mobj.group(0)).get('href') + if item_url: + yield urljoin(url, item_url) + next_url = self._search_regex( + r'''\bhref\s*=\s*(["'])(?P(?!\1).+?)/?\1''', + get_element_by_class('next', webpage) or '', + 'continuation page', group='path', default=None) + if next_url is None or next_url in url: + break + url, webpage = urljoin(url, next_url + '/'), None title = self._html_search_regex( r'

([^<]+)\s+playlist\s*<', webpage, 'playlist title', - fatal=False) + fatal=False) or re.sub(r'(\w)\+(\w)', r'\1 \2', display_id).title() - return self.playlist_result(entries, playlist_id, title) + return self.playlist_from_matches(_entries(url, webpage), playlist_id, title, ie=SpankBangIE.ie_key())