# coding: utf-8
# Extractor for YouTube watch pages archived on web.archive.org
# (see https://github.com/ytdl-org/youtube-dl/issues/13655).
# It must be registered in youtube_dl/extractor/extractors.py with:
#     from .webarchive import WebArchiveIE
from __future__ import unicode_literals

from .common import InfoExtractor


class WebArchiveIE(InfoExtractor):
    """Extract archived YouTube videos from web.archive.org snapshots.

    The snapshot page is only used for metadata (title and author); the
    media URL itself is rebuilt from the YouTube video ID using the
    archive's link translator.
    """

    # The pattern is anchored right after the video ID, so only URLs that
    # end with the ID (no trailing query parameters) are accepted.
    _VALID_URL = r'https?:\/\/(?:www\.)?web\.archive\.org\/web\/([0-9]+)\/https?:\/\/(?:www\.)?youtube\.com\/watch\?v=(?P<id>[0-9A-Za-z_-]{1,11})$'
    _TEST = {
        'url': 'https://web.archive.org/web/20150415002341/https://www.youtube.com/watch?v=aYAGB11YrSs',
        'md5': 'ec44dc1177ae37189a8606d4ca1113ae',
        'info_dict': {
            'url': 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/aYAGB11YrSs',
            'id': 'aYAGB11YrSs',
            'ext': 'mp4',
            'title': 'Team Fortress 2 - Sandviches!',
            'author': 'Zeurel',
            'uploader': 'Zeurel',
        }
    }

    # Link translator mentioned in
    # https://github.com/ytdl-org/youtube-dl/issues/13655
    _LINK_STUB = 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/'
    # Suffix appended to the page title of archived watch pages.
    _TITLE_SUFFIX = ' - YouTube'

    def _real_extract(self, url):
        """Build the info dict for an archived YouTube watch page.

        :param url: snapshot URL matching ``_VALID_URL``.
        :returns: dict with the keys ``url``, ``id``, ``title``,
            ``author``, ``uploader`` and ``ext``.
        :raises: whatever ``_download_webpage`` raises on download failure
            and whatever ``_html_search_regex`` raises when the title is
            not found (the title search is fatal; the author search is not).
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<title>(.+?)</title>', webpage, 'title').strip()
        if title.endswith(self._TITLE_SUFFIX):
            title = title[:-len(self._TITLE_SUFFIX)]

        # The author is optional metadata: a page without it (or with an
        # unexpected layout) must not abort the whole extraction.
        author = self._html_search_regex(
            r'"author":"([^"]+)"', webpage, 'author', fatal=False)
        if author is not None:
            author = author.strip()

        # _VALID_URL ends right after the ID, so the matched ID is exactly
        # what follows "watch?v=" in the URL; no extra slicing is needed.
        return {
            'url': self._LINK_STUB + video_id,
            'id': video_id,
            'title': title,
            # 'author' is kept for backward compatibility; 'uploader' is
            # added alongside it with the same value.
            'author': author,
            'uploader': author,
            'ext': 'mp4',
        }