[comedycentral] fix extraction(closes #27905)

3 years ago · fa8f6d8580
parent 3bb7769c40
commit fa8f6d8580
4 changed files with 38 additions and 146 deletions
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@ -1,142 +1,51 @@
 from __future__ import unicode_literals

 from .mtv import MTVServicesInfoExtractor
-from .common import InfoExtractor


 class ComedyCentralIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
-        (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes)))
-        /(?P<title>.*)'''
+    _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})'
    _FEED_URL = 'http://comedycentral.com/feeds/mrss/'

    _TESTS = [{
-        'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
-        'md5': 'c4f48e9eda1b16dd10add0744344b6d8',
+        'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike',
+        'md5': 'b8acb347177c680ff18a292aa2166f80',
        'info_dict': {
-            'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354',
+            'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025',
            'ext': 'mp4',
-            'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother',
-            'description': 'After a certain point, breastfeeding becomes c**kblocking.',
-            'timestamp': 1376798400,
-            'upload_date': '20130818',
+            'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike',
+            'description': 'md5:5334307c433892b85f4f5e5ac9ef7498',
+            'timestamp': 1598670000,
+            'upload_date': '20200829',
        },
    }, {
-        'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview',
+        'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314',
        'only_matching': True,
-    }]
-
-
-class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
-        (?:full-episodes|shows(?=/[^/]+/full-episodes))
-        /(?P<id>[^?]+)'''
-    _FEED_URL = 'http://comedycentral.com/feeds/mrss/'
-
-    _TESTS = [{
-        'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028',
-        'info_dict': {
-            'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."',
-            'title': 'November 28, 2016 - Ryan Speedo Green',
-        },
-        'playlist_count': 4,
    }, {
-        'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-        webpage = self._download_webpage(url, playlist_id)
-        mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1')
-        videos_info = self._get_videos_info(mgid)
-        return videos_info
-
-
-class ToshIE(MTVServicesInfoExtractor):
-    IE_DESC = 'Tosh.0'
-    _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)'
-    _FEED_URL = 'http://tosh.cc.com/feeds/mrss'
-
-    _TESTS = [{
-        'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans',
-        'info_dict': {
-            'description': 'Tosh asked fans to share their summer plans.',
-            'title': 'Twitter Users Share Summer Plans',
-        },
-        'playlist': [{
-            'md5': 'f269e88114c1805bb6d7653fecea9e06',
-            'info_dict': {
-                'id': '90498ec2-ed00-11e0-aca6-0026b9414f30',
-                'ext': 'mp4',
-                'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans',
-                'description': 'Tosh asked fans to share their summer plans.',
-                'thumbnail': r're:^https?://.*\.jpg',
-                # It's really reported to be published on year 2077
-                'upload_date': '20770610',
-                'timestamp': 3390510600,
-                'subtitles': {
-                    'en': 'mincount:3',
-                },
-            },
-        }]
-    }, {
-        'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp',
+        'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate',
        'only_matching': True,
    }]


 class ComedyCentralTVIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})'
    _TESTS = [{
-        'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4',
+        'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1',
        'info_dict': {
-            'id': 'local_playlist-f99b626bdfe13568579a',
-            'ext': 'flv',
-            'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1',
-        },
-        'params': {
-            # rtmp download
-            'skip_download': True,
+            'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285',
+            'ext': 'mp4',
+            'title': 'Josh Investigates',
+            'description': 'Steht uns das Ende der Welt bevor?',
        },
-    }, {
-        'url': 'http://www.comedycentral.tv/shows/1074-workaholics',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus',
-        'only_matching': True,
    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        mrss_url = self._search_regex(
-            r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1',
-            webpage, 'mrss url', group='url')
-
-        return self._get_videos_info_from_url(mrss_url, video_id)
-
-
-class ComedyCentralShortnameIE(InfoExtractor):
-    _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$'
-    _TESTS = [{
-        'url': ':tds',
-        'only_matching': True,
-    }, {
-        'url': ':thedailyshow',
-        'only_matching': True,
-    }, {
-        'url': ':theopposition',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        shortcut_map = {
-            'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
-            'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
-            'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes',
+    _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
+    _GEO_COUNTRIES = ['DE']
+
+    def _get_feed_query(self, uri):
+        return {
+            'accountOverride': 'intl.mtvi.com',
+            'arcEp': 'web.cc.tv',
+            'ep': 'b9032c3a',
+            'imageEp': 'web.cc.tv',
+            'mgid': uri,
        }
-        return self.url_result(shortcut_map[video_id])
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -235,11 +235,8 @@ from .cnn import (
 )
 from .coub import CoubIE
 from .comedycentral import (
-    ComedyCentralFullEpisodesIE,
    ComedyCentralIE,
-    ComedyCentralShortnameIE,
    ComedyCentralTVIE,
-    ToshIE,
 )
 from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
 from .commonprotocols import (
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@ -253,6 +253,10 @@ class MTVServicesInfoExtractor(InfoExtractor):

        return try_get(feed, lambda x: x['result']['data']['id'], compat_str)

+    @staticmethod
+    def _extract_child_with_type(parent, t):
+        return next(c for c in parent['children'] if c.get('type') == t)
+
    def _extract_mgid(self, webpage):
        try:
            # the url can be http://media.mtvnservices.com/fb/{mgid}.swf
@ -278,6 +282,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
        if not mgid:
            mgid = self._extract_triforce_mgid(webpage)

+        if not mgid:
+            data = self._parse_json(self._search_regex(
+                r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
+            main_container = self._extract_child_with_type(data, 'MainContainer')
+            video_player = self._extract_child_with_type(main_container, 'VideoPlayer')
+            mgid = video_player['props']['media']['video']['config']['uri']
+
        return mgid

    def _real_extract(self, url):
@ -349,18 +360,6 @@ class MTVIE(MTVServicesInfoExtractor):
        'only_matching': True,
    }]

-    @staticmethod
-    def extract_child_with_type(parent, t):
-        children = parent['children']
-        return next(c for c in children if c.get('type') == t)
-
-    def _extract_mgid(self, webpage):
-        data = self._parse_json(self._search_regex(
-            r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
-        main_container = self.extract_child_with_type(data, 'MainContainer')
-        video_player = self.extract_child_with_type(main_container, 'VideoPlayer')
-        return video_player['props']['media']['video']['config']['uri']
-

 class MTVJapanIE(MTVServicesInfoExtractor):
    IE_NAME = 'mtvjapan'
--- a/youtube_dl/extractor/spike.py
+++ b/youtube_dl/extractor/spike.py
@ -20,9 +20,6 @@ class BellatorIE(MTVServicesInfoExtractor):
    _FEED_URL = 'http://www.bellator.com/feeds/mrss/'
    _GEO_COUNTRIES = ['US']

-    def _extract_mgid(self, webpage):
-        return self._extract_triforce_mgid(webpage)
-

 class ParamountNetworkIE(MTVServicesInfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)'
@ -46,16 +43,6 @@ class ParamountNetworkIE(MTVServicesInfoExtractor):
    def _get_feed_query(self, uri):
        return {
            'arcEp': 'paramountnetwork.com',
+            'imageEp': 'paramountnetwork.com',
            'mgid': uri,
        }
-
-    def _extract_mgid(self, webpage):
-        root_data = self._parse_json(self._search_regex(
-            r'window\.__DATA__\s*=\s*({.+})',
-            webpage, 'data'), None)
-
-        def find_sub_data(data, data_type):
-            return next(c for c in data['children'] if c.get('type') == data_type)
-
-        c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer')
-        return c['props']['media']['video']['config']['uri']