From ecd14c832de1663a2b4d06827ba584be30167bda Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 27 Feb 2022 13:33:28 +0000 Subject: [PATCH] [extractor/facebook] Update extraction improvements and fix tests * avoid crashing in parse_attachment() on invalid attachment * ignore empty results in search --- youtube_dl/extractor/facebook.py | 40 +++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 786a95849..a3f1bd325 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -83,7 +83,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Asif Nawab Butt', + 'title': 'Asif', 'uploader': 'Asif Nawab Butt', 'uploader_id': '100005115464446', 'upload_date': '20140506', @@ -173,6 +173,7 @@ class FacebookIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is only available for registered users', }, { 'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471', 'info_dict': { @@ -188,15 +189,16 @@ class FacebookIE(InfoExtractor): 'skip_download': True, }, # TODO: parse this webpage - 'skip': 'Cannot parse webpage well', + 'skip': 'This video is only available for registered users (and cannot parse webpage well)', }, { # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', 'info_dict': { 'id': '202882990186699', 'ext': 'mp4', + 'title': 'birb (O v O\") | Hello? Yes your uber ride is here', - 'description': 'md5:963dee8a667a2b49f2059cf7ab54fe55', + 'description': 'md5:f122681cf504d04be12bb2bdc66c81fd', 'timestamp': 1486035494, 'upload_date': '20170202', 'uploader': 'Elisabeth Ahtn', @@ -308,15 +310,17 @@ class FacebookIE(InfoExtractor): 'skip': 'Requires logging in', }, { # data.node.comet_sections.content.story.attachments[].styles.attachment.media - 'url': 'https://www.facebook.com/groups/352666925484718/permalink/1112505706167499/', + # gone : 'https://www.facebook.com/groups/352666925484718/permalink/1112505706167499/', + 'url': 'https://www.facebook.com/groups/471738070722101/permalink/471799617382613/', 'info_dict': { - 'id': '1080405282779948', + 'id': '101173872166725', 'ext': 'mp4', - 'title': 'Best Chinese Song', - 'timestamp': 1641225430, - 'upload_date': '20220103', - 'uploader_id': '1847991063', - 'uploader': 'Best Chinese Song', + 'title': 'md5:0ac174e60ed9d9a5eec588343f6a889c', + 'description': 'md5:80efdff1371fa5d92758df2fb1b97bfd', + 'upload_date': '20210517', + 'timestamp': 1621247381, + 'uploader': 'md5:34165f897c692f92b13c2cc1eddd8245', + 'uploader_id': '100957655521680', }, }] _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' @@ -417,15 +421,23 @@ class FacebookIE(InfoExtractor): video_title = self._html_search_regex( r'(?s)(.*?)', webpage, 'alternative title', default=None) + + # _html_search_meta() finds '' ahead of later non-empty results + def html_search_meta_non_empty(patterns, html, name, **kwargs): + for p in patterns: + result = self._html_search_meta(p, html, name, **kwargs) + if result: + return result + if not video_title: - video_title = self._html_search_meta( + video_title = html_search_meta_non_empty( ['og:title', 'twitter:title', 'description'], webpage, 'title', default=None) if video_title: video_title = limit_length(video_title, 80) else: video_title = 'Facebook video #%s' % video_id - description = self._html_search_meta( + description = html_search_meta_non_empty( ['description', 'og:description', 'twitter:description'], webpage, 'description', default=None) uploader = clean_html(get_element_by_id( @@ -435,7 +447,7 @@ class FacebookIE(InfoExtractor): timestamp = int_or_none(self._search_regex( r']+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) - thumbnail = self._html_search_meta( + thumbnail = html_search_meta_non_empty( ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None) # some webpages contain unretrievable thumbnail urls # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1 @@ -562,7 +574,7 @@ class FacebookIE(InfoExtractor): entries.append(info) def parse_attachment(attachment, key='media'): - media = attachment.get(key) or {} + media = try_get(attachment, lambda x: x[key], dict) or {} if media.get('__typename') == 'Video': return parse_graphql_video(media)