From 317f7ab634174666e458807fa309a2e7ba459267 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 Jul 2016 00:55:43 +0700 Subject: [PATCH] [YoutubeDL] Fix format selection with filters (Closes #10083) --- test/test_YoutubeDL.py | 34 ++++++++++++++++++++++ youtube_dl/YoutubeDL.py | 63 ++++++++++++++++++++++++++++++----------- 2 files changed, 81 insertions(+), 16 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index ca25025e2..0dfe25c00 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -335,6 +335,40 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], f1['format_id']) + def test_audio_only_extractor_format_selection(self): + # For extractors with incomplete formats (all formats are audio-only or + # video-only) best and worst should fallback to corresponding best/worst + # video-only or audio-only formats (as per + # https://github.com/rg3/youtube-dl/pull/5556) + formats = [ + {'format_id': 'low', 'ext': 'mp3', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'high', 'ext': 'mp3', 'preference': 2, 'vcodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + ydl = YDL({'format': 'best'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'high') + + ydl = YDL({'format': 'worst'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'low') + + def test_format_not_available(self): + formats = [ + {'format_id': 'regular', 'ext': 'mp4', 'height': 360, 'url': TEST_URL}, + {'format_id': 'video', 'ext': 'mp4', 'height': 720, 'acodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + # This must fail since complete video-audio format does not match filter + # and extractor does not provide incomplete only formats (i.e. only + # video-only or audio-only). + ydl = YDL({'format': 'best[height>360]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + def test_invalid_format_specs(self): def assert_syntax_error(format_spec): ydl = YDL({'format': format_spec}) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ba72ec6f3..cf9cd8297 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -5,6 +5,7 @@ from __future__ import absolute_import, unicode_literals import collections import contextlib +import copy import datetime import errno import fileinput @@ -1051,9 +1052,9 @@ class YoutubeDL(object): if isinstance(selector, list): fs = [_build_selector_function(s) for s in selector] - def selector_function(formats): + def selector_function(ctx): for f in fs: - for format in f(formats): + for format in f(ctx): yield format return selector_function elif selector.type == GROUP: @@ -1061,17 +1062,17 @@ class YoutubeDL(object): elif selector.type == PICKFIRST: fs = [_build_selector_function(s) for s in selector.selector] - def selector_function(formats): + def selector_function(ctx): for f in fs: - picked_formats = list(f(formats)) + picked_formats = list(f(ctx)) if picked_formats: return picked_formats return [] elif selector.type == SINGLE: format_spec = selector.selector - def selector_function(formats): - formats = list(formats) + def selector_function(ctx): + formats = list(ctx['formats']) if not formats: return if format_spec == 'all': @@ -1084,9 +1085,10 @@ class YoutubeDL(object): if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] if audiovideo_formats: yield audiovideo_formats[format_idx] - # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format - elif (all(f.get('acodec') != 'none' for f in formats) or - all(f.get('vcodec') != 'none' for f in formats)): + # for extractors with incomplete formats (audio only (soundcloud) + # or video only (imgur)) we will fallback to best/worst + # {video,audio}-only format + elif ctx['incomplete_formats']: yield formats[format_idx] elif format_spec == 'bestaudio': audio_formats = [ @@ -1160,17 +1162,18 @@ class YoutubeDL(object): } video_selector, audio_selector = map(_build_selector_function, selector.selector) - def selector_function(formats): - formats = list(formats) - for pair in itertools.product(video_selector(formats), audio_selector(formats)): + def selector_function(ctx): + for pair in itertools.product( + video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))): yield _merge(pair) filters = [self._build_format_filter(f) for f in selector.filters] - def final_selector(formats): + def final_selector(ctx): + ctx_copy = copy.deepcopy(ctx) for _filter in filters: - formats = list(filter(_filter, formats)) - return selector_function(formats) + ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) + return selector_function(ctx_copy) return final_selector stream = io.BytesIO(format_spec.encode('utf-8')) @@ -1377,7 +1380,35 @@ class YoutubeDL(object): req_format_list.append('best') req_format = '/'.join(req_format_list) format_selector = self.build_format_selector(req_format) - formats_to_download = list(format_selector(formats)) + + # While in format selection we may need to have an access to the original + # format set in order to calculate some metrics or do some processing. + # For now we need to be able to guess whether original formats provided + # by extractor are incomplete or not (i.e. whether extractor provides only + # video-only or audio-only formats) for proper formats selection for + # extractors with such incomplete formats (see + # https://github.com/rg3/youtube-dl/pull/5556). + # Since formats may be filtered during format selection and may not match + # the original formats the results may be incorrect. Thus original formats + # or pre-calculated metrics should be passed to format selection routines + # as well. + # We will pass a context object containing all necessary additional data + # instead of just formats. + # This fixes incorrect format selection issue (see + # https://github.com/rg3/youtube-dl/issues/10083). + incomplete_formats = all( + # All formats are video-only or + f.get('vcodec') != 'none' and f.get('acodec') == 'none' or + # all formats are audio-only + f.get('vcodec') == 'none' and f.get('acodec') != 'none' + for f in formats) + + ctx = { + 'formats': formats, + 'incomplete_formats': incomplete_formats, + } + + formats_to_download = list(format_selector(ctx)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True)