From 973f76cf7b2b02426d4d5335cd149aabe38c5bbb Mon Sep 17 00:00:00 2001 From: df Date: Sun, 10 Oct 2021 12:42:51 +0100 Subject: [PATCH] [utils] Detect extension from any RFC Content-Disposition syntax Add support for unquoted token and RFC 5987 extended parameter syntax --- test/test_utils.py | 27 +++++++++++++++++++++++++++ youtube_dl/utils.py | 20 +++++++++++++++++--- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 58a79c8a8..8df794c13 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2069,6 +2069,33 @@ Line 1 'Content-Type': b'audio/mp3', }) self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition and unquoted filename + urlh = UrlHandle({ + 'Content-Disposition': b'attachment; filename=unquoted_filename_token.mp3', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition including spacing and uppercase + urlh = UrlHandle({ + 'Content-Disposition': b'ATTACHMENT; FileName = unquoted_filename_token.mp3', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition and extended filename parameter syntax + urlh = UrlHandle({ + 'Content-Disposition': b"attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3", + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition and both filename parameter syntaxes + urlh = UrlHandle({ + 'Content-Disposition': b'''attachment; filename="should ignore.mp4"; + FileName* = iso8859-15''costs%201%A4%20filename.mp3''', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition and 'wrong' order of both syntaxes + urlh = UrlHandle({ + 'Content-Disposition': b'''attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3; + filename="should ignore.mp4"''', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') if __name__ == '__main__': diff --git a/youtube_dl/utils.py 
b/youtube_dl/utils.py index ff3479fbd..e059f4a72 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -66,9 +66,10 @@ from .compat import ( compat_urllib_HTTPError, compat_urllib_parse, compat_urllib_parse_parse_qs as compat_parse_qs, + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, - compat_urllib_parse_unquote_plus, compat_urllib_request, compat_xpath, ) @@ -4755,9 +4756,22 @@ def urlhandle_detect_ext(url_handle): cd = getheader('Content-Disposition') if cd: - m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) + m = re.match(r'''(?xi) + attachment;\s* + (?:filename\s*=[^;]+?;\s*)? # possible initial filename=...;, ignored + filename(?P<x>\*)?\s*=\s* # filename/filename* = + (?(x)(?P<charset>\S+?)'[\w-]*'|(?P<q>")?) # if * then charset'...' else maybe " + (?P<filename>(?(q)[^"]+(?=")|[^\s;]+)) # actual name of file + ''', cd) if m: - e = determine_ext(m.group('filename'), default_ext=None) + m = m.groupdict() + filename = m.get('filename') + if m.get('x'): + try: + filename = compat_urllib_parse_unquote(filename, encoding=m.get('charset', 'utf-8')) + except LookupError: # unrecognised character set name + pass + e = determine_ext(filename, default_ext=None) if e: return e