[utils] Detect extension from any RFC 6266 Content-Disposition filename syntax

Add support for unquoted token and RFC 5987 extended parameter syntax
3 years ago · 973f76cf7b
parent 7990d1e630
commit 973f76cf7b
2 changed files with 44 additions and 3 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -2069,6 +2069,33 @@ Line 1
            'Content-Type': b'audio/mp3',
        })
        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition and unquoted filename
+        urlh = UrlHandle({
+            'Content-Disposition': b'attachment; filename=unquoted_filename_token.mp3',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition including spacing and uppercase
+        urlh = UrlHandle({
+            'Content-Disposition': b'ATTACHMENT; FileName = unquoted_filename_token.mp3',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition and extended filename parameter syntax
+        urlh = UrlHandle({
+            'Content-Disposition': b"attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3",
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition and both filename parameter syntaxes
+        urlh = UrlHandle({
+            'Content-Disposition': b'''attachment; filename="should ignore.mp4";
+             FileName* = iso8859-15''costs%201%A4%20filename.mp3''',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition and 'wrong' order of both syntaxes
+        urlh = UrlHandle({
+            'Content-Disposition': b'''attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3;
+            filename="should ignore.mp4"''',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')


 if __name__ == '__main__':
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -66,9 +66,10 @@ from .compat import (
    compat_urllib_HTTPError,
    compat_urllib_parse,
    compat_urllib_parse_parse_qs as compat_parse_qs,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
-    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_xpath,
 )
@@ -4755,9 +4756,22 @@ def urlhandle_detect_ext(url_handle):

    cd = getheader('Content-Disposition')
    if cd:
-        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
+        m = re.match(r'''(?xi)
+            attachment;\s*
+            (?:filename\s*=[^;]+?;\s*)?                    # possible initial filename=...;, ignored
+            filename(?P<x>\*)?\s*=\s*                      # filename/filename* =
+                (?(x)(?P<charset>\S+?)'[\w-]*'|(?P<q>")?)  # if * then charset'...' else maybe "
+                (?P<filename>(?(q)[^"]+(?=")|[^\s;]+))         # actual name of file
+            ''', cd)
        if m:
-            e = determine_ext(m.group('filename'), default_ext=None)
+            m = m.groupdict()
+            filename = m.get('filename')
+            if m.get('x'):
+                try:
+                    filename = compat_urllib_parse_unquote(filename, encoding=m.get('charset', 'utf-8'))
+                except LookupError:  # unrecognised character set name
+                    pass
+            e = determine_ext(filename, default_ext=None)
            if e:
                return e