[utils] Detect extension from any RFC-compliant Content-Disposition filename syntax

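Add support for the unquoted token syntax (`filename=name.ext`) and the RFC 5987 extended parameter syntax (`filename*=charset'language'value`), in addition to the quoted-string syntax that was already handled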
pull/29845/head
df 3 years ago committed by dirkf
parent 7990d1e630
commit 973f76cf7b

@ -2069,6 +2069,33 @@ Line 1
'Content-Type': b'audio/mp3',
})
self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
# header with Content-Disposition and unquoted filename
urlh = UrlHandle({
'Content-Disposition': b'attachment; filename=unquoted_filename_token.mp3',
})
self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
# header with Content-Disposition including spacing and uppercase
urlh = UrlHandle({
'Content-Disposition': b'ATTACHMENT; FileName = unquoted_filename_token.mp3',
})
self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
# header with Content-Disposition and extended filename parameter syntax
urlh = UrlHandle({
'Content-Disposition': b"attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3",
})
self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
# header with Content-Disposition and both filename parameter syntaxes
urlh = UrlHandle({
'Content-Disposition': b'''attachment; filename="should ignore.mp4";
FileName* = iso8859-15''costs%201%A4%20filename.mp3''',
})
self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
# header with Content-Disposition and 'wrong' order of both syntaxes
urlh = UrlHandle({
'Content-Disposition': b'''attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3;
filename="should ignore.mp4"''',
})
self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
if __name__ == '__main__':

@ -66,9 +66,10 @@ from .compat import (
compat_urllib_HTTPError,
compat_urllib_parse,
compat_urllib_parse_parse_qs as compat_parse_qs,
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
compat_urllib_parse_unquote_plus,
compat_urllib_request,
compat_xpath,
)
@ -4755,9 +4756,22 @@ def urlhandle_detect_ext(url_handle):
cd = getheader('Content-Disposition')
if cd:
m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
m = re.match(r'''(?xi)
attachment;\s*
(?:filename\s*=[^;]+?;\s*)? # possible initial filename=...;, ignored
filename(?P<x>\*)?\s*=\s* # filename/filename* =
(?(x)(?P<charset>\S+?)'[\w-]*'|(?P<q>")?) # if * then charset'...' else maybe "
(?P<filename>(?(q)[^"]+(?=")|[^\s;]+)) # actual name of file
''', cd)
if m:
e = determine_ext(m.group('filename'), default_ext=None)
m = m.groupdict()
filename = m.get('filename')
if m.get('x'):
try:
filename = compat_urllib_parse_unquote(filename, encoding=m.get('charset', 'utf-8'))
except LookupError: # unrecognised character set name
pass
e = determine_ext(filename, default_ext=None)
if e:
return e

Loading…
Cancel
Save