about summary refs log tree commit diff
diff options
context:
space:
mode:
author: df <fieldhouse@gmx.net> 2021-10-10 12:42:51 +0100
committer: df <fieldhouse@gmx.net> 2021-10-10 14:05:50 +0100
commit: 74b53109ff5923df82498ee967c8c263c18f9721 (patch)
tree: 8176aceaae157934a344c8d61961f2ac471b0182
parent: f798b40cf332c0a00f167fcfc9d560fa8f795c13 (diff)
download: youtube-dl-74b53109ff5923df82498ee967c8c263c18f9721.tar.gz
youtube-dl-74b53109ff5923df82498ee967c8c263c18f9721.tar.xz
youtube-dl-74b53109ff5923df82498ee967c8c263c18f9721.zip
Detect extension from any RFC Content-Disposition syntax
Add support for unquoted token and RFC 5987 extended parameter syntax
-rw-r--r-- test/test_utils.py 27
-rw-r--r-- youtube_dl/utils.py 20
2 files changed, 44 insertions, 3 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index 14607f6b8..50fb5f101 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1505,6 +1505,33 @@ Line 1
             'Content-Type': b'audio/mp3',
         })
         self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition and unquoted filename
+        urlh = UrlHandle({
+            'Content-Disposition': b'attachment; filename=unquoted_filename_token.mp3',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition including spacing and uppercase
+        urlh = UrlHandle({
+            'Content-Disposition': b'ATTACHMENT; FileName = unquoted_filename_token.mp3',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition and extended filename parameter syntax
+        urlh = UrlHandle({
+            'Content-Disposition': b"attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3",
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition and both filename parameter syntaxes
+        urlh = UrlHandle({
+            'Content-Disposition': b'''attachment; filename="should ignore.mp4";
+             FileName* = iso8859-15''costs%201%A4%20filename.mp3''',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition and 'wrong' order of both syntaxes
+        urlh = UrlHandle({
+            'Content-Disposition': b'''attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3;
+            filename="should ignore.mp4"''',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
 
 
 if __name__ == '__main__':
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 90eb9f93c..02631406c 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -58,9 +58,10 @@ from .compat import (
     compat_struct_unpack,
     compat_urllib_error,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
     compat_urllib_parse_urlencode,
     compat_urllib_parse_urlparse,
-    compat_urllib_parse_unquote_plus,
     compat_urllib_request,
     compat_urlparse,
     compat_xpath,
@@ -4309,9 +4310,22 @@ def urlhandle_detect_ext(url_handle):
 
     cd = encode_compat_str_or_none(getheader('Content-Disposition'))
     if cd:
-        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
+        m = re.match(r'''(?xi)
+            attachment;\s*
+            (?:filename\s*=[^;]+?;\s*)?                    # possible initial filename=...;, ignored
+            filename(?P<x>\*)?\s*=\s*                      # filename/filename* =
+                (?(x)(?P<charset>\S+?)'[\w-]*'|(?P<q>")?)  # if * then charset'...' else maybe "
+                (?P<filename>(?(q)[^"]+(?=")|[^\s;]+))         # actual name of file
+            ''', cd)
         if m:
-            e = determine_ext(m.group('filename'), default_ext=None)
+            m = m.groupdict()
+            filename = m.get('filename')
+            if m.get('x'):
+                try:
+                    filename = compat_urllib_parse_unquote(filename, encoding=m.get('charset', 'utf-8'))
+                except LookupError:  # unrecognised character set name
+                    pass
+            e = determine_ext(filename, default_ext=None)
             if e:
                 return e