From e7926ae9f4e5fa258696551a39295402819280c9 Mon Sep 17 00:00:00 2001
From: dirkf
Date: Fri, 28 Jul 2023 06:03:14 +0100
Subject: [utils] Rework decoding of `Content-Encoding`s

* support nested encodings
* support optional `br` encoding, if brotli package is installed
* support optional 'compress' encoding, if ncompress package is installed
* response `Content-Encoding` has only unprocessed encodings, or removed
* response `Content-Length` is decoded length (usable for filesize metadata)
* use zlib for both deflate and gzip decompression
* some elements taken from yt-dlp: thx especially coletdjnz
---
 youtube_dl/utils.py | 120 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 90 insertions(+), 30 deletions(-)

(limited to 'youtube_dl/utils.py')

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index b5475434f..e73291107 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -15,7 +15,6 @@ import email.utils
 import email.header
 import errno
 import functools
-import gzip
 import inspect
 import io
 import itertools
@@ -42,6 +41,7 @@ from .compat import (
     compat_HTMLParseError,
     compat_HTMLParser,
     compat_basestring,
+    compat_brotli as brotli,
     compat_casefold,
     compat_chr,
     compat_collections_abc,
@@ -55,6 +55,7 @@ from .compat import (
     compat_http_client,
     compat_integer_types,
     compat_kwargs,
+    compat_ncompress as ncompress,
     compat_os_name,
     compat_re_Match,
     compat_re_Pattern,
@@ -2638,11 +2639,44 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
             req)
 
     @staticmethod
-    def deflate(data):
+    def deflate_gz(data):
         try:
-            return zlib.decompress(data, -zlib.MAX_WBITS)
+            # format:zlib,gzip + windowsize:32768
+            return data and zlib.decompress(data, 32 + zlib.MAX_WBITS)
         except zlib.error:
-            return zlib.decompress(data)
+            # raw zlib * windowsize:32768 (RFC 9110: "non-conformant")
+            return zlib.decompress(data, -zlib.MAX_WBITS)
+
+    @staticmethod
+    def gzip(data):
+
+        from gzip import GzipFile
+
+        def _gzip(data):
+            with io.BytesIO(data) as data_buf:
+                gz = GzipFile(fileobj=data_buf, mode='rb')
+                return gz.read()
+
+        try:
+            return _gzip(data)
+        except IOError as original_ioerror:
+            # There may be junk at the end of the file
+            # See http://stackoverflow.com/q/4928560/35070 for details
+            for i in range(1, 1024):
+                try:
+                    return _gzip(data[:-i])
+                except IOError:
+                    continue
+            else:
+                raise original_ioerror
+
+    @staticmethod
+    def brotli(data):
+        return data and brotli.decompress(data)
+
+    @staticmethod
+    def compress(data):
+        return data and ncompress.decompress(data)
 
     def http_request(self, req):
         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
@@ -2679,33 +2713,59 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 
     def http_response(self, req, resp):
         old_resp = resp
-        # gzip
-        if resp.headers.get('Content-encoding', '') == 'gzip':
-            content = resp.read()
-            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
-            try:
-                uncompressed = io.BytesIO(gz.read())
-            except IOError as original_ioerror:
-                # There may be junk at the end of the file
-                # See http://stackoverflow.com/q/4928560/35070 for details
-                for i in range(1, 1024):
-                    try:
-                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
-                        uncompressed = io.BytesIO(gz.read())
-                    except IOError:
-                        continue
-                    break
-                else:
-                    raise original_ioerror
-            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
-            resp.msg = old_resp.msg
-            del resp.headers['Content-encoding']
-        # deflate
-        if resp.headers.get('Content-encoding', '') == 'deflate':
-            gz = io.BytesIO(self.deflate(resp.read()))
-            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
+
+        # Content-Encoding header lists the encodings in order that they were applied [1].
+        # To decompress, we simply do the reverse.
+        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
+        decoded_response = None
+        decoders = {
+            'gzip': self.deflate_gz,
+            'deflate': self.deflate_gz,
+        }
+        if brotli:
+            decoders['br'] = self.brotli
+        if ncompress:
+            decoders['compress'] = self.compress
+        if sys.platform.startswith('java'):
+            # Jython zlib implementation misses gzip
+            decoders['gzip'] = self.gzip
+
+        def encodings(hdrs):
+            # A header field that allows multiple values can have multiple instances [2].
+            # [2]: https://datatracker.ietf.org/doc/html/rfc9110#name-fields
+            for e in reversed(','.join(hdrs).split(',')):
+                if e:
+                    yield e.strip()
+
+        encodings_left = []
+        try:
+            resp.headers.get_all
+            hdrs = resp.headers
+        except AttributeError:
+            # Py2 has no get_all() method: headers are rfc822.Message
+            from email.message import Message
+            hdrs = Message()
+            for k, v in resp.headers.items():
+                hdrs[k] = v
+
+        decoder, decoded_response = True, None
+        for encoding in encodings(hdrs.get_all('Content-Encoding', [])):
+            # "SHOULD consider" x-compress, x-gzip as compress, gzip
+            decoder = decoder and decoders.get(remove_start(encoding, 'x-'))
+            if not decoder:
+                encodings_left.insert(0, encoding)
+                continue
+            decoded_response = decoder(decoded_response or resp.read())
+        if decoded_response is not None:
+            resp = compat_urllib_request.addinfourl(
+                io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
-            del resp.headers['Content-encoding']
+            del resp.headers['Content-Length']
+            resp.headers['Content-Length'] = '%d' % len(decoded_response)
+        del resp.headers['Content-Encoding']
+        if encodings_left:
+            resp.headers['Content-Encoding'] = ', '.join(encodings_left)
+
         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
         # https://github.com/ytdl-org/youtube-dl/issues/6457).
         if 300 <= resp.code < 400:
-- 
cgit 1.4.1