-
Notifications
You must be signed in to change notification settings - Fork 3k
Core decompress body #18581
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Core decompress body #18581
Changes from 12 commits
fca7114
2eb475a
e701b08
eaaf0d9
1ef0f93
7fe6f99
c85194e
2f76efd
74ef986
4ea0e1d
e3362bf
1dcebea
6e20e09
2785d0d
5cb8420
3c687ff
c9cb8ec
c9eed8d
940c2bc
db86c02
b77ce5f
64ffe34
8d29899
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,9 +25,14 @@ | |
# -------------------------------------------------------------------------- | ||
from typing import Any, Optional, AsyncIterator as AsyncIteratorType | ||
from collections.abc import AsyncIterator | ||
try: | ||
import cchardet as chardet | ||
except ImportError: # pragma: no cover | ||
import chardet # type: ignore | ||
|
||
import logging | ||
import asyncio | ||
import codecs | ||
import aiohttp | ||
from multidict import CIMultiDict | ||
from requests.exceptions import StreamConsumedError | ||
|
@@ -51,7 +56,7 @@ class AioHttpTransport(AsyncHttpTransport): | |
|
||
Fully asynchronous implementation using the aiohttp library. | ||
|
||
:param session: The client session. | ||
:param aiohttp.ClientSession session: The client session. | ||
:param loop: The event loop. | ||
:param bool session_owner: Session owner. Defaults True. | ||
|
||
|
@@ -69,7 +74,7 @@ class AioHttpTransport(AsyncHttpTransport): | |
def __init__(self, *, session=None, loop=None, session_owner=True, **kwargs): | ||
self._loop = loop | ||
self._session_owner = session_owner | ||
self.session = session | ||
self.session = session # type: aiohttp.ClientSession | ||
self.connection_config = ConnectionConfiguration(**kwargs) | ||
self._use_env_settings = kwargs.pop('use_env_settings', True) | ||
|
||
|
@@ -145,6 +150,7 @@ async def send(self, request: HttpRequest, **config: Any) -> Optional[AsyncHttpR | |
:keyword str proxy: will define the proxy to use all the time | ||
""" | ||
await self.open() | ||
auto_decompress = self.session.auto_decompress | ||
|
||
proxies = config.pop('proxies', None) | ||
if proxies and 'proxy' not in config: | ||
|
@@ -180,7 +186,9 @@ async def send(self, request: HttpRequest, **config: Any) -> Optional[AsyncHttpR | |
allow_redirects=False, | ||
**config | ||
) | ||
response = AioHttpTransportResponse(request, result, self.connection_config.data_block_size) | ||
response = AioHttpTransportResponse(request, result, | ||
self.connection_config.data_block_size, | ||
decompress=not auto_decompress) | ||
if not stream_response: | ||
await response.load_body() | ||
except aiohttp.client_exceptions.ClientResponseError as err: | ||
|
@@ -250,21 +258,40 @@ class AioHttpTransportResponse(AsyncHttpResponse): | |
:type aiohttp_response: aiohttp.ClientResponse object | ||
:param block_size: block size of data sent over connection. | ||
:type block_size: int | ||
:keyword bool decompress: If True which is default, will attempt to decode the body based | ||
on the ‘content-encoding’ header. | ||
""" | ||
def __init__(self, request: HttpRequest, aiohttp_response: aiohttp.ClientResponse, block_size=None) -> None: | ||
def __init__(self, request: HttpRequest, | ||
aiohttp_response: aiohttp.ClientResponse, | ||
block_size=None, **kwargs) -> None: | ||
super(AioHttpTransportResponse, self).__init__(request, aiohttp_response, block_size=block_size) | ||
# https://aiohttp.readthedocs.io/en/stable/client_reference.html#aiohttp.ClientResponse | ||
self.status_code = aiohttp_response.status | ||
self.headers = CIMultiDict(aiohttp_response.headers) | ||
self.reason = aiohttp_response.reason | ||
self.content_type = aiohttp_response.headers.get('content-type') | ||
self._body = None | ||
self._decompress = kwargs.pop("decompress", True) | ||
if len(kwargs) > 0: | ||
raise TypeError("Got an unexpected keyword argument: {}".format(list(kwargs.keys())[0])) | ||
|
||
def body(self) -> bytes: | ||
"""Return the whole body as bytes in memory. | ||
""" | ||
if self._body is None: | ||
raise ValueError("Body is not available. Call async method load_body, or do your call with stream=False.") | ||
if not self._decompress: | ||
return self._body | ||
enc = self.headers.get('Content-Encoding') | ||
if not enc: | ||
return self._body | ||
enc = enc.lower() | ||
if enc in ("gzip", "deflate"): | ||
import zlib | ||
zlib_mode = 16 + zlib.MAX_WBITS if enc == "gzip" else zlib.MAX_WBITS | ||
decompressor = zlib.decompressobj(wbits=zlib_mode) | ||
body = decompressor.decompress(self._body) | ||
return body | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I do have some concerns about us not caching the decompressed body. Because we only need it once, right? Do we have any other access to There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't expect (at least I did not see) users to need to get the body twice. If you want, we can update the code like:
But to be honest, I don't see much value in this. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. They don't need to get the body more than once. And it would not be clear to me that getting the body and then the text will decompress the body twice. I don't think we need to keep the compressed data around once it has been decompressed, right? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sounds fair. Updated. :) |
||
return self._body | ||
|
||
def text(self, encoding: Optional[str] = None) -> str: | ||
|
@@ -274,10 +301,36 @@ def text(self, encoding: Optional[str] = None) -> str: | |
|
||
:param str encoding: The encoding to apply. | ||
""" | ||
# super().text detects charset based on self._body() which is compressed | ||
# implement the decoding explicitly here | ||
body = self.body() | ||
|
||
ctype = self.headers.get(aiohttp.hdrs.CONTENT_TYPE, "").lower() | ||
mimetype = aiohttp.helpers.parse_mimetype(ctype) | ||
|
||
encoding = mimetype.parameters.get("charset") | ||
if encoding: | ||
try: | ||
codecs.lookup(encoding) | ||
except LookupError: | ||
encoding = None | ||
if not encoding: | ||
if mimetype.type == "application" and ( | ||
mimetype.subtype == "json" or mimetype.subtype == "rdap" | ||
): | ||
# RFC 7159 states that the default encoding is UTF-8. | ||
# RFC 7483 defines application/rdap+json | ||
encoding = "utf-8" | ||
elif body is None: | ||
raise RuntimeError( | ||
"Cannot guess the encoding of a not yet read body" | ||
) | ||
else: | ||
encoding = chardet.detect(body)["encoding"] | ||
annatisch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if not encoding: | ||
annatisch marked this conversation as resolved.
Show resolved
Hide resolved
|
||
encoding = self.internal_response.get_encoding() | ||
encoding = "utf-8-sig" | ||
|
||
return super().text(encoding) | ||
return body.decode(encoding) | ||
|
||
async def load_body(self) -> None: | ||
"""Load in memory the body, so it could be accessible from sync methods.""" | ||
|
Uh oh!
There was an error while loading. Please reload this page.