Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve handling of prettified responses without correct content-type encoding #1110

Merged
merged 16 commits into from
Sep 29, 2021
Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ This project adheres to [Semantic Versioning](https://semver.org/).
- Added support for formatting & coloring of JSON bodies preceded by non-JSON data (e.g., an XSSI prefix). ([#1130](https://github.com/httpie/httpie/issues/1130))
- Added `--format-options=response.as:CONTENT_TYPE` to allow overriding the response `Content-Type`. ([#1134](https://github.com/httpie/httpie/issues/1134))
- Added `--response-as` shortcut for setting the response `Content-Type`-related `--format-options`. ([#1134](https://github.com/httpie/httpie/issues/1134))
- Improved handling of prettified responses without correct `Content-Type` encoding. ([#1110](https://github.com/httpie/httpie/issues/1110))
- Installed plugins are now listed in `--debug` output. ([#1165](https://github.com/httpie/httpie/issues/1165))
- Fixed preservation of duplicate keys in JSON data. ([#1163](https://github.com/httpie/httpie/issues/1163))

Expand Down
12 changes: 12 additions & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1249,6 +1249,18 @@ For example, the following request will force the response to be treated as XML:
$ http --response-as=application/xml pie.dev/get
```

And the following requests will force the response to use the [big5](https://docs.python.org/3/library/codecs.html#standard-encodings) encoding:

```bash
$ http --response-as='charset=big5' pie.dev/get
```

```bash
$ http --response-as='text/plain; charset=big5' pie.dev/get
```

If the server does not send the encoding, HTTPie will auto-detect it.

### Binary data

Binary data is suppressed for terminal output, which makes it safe to perform requests to URLs that send back binary data.
Expand Down
2 changes: 2 additions & 0 deletions httpie/cli/definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,8 @@
Override the response Content-Type for formatting purposes, e.g.:

--response-as=application/xml
--response-as=charset=utf-8
--response-as='application/xml; charset=utf-8'

It is a shortcut for:

Expand Down
37 changes: 37 additions & 0 deletions httpie/codec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from typing import Union

from charset_normalizer import from_bytes

from .constants import UTF8

Bytes = Union[bytearray, bytes]


def detect_encoding(content: Bytes) -> str:
    """Guess the character encoding of `content`.

    The guess is delegated to `charset_normalizer`; UTF-8 is returned
    when it yields no best match.

    """
    best_guess = from_bytes(bytes(content)).best()
    if best_guess:
        return best_guess.encoding
    return UTF8


def decode(content: Bytes, encoding: str) -> str:
    """Turn `content` into text using `encoding`.

    A falsy `encoding` (e.g., an empty string) triggers a best-effort
    guess of the encoding based on `content` itself.

    Bytes that cannot be decoded are replaced instead of raising.

    """
    charset = encoding or detect_encoding(content)
    return content.decode(charset, errors='replace')


def encode(content: str, encoding: str) -> bytes:
    """Serialize the text `content` to bytes with `encoding`.

    Characters that `encoding` cannot represent are replaced
    instead of raising.

    """
    encoded = content.encode(encoding, errors='replace')
    return encoded
11 changes: 0 additions & 11 deletions httpie/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,6 @@ def headers(self) -> str:
def encoding(self) -> Optional[str]:
"""Return a `str` with the message's encoding, if known."""

@property
def body(self) -> bytes:
"""Return a `bytes` with the message's body."""
raise NotImplementedError()

@property
def content_type(self) -> str:
"""Return the message content type."""
Expand Down Expand Up @@ -86,12 +81,6 @@ def headers(self):
def encoding(self):
return self._orig.encoding or UTF8

@property
def body(self):
# Only now the response body is fetched.
# Shouldn't be touched unless the body is actually needed.
return self._orig.content


class HTTPRequest(HTTPMessage):
"""A :class:`requests.models.Request` wrapper."""
Expand Down
2 changes: 1 addition & 1 deletion httpie/output/formatters/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def pretty_xml(document: 'Document',
}
if standalone is not None and sys.version_info >= (3, 9):
kwargs['standalone'] = standalone
body = document.toprettyxml(**kwargs).decode()
body = document.toprettyxml(**kwargs).decode(kwargs['encoding'])

# Remove blank lines automatically added by `toprettyxml()`.
return '\n'.join(line for line in body.splitlines() if line.strip())
Expand Down
35 changes: 22 additions & 13 deletions httpie/output/streams.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from abc import ABCMeta, abstractmethod
from itertools import chain
from typing import Callable, Iterable, Union
from typing import Any, Callable, Dict, Iterable, Tuple, Union

from .. import codec
from ..cli.constants import EMPTY_FORMAT_OPTION
from ..context import Environment
from ..constants import UTF8
Expand Down Expand Up @@ -114,8 +115,8 @@ def iter_body(self) -> Iterable[bytes]:
for line, lf in self.msg.iter_lines(self.CHUNK_SIZE):
if b'\0' in line:
raise BinarySuppressedError()
yield line.decode(self.msg.encoding) \
.encode(self.output_encoding, 'replace') + lf
line = codec.decode(line, self.msg.encoding)
yield codec.encode(line, self.output_encoding) + lf


class PrettyStream(EncodedStream):
Expand All @@ -137,15 +138,23 @@ def __init__(
super().__init__(**kwargs)
self.formatting = formatting
self.conversion = conversion
self.mime = self.get_mime()
self.mime, mime_options = self._get_mime_and_options()
self.encoding = mime_options.get('charset') or ''

def get_mime(self) -> str:
mime = parse_header_content_type(self.msg.content_type)[0]
if isinstance(self.msg, HTTPResponse):
forced_content_type = self.formatting.options['response']['as']
if forced_content_type != EMPTY_FORMAT_OPTION:
mime = parse_header_content_type(forced_content_type)[0] or mime
return mime
def _get_mime_and_options(self) -> Tuple[str, Dict[str, Any]]:
    """Return the MIME type and `Content-Type` options used for formatting.

    The values parsed from the message's own `Content-Type` header are
    the starting point. For an `HTTPResponse`, a non-empty `response.as`
    format option (set via `--response-as`) is parsed the same way, and
    each part it provides replaces the header's one; parts it leaves
    empty fall back to the header's values.

    """
    mime, options = parse_header_content_type(self.msg.content_type)

    if isinstance(self.msg, HTTPResponse):
        override = self.formatting.options['response']['as']
        if override != EMPTY_FORMAT_OPTION:
            forced_mime, forced_options = parse_header_content_type(override)
            mime = forced_mime or mime
            options = forced_options or options

    return mime, options

def get_headers(self) -> bytes:
return self.formatting.format_headers(
Expand Down Expand Up @@ -176,9 +185,9 @@ def process_body(self, chunk: Union[str, bytes]) -> bytes:
if not isinstance(chunk, str):
# Text when a converter has been used,
# otherwise it will always be bytes.
chunk = chunk.decode(self.msg.encoding, 'replace')
chunk = codec.decode(chunk, self.encoding)
chunk = self.formatting.format_body(content=chunk, mime=self.mime)
return chunk.encode(self.output_encoding, 'replace')
return codec.encode(chunk, self.output_encoding)


class BufferedPrettyStream(PrettyStream):
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
'wheel',
]
install_requires = [
'charset_normalizer>=2.0.0',
'defusedxml>=0.6.0',
'requests[socks]>=2.22.0',
'Pygments>=2.5.2',
Expand Down
7 changes: 7 additions & 0 deletions tests/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,10 @@ def test_max_headers_limit(httpbin_both):

def test_max_headers_no_limit(httpbin_both):
assert HTTP_OK in http('--max-headers=0', httpbin_both + '/get')


def test_charset_argument_unknown_encoding(httpbin_both):
    """An unknown `charset` passed via `--response-as` raises `LookupError`."""
    url = httpbin_both + '/get'
    with raises(LookupError) as exc_info:
        http('--response-as', 'charset=foobar', 'GET', url)
    assert 'unknown encoding: foobar' in str(exc_info.value)
102 changes: 101 additions & 1 deletion tests/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,17 @@
Various unicode handling related tests.

"""
from .utils import http, HTTP_OK
import pytest
import responses

from httpie.cli.constants import PRETTY_MAP
from httpie.constants import UTF8

from .utils import http, HTTP_OK, URL_EXAMPLE
from .fixtures import UNICODE

ENCODINGS = [UTF8, 'windows-1250']


def test_unicode_headers(httpbin):
# httpbin doesn't interpret UTF-8 headers
Expand Down Expand Up @@ -109,3 +117,95 @@ def test_unicode_digest_auth(httpbin):
http('--auth-type=digest',
'--auth', f'test:{UNICODE}',
f'{httpbin.url}/digest-auth/auth/test/{UNICODE}')


@pytest.mark.parametrize('encoding', ENCODINGS)
@responses.activate
def test_GET_encoding_detection_from_content_type_header(encoding):
    """A body served with a matching `charset` header is shown decoded."""
    document = '<?xml version="1.0"?>\n<c>Financiën</c>'
    responses.add(
        responses.GET,
        URL_EXAMPLE,
        body=document.encode(encoding),
        content_type=f'text/xml; charset={encoding.upper()}',
    )
    output = http('GET', URL_EXAMPLE)
    assert 'Financiën' in output


@pytest.mark.parametrize('encoding', ENCODINGS)
@responses.activate
def test_GET_encoding_detection_from_content(encoding):
    """A body served without a `charset` in the header is still shown decoded."""
    document = f'<?xml version="1.0" encoding="{encoding.upper()}"?>\n<c>Financiën</c>'
    responses.add(
        responses.GET,
        URL_EXAMPLE,
        body=document.encode(encoding),
        content_type='text/xml',
    )
    output = http('GET', URL_EXAMPLE)
    assert 'Financiën' in output


@responses.activate
def test_GET_encoding_provided_by_format_options():
    """A charset given through `--format-options response.as:` decodes the body."""
    payload = '▒▒▒'.encode('johab')
    responses.add(
        responses.GET,
        URL_EXAMPLE,
        body=payload,
        content_type='text/plain',
    )
    output = http(
        '--format-options', 'response.as:text/plain; charset=johab',
        'GET', URL_EXAMPLE,
    )
    assert '▒▒▒' in output


@responses.activate
def test_GET_encoding_provided_by_shortcut_option():
    """A charset given through the `--response-as` shortcut decodes the body."""
    payload = '▒▒▒'.encode('johab')
    responses.add(
        responses.GET,
        URL_EXAMPLE,
        body=payload,
        content_type='text/plain',
    )
    output = http(
        '--response-as', 'text/plain; charset=johab',
        'GET', URL_EXAMPLE,
    )
    assert '▒▒▒' in output


@pytest.mark.parametrize('encoding', ENCODINGS)
@responses.activate
def test_GET_encoding_provided_by_empty_shortcut_option_should_use_content_detection(encoding):
    """An empty `--response-as` value still yields a correctly decoded body."""
    document = f'<?xml version="1.0" encoding="{encoding.upper()}"?>\n<c>Financiën</c>'
    responses.add(
        responses.GET,
        URL_EXAMPLE,
        body=document.encode(encoding),
        content_type='text/xml',
    )
    output = http('--response-as', '', 'GET', URL_EXAMPLE)
    assert 'Financiën' in output


@pytest.mark.parametrize('encoding', ENCODINGS)
@responses.activate
def test_POST_encoding_detection_from_content_type_header(encoding):
    """A POST response with a matching `charset` header is shown decoded."""
    sentence = 'Všichni lidé jsou si rovni.'
    responses.add(
        responses.POST,
        URL_EXAMPLE,
        body=sentence.encode(encoding),
        content_type=f'text/plain; charset={encoding.upper()}',
    )
    output = http('--form', 'POST', URL_EXAMPLE)
    assert 'Všichni lidé jsou si rovni.' in output


@pytest.mark.parametrize('encoding', ENCODINGS)
@responses.activate
def test_POST_encoding_detection_from_content(encoding):
    """A POST response without a `charset` in the header is still shown decoded."""
    sentence = 'Všichni lidé jsou si rovni.'
    responses.add(
        responses.POST,
        URL_EXAMPLE,
        body=sentence.encode(encoding),
        content_type='text/plain',
    )
    output = http('--form', 'POST', URL_EXAMPLE)
    assert 'Všichni lidé jsou si rovni.' in output


@pytest.mark.parametrize('encoding', ENCODINGS)
@pytest.mark.parametrize('pretty', PRETTY_MAP.keys())
@responses.activate
def test_stream_encoding_detection_from_content_type_header(encoding, pretty):
    """With `--stream`, a body with a matching `charset` header is shown
    decoded for every `--pretty` mode.
    """
    document = '<?xml version="1.0"?>\n<c>Financiën</c>'
    responses.add(
        responses.GET,
        URL_EXAMPLE,
        body=document.encode(encoding),
        stream=True,
        content_type=f'text/xml; charset={encoding.upper()}',
    )
    pretty_flag = '--pretty=' + pretty
    output = http(pretty_flag, '--stream', 'GET', URL_EXAMPLE)
    assert 'Financiën' in output