httpie · jkbrzt · Sep 29, 2021 · Jul 29, 2021 · Sep 28, 2021 · Sep 28, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ This project adheres to [Semantic Versioning](https://semver.org/).
 - Added support for formatting & coloring of JSON bodies preceded by non-JSON data (e.g., an XXSI prefix). ([#1130](https://github.com/httpie/httpie/issues/1130))
 - Added `--format-options=response.as:CONTENT_TYPE` to allow overriding the response `Content-Type`. ([#1134](https://github.com/httpie/httpie/issues/1134))
 - Added `--response-as` shortcut for setting the response `Content-Type`-related `--format-options`. ([#1134](https://github.com/httpie/httpie/issues/1134))
+- Improved handling of prettified responses without correct `Content-Type` encoding. ([#1110](https://github.com/httpie/httpie/issues/1110))
 - Installed plugins are now listed in `--debug` output. ([#1165](https://github.com/httpie/httpie/issues/1165))
 - Fixed duplicate keys preservation of JSON data. ([#1163](https://github.com/httpie/httpie/issues/1163))
 

diff --git a/docs/README.md b/docs/README.md
@@ -1249,6 +1249,18 @@ For example, the following request will force the response to be treated as XML:
 $ http --response-as=application/xml pie.dev/get
 ```
 
+And the following requests will force the response to use the [big5](https://docs.python.org/3/library/codecs.html#standard-encodings) encoding:
+
+```bash
+$ http --response-as='charset=big5' pie.dev/get
+```
+
+```bash
+$ http --response-as='text/plain; charset=big5' pie.dev/get
+```
+
+If the encoding is not sent by the server, HTTPie will auto-detect it.
+
 ### Binary data
 
 Binary data is suppressed for terminal output, which makes it safe to perform requests to URLs that send back binary data.

diff --git a/httpie/cli/definition.py b/httpie/cli/definition.py
@@ -316,6 +316,8 @@
     Override the response Content-Type for formatting purposes, e.g.:
 
         --response-as=application/xml
+        --response-as=charset=utf-8
+        --response-as='application/xml; charset=utf-8'
 
     It is a shortcut for:
 

diff --git a/httpie/codec.py b/httpie/codec.py
@@ -0,0 +1,37 @@
+from typing import Union
+
+from charset_normalizer import from_bytes
+
+from .constants import UTF8
+
+Bytes = Union[bytearray, bytes]
+
+
+def detect_encoding(content: Bytes) -> str:
+    """Detect the `content` encoding.
+    Fall back to UTF-8 when no suitable encoding is found.
+
+    """
+    match = from_bytes(bytes(content)).best()
+    return match.encoding if match else UTF8
+
+
+def decode(content: Bytes, encoding: str) -> str:
+    """Decode `content` using the given `encoding`.
+    If no `encoding` is provided, it is detected from `content` (best effort).
+
+    Unicode errors are replaced.
+
+    """
+    if not encoding:
+        encoding = detect_encoding(content)
+    return content.decode(encoding, 'replace')
+
+
+def encode(content: str, encoding: str) -> bytes:
+    """Encode `content` using the given `encoding`.
+
+    Unicode errors are replaced.
+
+    """
+    return content.encode(encoding, 'replace')
diff --git a/httpie/models.py b/httpie/models.py
@@ -30,11 +30,6 @@ def headers(self) -> str:
     def encoding(self) -> Optional[str]:
         """Return a `str` with the message's encoding, if known."""
 
-    @property
-    def body(self) -> bytes:
-        """Return a `bytes` with the message's body."""
-        raise NotImplementedError()
-
     @property
     def content_type(self) -> str:
         """Return the message content type."""
@@ -86,12 +81,6 @@ def headers(self):
     def encoding(self):
         return self._orig.encoding or UTF8
 
-    @property
-    def body(self):
-        # Only now the response body is fetched.
-        # Shouldn't be touched unless the body is actually needed.
-        return self._orig.content
-
 
 class HTTPRequest(HTTPMessage):
     """A :class:`requests.models.Request` wrapper."""

diff --git a/httpie/output/formatters/xml.py b/httpie/output/formatters/xml.py
@@ -25,7 +25,7 @@ def pretty_xml(document: 'Document',
     }
     if standalone is not None and sys.version_info >= (3, 9):
         kwargs['standalone'] = standalone
-    body = document.toprettyxml(**kwargs).decode()
+    body = document.toprettyxml(**kwargs).decode(kwargs['encoding'])
 
     # Remove blank lines automatically added by `toprettyxml()`.
     return '\n'.join(line for line in body.splitlines() if line.strip())

diff --git a/httpie/output/streams.py b/httpie/output/streams.py
@@ -1,7 +1,8 @@
 from abc import ABCMeta, abstractmethod
 from itertools import chain
-from typing import Callable, Iterable, Union
+from typing import Any, Callable, Dict, Iterable, Tuple, Union
 
+from .. import codec
 from ..cli.constants import EMPTY_FORMAT_OPTION
 from ..context import Environment
 from ..constants import UTF8
@@ -114,8 +115,8 @@ def iter_body(self) -> Iterable[bytes]:
         for line, lf in self.msg.iter_lines(self.CHUNK_SIZE):
             if b'\0' in line:
                 raise BinarySuppressedError()
-            yield line.decode(self.msg.encoding) \
-                      .encode(self.output_encoding, 'replace') + lf
+            line = codec.decode(line, self.msg.encoding)
+            yield codec.encode(line, self.output_encoding) + lf
 
 
 class PrettyStream(EncodedStream):
@@ -137,15 +138,23 @@ def __init__(
         super().__init__(**kwargs)
         self.formatting = formatting
         self.conversion = conversion
-        self.mime = self.get_mime()
+        self.mime, mime_options = self._get_mime_and_options()
+        self.encoding = mime_options.get('charset') or ''
 
-    def get_mime(self) -> str:
-        mime = parse_header_content_type(self.msg.content_type)[0]
-        if isinstance(self.msg, HTTPResponse):
-            forced_content_type = self.formatting.options['response']['as']
-            if forced_content_type != EMPTY_FORMAT_OPTION:
-                mime = parse_header_content_type(forced_content_type)[0] or mime
-        return mime
+    def _get_mime_and_options(self) -> Tuple[str, Dict[str, Any]]:
+        # Defaults from the `Content-Type` header.
+        mime, options = parse_header_content_type(self.msg.content_type)
+
+        if not isinstance(self.msg, HTTPResponse):
+            return mime, options
+
+        # Override from the `--response-as` option.
+        forced_content_type = self.formatting.options['response']['as']
+        if forced_content_type == EMPTY_FORMAT_OPTION:
+            return mime, options
+
+        forced_mime, forced_options = parse_header_content_type(forced_content_type)
+        return (forced_mime or mime, forced_options or options)
 
     def get_headers(self) -> bytes:
         return self.formatting.format_headers(
@@ -176,9 +185,9 @@ def process_body(self, chunk: Union[str, bytes]) -> bytes:
         if not isinstance(chunk, str):
             # Text when a converter has been used,
             # otherwise it will always be bytes.
-            chunk = chunk.decode(self.msg.encoding, 'replace')
+            chunk = codec.decode(chunk, self.encoding)
         chunk = self.formatting.format_body(content=chunk, mime=self.mime)
-        return chunk.encode(self.output_encoding, 'replace')
+        return codec.encode(chunk, self.output_encoding)
 
 
 class BufferedPrettyStream(PrettyStream):

diff --git a/setup.py b/setup.py
@@ -25,6 +25,7 @@
     'wheel',
 ]
 install_requires = [
+    'charset_normalizer>=2.0.0',
     'defusedxml>=0.6.0',
     'requests[socks]>=2.22.0',
     'Pygments>=2.5.2',

diff --git a/tests/test_errors.py b/tests/test_errors.py
@@ -39,3 +39,10 @@ def test_max_headers_limit(httpbin_both):
 
 def test_max_headers_no_limit(httpbin_both):
     assert HTTP_OK in http('--max-headers=0', httpbin_both + '/get')
+
+
+def test_charset_argument_unknown_encoding(httpbin_both):
+    with raises(LookupError) as e:
+        http('--response-as', 'charset=foobar',
+             'GET', httpbin_both + '/get')
+    assert 'unknown encoding: foobar' in str(e.value)
diff --git a/tests/test_unicode.py b/tests/test_unicode.py
@@ -2,9 +2,17 @@
 Various unicode handling related tests.
 
 """
-from .utils import http, HTTP_OK
+import pytest
+import responses
+
+from httpie.cli.constants import PRETTY_MAP
+from httpie.constants import UTF8
+
+from .utils import http, HTTP_OK, URL_EXAMPLE
 from .fixtures import UNICODE
 
+ENCODINGS = [UTF8, 'windows-1250']
+
 
 def test_unicode_headers(httpbin):
     # httpbin doesn't interpret UFT-8 headers
@@ -109,3 +117,95 @@ def test_unicode_digest_auth(httpbin):
     http('--auth-type=digest',
          '--auth', f'test:{UNICODE}',
          f'{httpbin.url}/digest-auth/auth/test/{UNICODE}')
+
+
+@pytest.mark.parametrize('encoding', ENCODINGS)
+@responses.activate
+def test_GET_encoding_detection_from_content_type_header(encoding):
+    responses.add(responses.GET,
+                  URL_EXAMPLE,
+                  body='<?xml version="1.0"?>\n<c>Financiën</c>'.encode(encoding),
+                  content_type=f'text/xml; charset={encoding.upper()}')
+    r = http('GET', URL_EXAMPLE)
+    assert 'Financiën' in r
+
+
+@pytest.mark.parametrize('encoding', ENCODINGS)
+@responses.activate
+def test_GET_encoding_detection_from_content(encoding):
+    body = f'<?xml version="1.0" encoding="{encoding.upper()}"?>\n<c>Financiën</c>'
+    responses.add(responses.GET,
+                  URL_EXAMPLE,
+                  body=body.encode(encoding),
+                  content_type='text/xml')
+    r = http('GET', URL_EXAMPLE)
+    assert 'Financiën' in r
+
+
+@responses.activate
+def test_GET_encoding_provided_by_format_options():
+    responses.add(responses.GET,
+                  URL_EXAMPLE,
+                  body='▒▒▒'.encode('johab'),
+                  content_type='text/plain')
+    r = http('--format-options', 'response.as:text/plain; charset=johab',
+             'GET', URL_EXAMPLE)
+    assert '▒▒▒' in r
+
+
+@responses.activate
+def test_GET_encoding_provided_by_shortcut_option():
+    responses.add(responses.GET,
+                  URL_EXAMPLE,
+                  body='▒▒▒'.encode('johab'),
+                  content_type='text/plain')
+    r = http('--response-as', 'text/plain; charset=johab',
+             'GET', URL_EXAMPLE)
+    assert '▒▒▒' in r
+
+
+@pytest.mark.parametrize('encoding', ENCODINGS)
+@responses.activate
+def test_GET_encoding_provided_by_empty_shortcut_option_should_use_content_detection(encoding):
+    body = f'<?xml version="1.0" encoding="{encoding.upper()}"?>\n<c>Financiën</c>'
+    responses.add(responses.GET,
+                  URL_EXAMPLE,
+                  body=body.encode(encoding),
+                  content_type='text/xml')
+    r = http('--response-as', '', 'GET', URL_EXAMPLE)
+    assert 'Financiën' in r
+
+
+@pytest.mark.parametrize('encoding', ENCODINGS)
+@responses.activate
+def test_POST_encoding_detection_from_content_type_header(encoding):
+    responses.add(responses.POST,
+                  URL_EXAMPLE,
+                  body='Všichni lidé jsou si rovni.'.encode(encoding),
+                  content_type=f'text/plain; charset={encoding.upper()}')
+    r = http('--form', 'POST', URL_EXAMPLE)
+    assert 'Všichni lidé jsou si rovni.' in r
+
+
+@pytest.mark.parametrize('encoding', ENCODINGS)
+@responses.activate
+def test_POST_encoding_detection_from_content(encoding):
+    responses.add(responses.POST,
+                  URL_EXAMPLE,
+                  body='Všichni lidé jsou si rovni.'.encode(encoding),
+                  content_type='text/plain')
+    r = http('--form', 'POST', URL_EXAMPLE)
+    assert 'Všichni lidé jsou si rovni.' in r
+
+
+@pytest.mark.parametrize('encoding', ENCODINGS)
+@pytest.mark.parametrize('pretty', PRETTY_MAP.keys())
+@responses.activate
+def test_stream_encoding_detection_from_content_type_header(encoding, pretty):
+    responses.add(responses.GET,
+                  URL_EXAMPLE,
+                  body='<?xml version="1.0"?>\n<c>Financiën</c>'.encode(encoding),
+                  stream=True,
+                  content_type=f'text/xml; charset={encoding.upper()}')
+    r = http('--pretty=' + pretty, '--stream', 'GET', URL_EXAMPLE)
+    assert 'Financiën' in r