def detect_encoding(data):
    """Detect which UTF codec was used to encode the given bytes.

    The latest JSON standard (:rfc:`8259`) suggests that only UTF-8 is
    accepted. Older documents allowed 8, 16, or 32. 16 and 32 can be big
    or little endian. Some editors or libraries may prepend a BOM.

    Without a BOM, detection relies on the first character of a JSON
    document being ASCII: the position of the null bytes inside the
    first code unit reveals both the code unit width and the byte
    order. Only the first code unit is inspected, so documents whose
    second character is not ASCII (for example ``"日本"``) are still
    detected correctly.

    :param data: Bytes in unknown UTF encoding.
    :return: UTF encoding name
    """
    head = data[:4]

    # Byte order marks are checked before anything else. UTF-32 has to
    # come before UTF-16 because the UTF-32-LE BOM begins with the
    # UTF-16-LE BOM.
    if head.startswith((codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE)):
        return 'utf-32'

    if head.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
        return 'utf-16'

    if head.startswith(codecs.BOM_UTF8):
        return 'utf-8-sig'

    if b'\x00' not in head:
        return 'utf-8'

    # Slices instead of indexing so the comparison works for both
    # Python 2 ``str`` and Python 3 ``bytes``.
    first_is_null = head[:1] == b'\x00'
    second_is_null = head[1:2] == b'\x00'

    if len(head) == 4:
        if first_is_null:
            # 00 00 -- --  utf-32-be
            # 00 XX -- --  utf-16-be
            return 'utf-32-be' if second_is_null else 'utf-16-be'

        if second_is_null:
            # XX 00 00 00  utf-32-le
            # XX 00 00 XX  utf-16-le
            # XX 00 XX --  utf-16-le
            return 'utf-32-le' if head[2:] == b'\x00\x00' else 'utf-16-le'
    elif len(head) == 2:
        if first_is_null:
            # 00 XX  utf-16-be
            return 'utf-16-be'

        if second_is_null:
            # XX 00  utf-16-le
            return 'utf-16-le'

    return 'utf-8'
def test_detect_encoding(self):
    """Check encoding detection and decoding for every UTF codec.

    Each sample value is serialized with ``json.dumps``, encoded with
    each codec in turn, and then must be both identified by
    ``json.detect_encoding`` and restored to the original value by
    ``json.loads``.
    """
    samples = (
        1, 't', True, False, None, [], [1, 2, 3], {}, {'foo': u'🐍'},
    )
    codec_names = (
        'utf-8', 'utf-8-sig',
        'utf-16-le', 'utf-16-be', 'utf-16',
        'utf-32-le', 'utf-32-be', 'utf-32',
    )

    for codec_name in codec_names:
        for sample in samples:
            payload = json.dumps(sample).encode(codec_name)

            # The codec reported must be exactly the one used to encode.
            detected = json.detect_encoding(payload)
            self.assert_equal(detected, codec_name)

            # Loading the raw bytes must round-trip the original value.
            restored = json.loads(payload)
            self.assert_equal(restored, sample)
This - # fits our general approach of being nice in what we accept - # and strict in what we send out. - request_charset = self.mimetype_params.get('charset') + data = _get_data(self, cache) + try: - data = _get_data(self, cache) - if request_charset is not None: - rv = json.loads(data, encoding=request_charset) - else: - rv = json.loads(data) + rv = json.loads(data) except ValueError as e: if silent: rv = None