|
|
8a7204 |
From 50062c4d8c4108d17b7f12d9518ce883956d3921 Mon Sep 17 00:00:00 2001
|
|
|
8a7204 |
From: David Lord <davidism@gmail.com>
|
|
|
8a7204 |
Date: Tue, 10 Apr 2018 09:29:48 -0700
|
|
|
8a7204 |
Subject: [PATCH] detect UTF encodings when loading json
|
|
|
8a7204 |
|
|
|
8a7204 |
(cherry picked from commit 0e1e9a04aaf29ab78f721cfc79ac2a691f6e3929)
|
|
|
8a7204 |
---
|
|
|
8a7204 |
flask/json.py | 49 ++++++++++++++++++++++++++++++++++++++++++-
|
|
|
8a7204 |
flask/wrappers.py | 13 +++---------
|
|
|
8a7204 |
tests/test_helpers.py | 28 ++++++++++++++-----------
|
|
|
8a7204 |
3 files changed, 67 insertions(+), 23 deletions(-)
|
|
|
8a7204 |
|
|
|
8a7204 |
diff --git a/flask/json.py b/flask/json.py
|
|
|
8a7204 |
index 16e0c29..114873e 100644
|
|
|
8a7204 |
--- a/flask/json.py
|
|
|
8a7204 |
+++ b/flask/json.py
|
|
|
8a7204 |
@@ -8,6 +8,7 @@
|
|
|
8a7204 |
:copyright: (c) 2015 by Armin Ronacher.
|
|
|
8a7204 |
:license: BSD, see LICENSE for more details.
|
|
|
8a7204 |
"""
|
|
|
8a7204 |
+import codecs
|
|
|
8a7204 |
import io
|
|
|
8a7204 |
import uuid
|
|
|
8a7204 |
from datetime import date
|
|
|
8a7204 |
@@ -108,6 +109,49 @@ def _load_arg_defaults(kwargs):
|
|
|
8a7204 |
kwargs.setdefault('cls', JSONDecoder)
|
|
|
8a7204 |
|
|
|
8a7204 |
|
|
|
8a7204 |
+def detect_encoding(data):
|
|
|
8a7204 |
+ """Detect which UTF codec was used to encode the given bytes.
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ The latest JSON standard (:rfc:`8259`) suggests that only UTF-8 is
|
|
|
8a7204 |
+ accepted. Older documents allowed 8, 16, or 32. 16 and 32 can be big
|
|
|
8a7204 |
+ or little endian. Some editors or libraries may prepend a BOM.
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ :param data: Bytes in unknown UTF encoding.
|
|
|
8a7204 |
+ :return: UTF encoding name
|
|
|
8a7204 |
+ """
|
|
|
8a7204 |
+ head = data[:4]
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ if head[:3] == codecs.BOM_UTF8:
|
|
|
8a7204 |
+ return 'utf-8-sig'
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ if b'\x00' not in head:
|
|
|
8a7204 |
+ return 'utf-8'
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ if head in (codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE):
|
|
|
8a7204 |
+ return 'utf-32'
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ if head[:2] in (codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE):
|
|
|
8a7204 |
+ return 'utf-16'
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ if len(head) == 4:
|
|
|
8a7204 |
+ if head[:3] == b'\x00\x00\x00':
|
|
|
8a7204 |
+ return 'utf-32-be'
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ if head[::2] == b'\x00\x00':
|
|
|
8a7204 |
+ return 'utf-16-be'
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ if head[1:] == b'\x00\x00\x00':
|
|
|
8a7204 |
+ return 'utf-32-le'
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ if head[1::2] == b'\x00\x00':
|
|
|
8a7204 |
+ return 'utf-16-le'
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ if len(head) == 2:
|
|
|
8a7204 |
+ return 'utf-16-be' if head.startswith(b'\x00') else 'utf-16-le'
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+ return 'utf-8'
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+
|
|
|
8a7204 |
def dumps(obj, **kwargs):
|
|
|
8a7204 |
"""Serialize ``obj`` to a JSON formatted ``str`` by using the application's
|
|
|
8a7204 |
configured encoder (:attr:`~flask.Flask.json_encoder`) if there is an
|
|
|
8a7204 |
@@ -142,7 +186,10 @@ def loads(s, **kwargs):
|
|
|
8a7204 |
"""
|
|
|
8a7204 |
_load_arg_defaults(kwargs)
|
|
|
8a7204 |
if isinstance(s, bytes):
|
|
|
8a7204 |
- s = s.decode(kwargs.pop('encoding', None) or 'utf-8')
|
|
|
8a7204 |
+ encoding = kwargs.pop('encoding', None)
|
|
|
8a7204 |
+ if encoding is None:
|
|
|
8a7204 |
+ encoding = detect_encoding(s)
|
|
|
8a7204 |
+ s = s.decode(encoding)
|
|
|
8a7204 |
return _json.loads(s, **kwargs)
|
|
|
8a7204 |
|
|
|
8a7204 |
|
|
|
8a7204 |
diff --git a/flask/wrappers.py b/flask/wrappers.py
|
|
|
8a7204 |
index 04bdcb5..3e600fc 100644
|
|
|
8a7204 |
--- a/flask/wrappers.py
|
|
|
8a7204 |
+++ b/flask/wrappers.py
|
|
|
8a7204 |
@@ -144,17 +144,10 @@ class Request(RequestBase):
|
|
|
8a7204 |
if not (force or self.is_json):
|
|
|
8a7204 |
return None
|
|
|
8a7204 |
|
|
|
8a7204 |
- # We accept a request charset against the specification as
|
|
|
8a7204 |
- # certain clients have been using this in the past. This
|
|
|
8a7204 |
- # fits our general approach of being nice in what we accept
|
|
|
8a7204 |
- # and strict in what we send out.
|
|
|
8a7204 |
- request_charset = self.mimetype_params.get('charset')
|
|
|
8a7204 |
+ data = _get_data(self, cache)
|
|
|
8a7204 |
+
|
|
|
8a7204 |
try:
|
|
|
8a7204 |
- data = _get_data(self, cache)
|
|
|
8a7204 |
- if request_charset is not None:
|
|
|
8a7204 |
- rv = json.loads(data, encoding=request_charset)
|
|
|
8a7204 |
- else:
|
|
|
8a7204 |
- rv = json.loads(data)
|
|
|
8a7204 |
+ rv = json.loads(data)
|
|
|
8a7204 |
except ValueError as e:
|
|
|
8a7204 |
if silent:
|
|
|
8a7204 |
rv = None
|
|
|
8a7204 |
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
|
|
|
8a7204 |
index 9320ef7..9990782 100644
|
|
|
8a7204 |
--- a/tests/test_helpers.py
|
|
|
8a7204 |
+++ b/tests/test_helpers.py
|
|
|
8a7204 |
@@ -21,6 +21,8 @@ from werkzeug.datastructures import Range
|
|
|
8a7204 |
from werkzeug.exceptions import BadRequest, NotFound
|
|
|
8a7204 |
from werkzeug.http import parse_cache_control_header, parse_options_header
|
|
|
8a7204 |
from werkzeug.http import http_date
|
|
|
8a7204 |
+
|
|
|
8a7204 |
+from flask import json
|
|
|
8a7204 |
from flask._compat import StringIO, text_type
|
|
|
8a7204 |
|
|
|
8a7204 |
|
|
|
8a7204 |
@@ -34,6 +36,20 @@ def has_encoding(name):
|
|
|
8a7204 |
|
|
|
8a7204 |
|
|
|
8a7204 |
class TestJSON(object):
|
|
|
8a7204 |
+ @pytest.mark.parametrize('value', (
|
|
|
8a7204 |
+ 1, 't', True, False, None,
|
|
|
8a7204 |
+ [], [1, 2, 3],
|
|
|
8a7204 |
+ {}, {'foo': u'🐍'},
|
|
|
8a7204 |
+ ))
|
|
|
8a7204 |
+ @pytest.mark.parametrize('encoding', (
|
|
|
8a7204 |
+ 'utf-8', 'utf-8-sig',
|
|
|
8a7204 |
+ 'utf-16-le', 'utf-16-be', 'utf-16',
|
|
|
8a7204 |
+ 'utf-32-le', 'utf-32-be', 'utf-32',
|
|
|
8a7204 |
+ ))
|
|
|
8a7204 |
+ def test_detect_encoding(self, value, encoding):
|
|
|
8a7204 |
+ data = json.dumps(value).encode(encoding)
|
|
|
8a7204 |
+ assert json.detect_encoding(data) == encoding
|
|
|
8a7204 |
+ assert json.loads(data) == value
|
|
|
8a7204 |
|
|
|
8a7204 |
def test_ignore_cached_json(self):
|
|
|
8a7204 |
app = flask.Flask(__name__)
|
|
|
8a7204 |
@@ -85,18 +101,6 @@ class TestJSON(object):
|
|
|
8a7204 |
rv = c.post('/json', data='"foo"', content_type='application/x+json')
|
|
|
8a7204 |
assert rv.data == b'foo'
|
|
|
8a7204 |
|
|
|
8a7204 |
- def test_json_body_encoding(self):
|
|
|
8a7204 |
- app = flask.Flask(__name__)
|
|
|
8a7204 |
- app.testing = True
|
|
|
8a7204 |
- @app.route('/')
|
|
|
8a7204 |
- def index():
|
|
|
8a7204 |
- return flask.request.get_json()
|
|
|
8a7204 |
-
|
|
|
8a7204 |
- c = app.test_client()
|
|
|
8a7204 |
- resp = c.get('/', data=u'"Hällo Wörld"'.encode('iso-8859-15'),
|
|
|
8a7204 |
- content_type='application/json; charset=iso-8859-15')
|
|
|
8a7204 |
- assert resp.data == u'Hällo Wörld'.encode('utf-8')
|
|
|
8a7204 |
-
|
|
|
8a7204 |
def test_json_as_unicode(self):
|
|
|
8a7204 |
app = flask.Flask(__name__)
|
|
|
8a7204 |
|
|
|
8a7204 |
--
|
|
|
8a7204 |
2.17.1
|
|
|
8a7204 |
|