|
|
04a680 |
From 90986ef48c0df602ab38aa831a24e99e9ed61e7e Mon Sep 17 00:00:00 2001
|
|
|
04a680 |
From: Charalampos Stratakis <cstratak@redhat.com>
|
|
|
04a680 |
Date: Mon, 4 Apr 2016 15:55:28 +0200
|
|
|
04a680 |
Subject: [PATCH] JSON decoder now accepts lone surrogates
|
|
|
04a680 |
|
|
|
04a680 |
---
|
|
|
04a680 |
Lib/json/decoder.py | 35 ++++++++++++------------
|
|
|
04a680 |
Lib/json/tests/test_scanstring.py | 56 ++++++++++++++++++++++++++++++++++++---
|
|
|
04a680 |
Modules/_json.c | 49 +++++++++-------------------------
|
|
|
04a680 |
3 files changed, 83 insertions(+), 57 deletions(-)
|
|
|
04a680 |
|
|
|
04a680 |
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
|
|
|
04a680 |
index dfcc628..1b43238 100644
|
|
|
04a680 |
--- a/Lib/json/decoder.py
|
|
|
04a680 |
+++ b/Lib/json/decoder.py
|
|
|
04a680 |
@@ -62,6 +62,16 @@ BACKSLASH = {
|
|
|
04a680 |
|
|
|
04a680 |
DEFAULT_ENCODING = "utf-8"
|
|
|
04a680 |
|
|
|
04a680 |
+def _decode_uXXXX(s, pos):
|
|
|
04a680 |
+ esc = s[pos + 1:pos + 5]
|
|
|
04a680 |
+ if len(esc) == 4 and esc[1] not in 'xX':
|
|
|
04a680 |
+ try:
|
|
|
04a680 |
+ return int(esc, 16)
|
|
|
04a680 |
+ except ValueError:
|
|
|
04a680 |
+ pass
|
|
|
04a680 |
+ msg = "Invalid \\uXXXX escape"
|
|
|
04a680 |
+ raise ValueError(errmsg(msg, s, pos))
|
|
|
04a680 |
+
|
|
|
04a680 |
def py_scanstring(s, end, encoding=None, strict=True,
|
|
|
04a680 |
_b=BACKSLASH, _m=STRINGCHUNK.match):
|
|
|
04a680 |
"""Scan the string s for a JSON string. End is the index of the
|
|
|
04a680 |
@@ -116,25 +126,16 @@ def py_scanstring(s, end, encoding=None, strict=True,
|
|
|
04a680 |
end += 1
|
|
|
04a680 |
else:
|
|
|
04a680 |
# Unicode escape sequence
|
|
|
04a680 |
- esc = s[end + 1:end + 5]
|
|
|
04a680 |
- next_end = end + 5
|
|
|
04a680 |
- if len(esc) != 4:
|
|
|
04a680 |
- msg = "Invalid \\uXXXX escape"
|
|
|
04a680 |
- raise ValueError(errmsg(msg, s, end))
|
|
|
04a680 |
- uni = int(esc, 16)
|
|
|
04a680 |
+ uni = _decode_uXXXX(s, end)
|
|
|
04a680 |
+ end += 5
|
|
|
04a680 |
# Check for surrogate pair on UCS-4 systems
|
|
|
04a680 |
- if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
|
|
|
04a680 |
- msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
|
|
|
04a680 |
- if not s[end + 5:end + 7] == '\\u':
|
|
|
04a680 |
- raise ValueError(errmsg(msg, s, end))
|
|
|
04a680 |
- esc2 = s[end + 7:end + 11]
|
|
|
04a680 |
- if len(esc2) != 4:
|
|
|
04a680 |
- raise ValueError(errmsg(msg, s, end))
|
|
|
04a680 |
- uni2 = int(esc2, 16)
|
|
|
04a680 |
- uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
|
|
|
04a680 |
- next_end += 6
|
|
|
04a680 |
+ if sys.maxunicode > 65535 and \
|
|
|
04a680 |
+ 0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
|
|
|
04a680 |
+ uni2 = _decode_uXXXX(s, end + 1)
|
|
|
04a680 |
+ if 0xdc00 <= uni2 <= 0xdfff:
|
|
|
04a680 |
+ uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
|
|
|
04a680 |
+ end += 6
|
|
|
04a680 |
char = unichr(uni)
|
|
|
04a680 |
- end = next_end
|
|
|
04a680 |
# Append the unescaped character
|
|
|
04a680 |
_append(char)
|
|
|
04a680 |
return u''.join(chunks), end
|
|
|
04a680 |
diff --git a/Lib/json/tests/test_scanstring.py b/Lib/json/tests/test_scanstring.py
|
|
|
04a680 |
index 4fef8cb..ed80a41 100644
|
|
|
04a680 |
--- a/Lib/json/tests/test_scanstring.py
|
|
|
04a680 |
+++ b/Lib/json/tests/test_scanstring.py
|
|
|
04a680 |
@@ -5,10 +5,6 @@ from json.tests import PyTest, CTest
|
|
|
04a680 |
class TestScanstring(object):
|
|
|
04a680 |
def test_scanstring(self):
|
|
|
04a680 |
scanstring = self.json.decoder.scanstring
|
|
|
04a680 |
- self.assertEqual(
|
|
|
04a680 |
- scanstring('"z\\ud834\\udd20x"', 1, None, True),
|
|
|
04a680 |
- (u'z\U0001d120x', 16))
|
|
|
04a680 |
-
|
|
|
04a680 |
if sys.maxunicode == 65535:
|
|
|
04a680 |
self.assertEqual(
|
|
|
04a680 |
scanstring(u'"z\U0001d120x"', 1, None, True),
|
|
|
04a680 |
@@ -94,6 +90,58 @@ class TestScanstring(object):
|
|
|
04a680 |
scanstring('["Bad value", truth]', 2, None, True),
|
|
|
04a680 |
(u'Bad value', 12))
|
|
|
04a680 |
|
|
|
04a680 |
+ def test_surrogates(self):
|
|
|
04a680 |
+ scanstring = self.json.decoder.scanstring
|
|
|
04a680 |
+ def assertScan(given, expect):
|
|
|
04a680 |
+ self.assertEqual(scanstring(given, 1, None, True),
|
|
|
04a680 |
+ (expect, len(given)))
|
|
|
04a680 |
+ if not isinstance(given, unicode):
|
|
|
04a680 |
+ given = unicode(given)
|
|
|
04a680 |
+ self.assertEqual(scanstring(given, 1, None, True),
|
|
|
04a680 |
+ (expect, len(given)))
|
|
|
04a680 |
+
|
|
|
04a680 |
+ surrogates = unichr(0xd834) + unichr(0xdd20)
|
|
|
04a680 |
+ assertScan('"z\\ud834\\u0079x"', u'z\ud834yx')
|
|
|
04a680 |
+ assertScan('"z\\ud834\\udd20x"', u'z\U0001d120x')
|
|
|
04a680 |
+ assertScan('"z\\ud834\\ud834\\udd20x"', u'z\ud834\U0001d120x')
|
|
|
04a680 |
+ assertScan('"z\\ud834x"', u'z\ud834x')
|
|
|
04a680 |
+ assertScan(u'"z\\ud834\udd20x12345"', u'z%sx12345' % surrogates)
|
|
|
04a680 |
+ assertScan('"z\\udd20x"', u'z\udd20x')
|
|
|
04a680 |
+ assertScan(u'"z\ud834\udd20x"', u'z\ud834\udd20x')
|
|
|
04a680 |
+ assertScan(u'"z\ud834\\udd20x"', u'z%sx' % surrogates)
|
|
|
04a680 |
+ assertScan(u'"z\ud834x"', u'z\ud834x')
|
|
|
04a680 |
+
|
|
|
04a680 |
+ def test_bad_escapes(self):
|
|
|
04a680 |
+ scanstring = self.json.decoder.scanstring
|
|
|
04a680 |
+ bad_escapes = [
|
|
|
04a680 |
+ '"\\"',
|
|
|
04a680 |
+ '"\\x"',
|
|
|
04a680 |
+ '"\\u"',
|
|
|
04a680 |
+ '"\\u0"',
|
|
|
04a680 |
+ '"\\u01"',
|
|
|
04a680 |
+ '"\\u012"',
|
|
|
04a680 |
+ '"\\uz012"',
|
|
|
04a680 |
+ '"\\u0z12"',
|
|
|
04a680 |
+ '"\\u01z2"',
|
|
|
04a680 |
+ '"\\u012z"',
|
|
|
04a680 |
+ '"\\u0x12"',
|
|
|
04a680 |
+ '"\\u0X12"',
|
|
|
04a680 |
+ '"\\ud834\\"',
|
|
|
04a680 |
+ '"\\ud834\\u"',
|
|
|
04a680 |
+ '"\\ud834\\ud"',
|
|
|
04a680 |
+ '"\\ud834\\udd"',
|
|
|
04a680 |
+ '"\\ud834\\udd2"',
|
|
|
04a680 |
+ '"\\ud834\\uzdd2"',
|
|
|
04a680 |
+ '"\\ud834\\udzd2"',
|
|
|
04a680 |
+ '"\\ud834\\uddz2"',
|
|
|
04a680 |
+ '"\\ud834\\udd2z"',
|
|
|
04a680 |
+ '"\\ud834\\u0x20"',
|
|
|
04a680 |
+ '"\\ud834\\u0X20"',
|
|
|
04a680 |
+ ]
|
|
|
04a680 |
+ for s in bad_escapes:
|
|
|
04a680 |
+ with self.assertRaises(ValueError):
|
|
|
04a680 |
+ scanstring(s, 1, None, True)
|
|
|
04a680 |
+
|
|
|
04a680 |
def test_issue3623(self):
|
|
|
04a680 |
self.assertRaises(ValueError, self.json.decoder.scanstring, b"xxx", 1,
|
|
|
04a680 |
"xxx")
|
|
|
04a680 |
diff --git a/Modules/_json.c b/Modules/_json.c
|
|
|
04a680 |
index 7c925fd..56d9ee4 100644
|
|
|
04a680 |
--- a/Modules/_json.c
|
|
|
04a680 |
+++ b/Modules/_json.c
|
|
|
04a680 |
@@ -524,16 +524,10 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
|
|
|
04a680 |
}
|
|
|
04a680 |
#ifdef Py_UNICODE_WIDE
|
|
|
04a680 |
/* Surrogate pair */
|
|
|
04a680 |
- if ((c & 0xfc00) == 0xd800) {
|
|
|
04a680 |
+ if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
|
|
|
04a680 |
+ buf[next++] == '\\' &&
|
|
|
04a680 |
+ buf[next++] == 'u') {
|
|
|
04a680 |
Py_UNICODE c2 = 0;
|
|
|
04a680 |
- if (end + 6 >= len) {
|
|
|
04a680 |
- raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
|
|
04a680 |
- goto bail;
|
|
|
04a680 |
- }
|
|
|
04a680 |
- if (buf[next++] != '\\' || buf[next++] != 'u') {
|
|
|
04a680 |
- raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
|
|
04a680 |
- goto bail;
|
|
|
04a680 |
- }
|
|
|
04a680 |
end += 6;
|
|
|
04a680 |
/* Decode 4 hex digits */
|
|
|
04a680 |
for (; next < end; next++) {
|
|
|
04a680 |
@@ -554,15 +548,10 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
|
|
|
04a680 |
goto bail;
|
|
|
04a680 |
}
|
|
|
04a680 |
}
|
|
|
04a680 |
- if ((c2 & 0xfc00) != 0xdc00) {
|
|
|
04a680 |
- raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
|
|
04a680 |
- goto bail;
|
|
|
04a680 |
- }
|
|
|
04a680 |
- c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
|
|
|
04a680 |
- }
|
|
|
04a680 |
- else if ((c & 0xfc00) == 0xdc00) {
|
|
|
04a680 |
- raise_errmsg("Unpaired low surrogate", pystr, end - 5);
|
|
|
04a680 |
- goto bail;
|
|
|
04a680 |
+ if ((c2 & 0xfc00) == 0xdc00)
|
|
|
04a680 |
+ c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
|
|
|
04a680 |
+ else
|
|
|
04a680 |
+ end -= 6;
|
|
|
04a680 |
}
|
|
|
04a680 |
#endif
|
|
|
04a680 |
}
|
|
|
04a680 |
@@ -703,16 +692,9 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
|
|
|
04a680 |
}
|
|
|
04a680 |
#ifdef Py_UNICODE_WIDE
|
|
|
04a680 |
/* Surrogate pair */
|
|
|
04a680 |
- if ((c & 0xfc00) == 0xd800) {
|
|
|
04a680 |
+ if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
|
|
|
04a680 |
+ buf[next++] == '\\' && buf[next++] == 'u') {
|
|
|
04a680 |
Py_UNICODE c2 = 0;
|
|
|
04a680 |
- if (end + 6 >= len) {
|
|
|
04a680 |
- raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
|
|
04a680 |
- goto bail;
|
|
|
04a680 |
- }
|
|
|
04a680 |
- if (buf[next++] != '\\' || buf[next++] != 'u') {
|
|
|
04a680 |
- raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
|
|
04a680 |
- goto bail;
|
|
|
04a680 |
- }
|
|
|
04a680 |
end += 6;
|
|
|
04a680 |
/* Decode 4 hex digits */
|
|
|
04a680 |
for (; next < end; next++) {
|
|
|
04a680 |
@@ -733,15 +715,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
|
|
|
04a680 |
goto bail;
|
|
|
04a680 |
}
|
|
|
04a680 |
}
|
|
|
04a680 |
- if ((c2 & 0xfc00) != 0xdc00) {
|
|
|
04a680 |
- raise_errmsg("Unpaired high surrogate", pystr, end - 5);
|
|
|
04a680 |
- goto bail;
|
|
|
04a680 |
- }
|
|
|
04a680 |
- c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
|
|
|
04a680 |
- }
|
|
|
04a680 |
- else if ((c & 0xfc00) == 0xdc00) {
|
|
|
04a680 |
- raise_errmsg("Unpaired low surrogate", pystr, end - 5);
|
|
|
04a680 |
- goto bail;
|
|
|
04a680 |
+ if ((c2 & 0xfc00) == 0xdc00)
|
|
|
04a680 |
+ c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
|
|
|
04a680 |
+ else
|
|
|
04a680 |
+ end -= 6;
|
|
|
04a680 |
}
|
|
|
04a680 |
#endif
|
|
|
04a680 |
}
|
|
|
04a680 |
--
|
|
|
04a680 |
2.5.5
|
|
|
04a680 |
|