Blame SOURCES/00235-JSON-decoder-lone-surrogates-fix.patch

04a680
From 90986ef48c0df602ab38aa831a24e99e9ed61e7e Mon Sep 17 00:00:00 2001
04a680
From: Charalampos Stratakis <cstratak@redhat.com>
04a680
Date: Mon, 4 Apr 2016 15:55:28 +0200
04a680
Subject: [PATCH] JSON decoder now accepts lone surrogates
04a680
04a680
---
04a680
 Lib/json/decoder.py               | 35 ++++++++++++------------
04a680
 Lib/json/tests/test_scanstring.py | 56 ++++++++++++++++++++++++++++++++++++---
04a680
 Modules/_json.c                   | 49 +++++++++-------------------------
04a680
 3 files changed, 83 insertions(+), 57 deletions(-)
04a680
04a680
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
04a680
index dfcc628..1b43238 100644
04a680
--- a/Lib/json/decoder.py
04a680
+++ b/Lib/json/decoder.py
04a680
@@ -62,6 +62,16 @@ BACKSLASH = {
04a680
04a680
 DEFAULT_ENCODING = "utf-8"
04a680
04a680
+def _decode_uXXXX(s, pos):
04a680
+    esc = s[pos + 1:pos + 5]
04a680
+    if len(esc) == 4 and esc[1] not in 'xX':
04a680
+        try:
04a680
+            return int(esc, 16)
04a680
+        except ValueError:
04a680
+            pass
04a680
+    msg = "Invalid \\uXXXX escape"
04a680
+    raise ValueError(errmsg(msg, s, pos))
04a680
+
04a680
 def py_scanstring(s, end, encoding=None, strict=True,
04a680
         _b=BACKSLASH, _m=STRINGCHUNK.match):
04a680
     """Scan the string s for a JSON string. End is the index of the
04a680
@@ -116,25 +126,16 @@ def py_scanstring(s, end, encoding=None, strict=True,
04a680
             end += 1
04a680
         else:
04a680
             # Unicode escape sequence
04a680
-            esc = s[end + 1:end + 5]
04a680
-            next_end = end + 5
04a680
-            if len(esc) != 4:
04a680
-                msg = "Invalid \\uXXXX escape"
04a680
-                raise ValueError(errmsg(msg, s, end))
04a680
-            uni = int(esc, 16)
04a680
+            uni = _decode_uXXXX(s, end)
04a680
+            end += 5
04a680
             # Check for surrogate pair on UCS-4 systems
04a680
-            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
04a680
-                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
04a680
-                if not s[end + 5:end + 7] == '\\u':
04a680
-                    raise ValueError(errmsg(msg, s, end))
04a680
-                esc2 = s[end + 7:end + 11]
04a680
-                if len(esc2) != 4:
04a680
-                    raise ValueError(errmsg(msg, s, end))
04a680
-                uni2 = int(esc2, 16)
04a680
-                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
04a680
-                next_end += 6
04a680
+            if sys.maxunicode > 65535 and \
04a680
+               0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
04a680
+                uni2 = _decode_uXXXX(s, end + 1)
04a680
+                if 0xdc00 <= uni2 <= 0xdfff:
04a680
+                    uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
04a680
+                    end += 6
04a680
             char = unichr(uni)
04a680
-            end = next_end
04a680
         # Append the unescaped character
04a680
         _append(char)
04a680
     return u''.join(chunks), end
04a680
diff --git a/Lib/json/tests/test_scanstring.py b/Lib/json/tests/test_scanstring.py
04a680
index 4fef8cb..ed80a41 100644
04a680
--- a/Lib/json/tests/test_scanstring.py
04a680
+++ b/Lib/json/tests/test_scanstring.py
04a680
@@ -5,10 +5,6 @@ from json.tests import PyTest, CTest
04a680
 class TestScanstring(object):
04a680
     def test_scanstring(self):
04a680
         scanstring = self.json.decoder.scanstring
04a680
-        self.assertEqual(
04a680
-            scanstring('"z\\ud834\\udd20x"', 1, None, True),
04a680
-            (u'z\U0001d120x', 16))
04a680
-
04a680
         if sys.maxunicode == 65535:
04a680
             self.assertEqual(
04a680
                 scanstring(u'"z\U0001d120x"', 1, None, True),
04a680
@@ -94,6 +90,58 @@ class TestScanstring(object):
04a680
             scanstring('["Bad value", truth]', 2, None, True),
04a680
             (u'Bad value', 12))
04a680
04a680
+    def test_surrogates(self):
04a680
+        scanstring = self.json.decoder.scanstring
04a680
+        def assertScan(given, expect):
04a680
+            self.assertEqual(scanstring(given, 1, None, True),
04a680
+                             (expect, len(given)))
04a680
+            if not isinstance(given, unicode):
04a680
+                given = unicode(given)
04a680
+                self.assertEqual(scanstring(given, 1, None, True),
04a680
+                                 (expect, len(given)))
04a680
+
04a680
+        surrogates = unichr(0xd834) + unichr(0xdd20)
04a680
+        assertScan('"z\\ud834\\u0079x"', u'z\ud834yx')
04a680
+        assertScan('"z\\ud834\\udd20x"', u'z\U0001d120x')
04a680
+        assertScan('"z\\ud834\\ud834\\udd20x"', u'z\ud834\U0001d120x')
04a680
+        assertScan('"z\\ud834x"', u'z\ud834x')
04a680
+        assertScan(u'"z\\ud834\udd20x12345"', u'z%sx12345' % surrogates)
04a680
+        assertScan('"z\\udd20x"', u'z\udd20x')
04a680
+        assertScan(u'"z\ud834\udd20x"', u'z\ud834\udd20x')
04a680
+        assertScan(u'"z\ud834\\udd20x"', u'z%sx' % surrogates)
04a680
+        assertScan(u'"z\ud834x"', u'z\ud834x')
04a680
+
04a680
+    def test_bad_escapes(self):
04a680
+        scanstring = self.json.decoder.scanstring
04a680
+        bad_escapes = [
04a680
+            '"\\"',
04a680
+            '"\\x"',
04a680
+            '"\\u"',
04a680
+            '"\\u0"',
04a680
+            '"\\u01"',
04a680
+            '"\\u012"',
04a680
+            '"\\uz012"',
04a680
+            '"\\u0z12"',
04a680
+            '"\\u01z2"',
04a680
+            '"\\u012z"',
04a680
+            '"\\u0x12"',
04a680
+            '"\\u0X12"',
04a680
+            '"\\ud834\\"',
04a680
+            '"\\ud834\\u"',
04a680
+            '"\\ud834\\ud"',
04a680
+            '"\\ud834\\udd"',
04a680
+            '"\\ud834\\udd2"',
04a680
+            '"\\ud834\\uzdd2"',
04a680
+            '"\\ud834\\udzd2"',
04a680
+            '"\\ud834\\uddz2"',
04a680
+            '"\\ud834\\udd2z"',
04a680
+            '"\\ud834\\u0x20"',
04a680
+            '"\\ud834\\u0X20"',
04a680
+        ]
04a680
+        for s in bad_escapes:
04a680
+            with self.assertRaises(ValueError):
04a680
+                scanstring(s, 1, None, True)
04a680
+
04a680
     def test_issue3623(self):
04a680
         self.assertRaises(ValueError, self.json.decoder.scanstring, b"xxx", 1,
04a680
                           "xxx")
04a680
diff --git a/Modules/_json.c b/Modules/_json.c
04a680
index 7c925fd..56d9ee4 100644
04a680
--- a/Modules/_json.c
04a680
+++ b/Modules/_json.c
04a680
@@ -524,16 +524,10 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
04a680
             }
04a680
 #ifdef Py_UNICODE_WIDE
04a680
             /* Surrogate pair */
04a680
-            if ((c & 0xfc00) == 0xd800) {
04a680
+            if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
04a680
+                buf[next++] == '\\' &&
04a680
+                buf[next++] == 'u') {
04a680
                 Py_UNICODE c2 = 0;
04a680
-                if (end + 6 >= len) {
04a680
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
04a680
-                    goto bail;
04a680
-                }
04a680
-                if (buf[next++] != '\\' || buf[next++] != 'u') {
04a680
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
04a680
-                    goto bail;
04a680
-                }
04a680
                 end += 6;
04a680
                 /* Decode 4 hex digits */
04a680
                 for (; next < end; next++) {
04a680
@@ -554,15 +548,10 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
04a680
                             goto bail;
04a680
                     }
04a680
                 }
04a680
-                if ((c2 & 0xfc00) != 0xdc00) {
04a680
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
04a680
-                    goto bail;
04a680
-                }
04a680
-                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
04a680
-            }
04a680
-            else if ((c & 0xfc00) == 0xdc00) {
04a680
-                raise_errmsg("Unpaired low surrogate", pystr, end - 5);
04a680
-                goto bail;
04a680
+                if ((c2 & 0xfc00) == 0xdc00)
04a680
+                    c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
04a680
+                else
04a680
+                    end -= 6;
04a680
             }
04a680
 #endif
04a680
         }
04a680
@@ -703,16 +692,9 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
04a680
             }
04a680
 #ifdef Py_UNICODE_WIDE
04a680
             /* Surrogate pair */
04a680
-            if ((c & 0xfc00) == 0xd800) {
04a680
+            if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
04a680
+                buf[next++] == '\\' && buf[next++] == 'u') {
04a680
                 Py_UNICODE c2 = 0;
04a680
-                if (end + 6 >= len) {
04a680
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
04a680
-                    goto bail;
04a680
-                }
04a680
-                if (buf[next++] != '\\' || buf[next++] != 'u') {
04a680
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
04a680
-                    goto bail;
04a680
-                }
04a680
                 end += 6;
04a680
                 /* Decode 4 hex digits */
04a680
                 for (; next < end; next++) {
04a680
@@ -733,15 +715,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
04a680
                             goto bail;
04a680
                     }
04a680
                 }
04a680
-                if ((c2 & 0xfc00) != 0xdc00) {
04a680
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
04a680
-                    goto bail;
04a680
-                }
04a680
-                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
04a680
-            }
04a680
-            else if ((c & 0xfc00) == 0xdc00) {
04a680
-                raise_errmsg("Unpaired low surrogate", pystr, end - 5);
04a680
-                goto bail;
04a680
+                if ((c2 & 0xfc00) == 0xdc00)
04a680
+                    c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
04a680
+                else
04a680
+                    end -= 6;
04a680
             }
04a680
 #endif
04a680
         }
04a680
--
04a680
2.5.5
04a680