An interpreted, interactive, object-oriented programming language
CentOS Sources
2017-08-01 71084d584ff953f5463757ec6536406320560b4d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
From 90986ef48c0df602ab38aa831a24e99e9ed61e7e Mon Sep 17 00:00:00 2001
From: Charalampos Stratakis <cstratak@redhat.com>
Date: Mon, 4 Apr 2016 15:55:28 +0200
Subject: [PATCH] JSON decoder now accepts lone surrogates
 
---
 Lib/json/decoder.py               | 35 ++++++++++++------------
 Lib/json/tests/test_scanstring.py | 56 ++++++++++++++++++++++++++++++++++++---
 Modules/_json.c                   | 49 +++++++++-------------------------
 3 files changed, 83 insertions(+), 57 deletions(-)
 
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
index dfcc628..1b43238 100644
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py
@@ -62,6 +62,16 @@ BACKSLASH = {
 
 DEFAULT_ENCODING = "utf-8"
 
+def _decode_uXXXX(s, pos):
+    esc = s[pos + 1:pos + 5]
+    if len(esc) == 4 and esc[1] not in 'xX':
+        try:
+            return int(esc, 16)
+        except ValueError:
+            pass
+    msg = "Invalid \\uXXXX escape"
+    raise ValueError(errmsg(msg, s, pos))
+
 def py_scanstring(s, end, encoding=None, strict=True,
         _b=BACKSLASH, _m=STRINGCHUNK.match):
     """Scan the string s for a JSON string. End is the index of the
@@ -116,25 +126,16 @@ def py_scanstring(s, end, encoding=None, strict=True,
             end += 1
         else:
             # Unicode escape sequence
-            esc = s[end + 1:end + 5]
-            next_end = end + 5
-            if len(esc) != 4:
-                msg = "Invalid \\uXXXX escape"
-                raise ValueError(errmsg(msg, s, end))
-            uni = int(esc, 16)
+            uni = _decode_uXXXX(s, end)
+            end += 5
             # Check for surrogate pair on UCS-4 systems
-            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
-                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
-                if not s[end + 5:end + 7] == '\\u':
-                    raise ValueError(errmsg(msg, s, end))
-                esc2 = s[end + 7:end + 11]
-                if len(esc2) != 4:
-                    raise ValueError(errmsg(msg, s, end))
-                uni2 = int(esc2, 16)
-                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
-                next_end += 6
+            if sys.maxunicode > 65535 and \
+               0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
+                uni2 = _decode_uXXXX(s, end + 1)
+                if 0xdc00 <= uni2 <= 0xdfff:
+                    uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
+                    end += 6
             char = unichr(uni)
-            end = next_end
         # Append the unescaped character
         _append(char)
     return u''.join(chunks), end
diff --git a/Lib/json/tests/test_scanstring.py b/Lib/json/tests/test_scanstring.py
index 4fef8cb..ed80a41 100644
--- a/Lib/json/tests/test_scanstring.py
+++ b/Lib/json/tests/test_scanstring.py
@@ -5,10 +5,6 @@ from json.tests import PyTest, CTest
 class TestScanstring(object):
     def test_scanstring(self):
         scanstring = self.json.decoder.scanstring
-        self.assertEqual(
-            scanstring('"z\\ud834\\udd20x"', 1, None, True),
-            (u'z\U0001d120x', 16))
-
         if sys.maxunicode == 65535:
             self.assertEqual(
                 scanstring(u'"z\U0001d120x"', 1, None, True),
@@ -94,6 +90,58 @@ class TestScanstring(object):
             scanstring('["Bad value", truth]', 2, None, True),
             (u'Bad value', 12))
 
+    def test_surrogates(self):
+        scanstring = self.json.decoder.scanstring
+        def assertScan(given, expect):
+            self.assertEqual(scanstring(given, 1, None, True),
+                             (expect, len(given)))
+            if not isinstance(given, unicode):
+                given = unicode(given)
+                self.assertEqual(scanstring(given, 1, None, True),
+                                 (expect, len(given)))
+
+        surrogates = unichr(0xd834) + unichr(0xdd20)
+        assertScan('"z\\ud834\\u0079x"', u'z\ud834yx')
+        assertScan('"z\\ud834\\udd20x"', u'z\U0001d120x')
+        assertScan('"z\\ud834\\ud834\\udd20x"', u'z\ud834\U0001d120x')
+        assertScan('"z\\ud834x"', u'z\ud834x')
+        assertScan(u'"z\\ud834\udd20x12345"', u'z%sx12345' % surrogates)
+        assertScan('"z\\udd20x"', u'z\udd20x')
+        assertScan(u'"z\ud834\udd20x"', u'z\ud834\udd20x')
+        assertScan(u'"z\ud834\\udd20x"', u'z%sx' % surrogates)
+        assertScan(u'"z\ud834x"', u'z\ud834x')
+
+    def test_bad_escapes(self):
+        scanstring = self.json.decoder.scanstring
+        bad_escapes = [
+            '"\\"',
+            '"\\x"',
+            '"\\u"',
+            '"\\u0"',
+            '"\\u01"',
+            '"\\u012"',
+            '"\\uz012"',
+            '"\\u0z12"',
+            '"\\u01z2"',
+            '"\\u012z"',
+            '"\\u0x12"',
+            '"\\u0X12"',
+            '"\\ud834\\"',
+            '"\\ud834\\u"',
+            '"\\ud834\\ud"',
+            '"\\ud834\\udd"',
+            '"\\ud834\\udd2"',
+            '"\\ud834\\uzdd2"',
+            '"\\ud834\\udzd2"',
+            '"\\ud834\\uddz2"',
+            '"\\ud834\\udd2z"',
+            '"\\ud834\\u0x20"',
+            '"\\ud834\\u0X20"',
+        ]
+        for s in bad_escapes:
+            with self.assertRaises(ValueError):
+                scanstring(s, 1, None, True)
+
     def test_issue3623(self):
         self.assertRaises(ValueError, self.json.decoder.scanstring, b"xxx", 1,
                           "xxx")
diff --git a/Modules/_json.c b/Modules/_json.c
index 7c925fd..56d9ee4 100644
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -524,16 +524,10 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
             }
 #ifdef Py_UNICODE_WIDE
             /* Surrogate pair */
-            if ((c & 0xfc00) == 0xd800) {
+            if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
+                buf[next++] == '\\' &&
+                buf[next++] == 'u') {
                 Py_UNICODE c2 = 0;
-                if (end + 6 >= len) {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
-                if (buf[next++] != '\\' || buf[next++] != 'u') {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
                 end += 6;
                 /* Decode 4 hex digits */
                 for (; next < end; next++) {
@@ -554,15 +548,10 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
                             goto bail;
                     }
                 }
-                if ((c2 & 0xfc00) != 0xdc00) {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
-                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
-            }
-            else if ((c & 0xfc00) == 0xdc00) {
-                raise_errmsg("Unpaired low surrogate", pystr, end - 5);
-                goto bail;
+                if ((c2 & 0xfc00) == 0xdc00)
+                    c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+                else
+                    end -= 6;
             }
 #endif
         }
@@ -703,16 +692,9 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
             }
 #ifdef Py_UNICODE_WIDE
             /* Surrogate pair */
-            if ((c & 0xfc00) == 0xd800) {
+            if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
+                buf[next++] == '\\' && buf[next++] == 'u') {
                 Py_UNICODE c2 = 0;
-                if (end + 6 >= len) {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
-                if (buf[next++] != '\\' || buf[next++] != 'u') {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
                 end += 6;
                 /* Decode 4 hex digits */
                 for (; next < end; next++) {
@@ -733,15 +715,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
                             goto bail;
                     }
                 }
-                if ((c2 & 0xfc00) != 0xdc00) {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
-                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
-            }
-            else if ((c & 0xfc00) == 0xdc00) {
-                raise_errmsg("Unpaired low surrogate", pystr, end - 5);
-                goto bail;
+                if ((c2 & 0xfc00) == 0xdc00)
+                    c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+                else
+                    end -= 6;
             }
 #endif
         }
--
2.5.5