An interpreted, interactive, object-oriented programming language
CentOS Sources
2017-08-01 71084d584ff953f5463757ec6536406320560b4d
commit | author | age
04a680 1 From 90986ef48c0df602ab38aa831a24e99e9ed61e7e Mon Sep 17 00:00:00 2001
CS 2 From: Charalampos Stratakis <cstratak@redhat.com>
3 Date: Mon, 4 Apr 2016 15:55:28 +0200
4 Subject: [PATCH] JSON decoder now accepts lone surrogates
5
6 ---
7  Lib/json/decoder.py               | 35 ++++++++++++------------
8  Lib/json/tests/test_scanstring.py | 56 ++++++++++++++++++++++++++++++++++++---
9  Modules/_json.c                   | 49 +++++++++-------------------------
10  3 files changed, 83 insertions(+), 57 deletions(-)
11
12 diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
13 index dfcc628..1b43238 100644
14 --- a/Lib/json/decoder.py
15 +++ b/Lib/json/decoder.py
16 @@ -62,6 +62,16 @@ BACKSLASH = {
17
18  DEFAULT_ENCODING = "utf-8"
19
20 +def _decode_uXXXX(s, pos):
21 +    esc = s[pos + 1:pos + 5]
22 +    if len(esc) == 4 and esc[1] not in 'xX':
23 +        try:
24 +            return int(esc, 16)
25 +        except ValueError:
26 +            pass
27 +    msg = "Invalid \\uXXXX escape"
28 +    raise ValueError(errmsg(msg, s, pos))
29 +
30  def py_scanstring(s, end, encoding=None, strict=True,
31          _b=BACKSLASH, _m=STRINGCHUNK.match):
32      """Scan the string s for a JSON string. End is the index of the
33 @@ -116,25 +126,16 @@ def py_scanstring(s, end, encoding=None, strict=True,
34              end += 1
35          else:
36              # Unicode escape sequence
37 -            esc = s[end + 1:end + 5]
38 -            next_end = end + 5
39 -            if len(esc) != 4:
40 -                msg = "Invalid \\uXXXX escape"
41 -                raise ValueError(errmsg(msg, s, end))
42 -            uni = int(esc, 16)
43 +            uni = _decode_uXXXX(s, end)
44 +            end += 5
45              # Check for surrogate pair on UCS-4 systems
46 -            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
47 -                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
48 -                if not s[end + 5:end + 7] == '\\u':
49 -                    raise ValueError(errmsg(msg, s, end))
50 -                esc2 = s[end + 7:end + 11]
51 -                if len(esc2) != 4:
52 -                    raise ValueError(errmsg(msg, s, end))
53 -                uni2 = int(esc2, 16)
54 -                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
55 -                next_end += 6
56 +            if sys.maxunicode > 65535 and \
57 +               0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
58 +                uni2 = _decode_uXXXX(s, end + 1)
59 +                if 0xdc00 <= uni2 <= 0xdfff:
60 +                    uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
61 +                    end += 6
62              char = unichr(uni)
63 -            end = next_end
64          # Append the unescaped character
65          _append(char)
66      return u''.join(chunks), end
67 diff --git a/Lib/json/tests/test_scanstring.py b/Lib/json/tests/test_scanstring.py
68 index 4fef8cb..ed80a41 100644
69 --- a/Lib/json/tests/test_scanstring.py
70 +++ b/Lib/json/tests/test_scanstring.py
71 @@ -5,10 +5,6 @@ from json.tests import PyTest, CTest
72  class TestScanstring(object):
73      def test_scanstring(self):
74          scanstring = self.json.decoder.scanstring
75 -        self.assertEqual(
76 -            scanstring('"z\\ud834\\udd20x"', 1, None, True),
77 -            (u'z\U0001d120x', 16))
78 -
79          if sys.maxunicode == 65535:
80              self.assertEqual(
81                  scanstring(u'"z\U0001d120x"', 1, None, True),
82 @@ -94,6 +90,58 @@ class TestScanstring(object):
83              scanstring('["Bad value", truth]', 2, None, True),
84              (u'Bad value', 12))
85
86 +    def test_surrogates(self):
87 +        scanstring = self.json.decoder.scanstring
88 +        def assertScan(given, expect):
89 +            self.assertEqual(scanstring(given, 1, None, True),
90 +                             (expect, len(given)))
91 +            if not isinstance(given, unicode):
92 +                given = unicode(given)
93 +                self.assertEqual(scanstring(given, 1, None, True),
94 +                                 (expect, len(given)))
95 +
96 +        surrogates = unichr(0xd834) + unichr(0xdd20)
97 +        assertScan('"z\\ud834\\u0079x"', u'z\ud834yx')
98 +        assertScan('"z\\ud834\\udd20x"', u'z\U0001d120x')
99 +        assertScan('"z\\ud834\\ud834\\udd20x"', u'z\ud834\U0001d120x')
100 +        assertScan('"z\\ud834x"', u'z\ud834x')
101 +        assertScan(u'"z\\ud834\udd20x12345"', u'z%sx12345' % surrogates)
102 +        assertScan('"z\\udd20x"', u'z\udd20x')
103 +        assertScan(u'"z\ud834\udd20x"', u'z\ud834\udd20x')
104 +        assertScan(u'"z\ud834\\udd20x"', u'z%sx' % surrogates)
105 +        assertScan(u'"z\ud834x"', u'z\ud834x')
106 +
107 +    def test_bad_escapes(self):
108 +        scanstring = self.json.decoder.scanstring
109 +        bad_escapes = [
110 +            '"\\"',
111 +            '"\\x"',
112 +            '"\\u"',
113 +            '"\\u0"',
114 +            '"\\u01"',
115 +            '"\\u012"',
116 +            '"\\uz012"',
117 +            '"\\u0z12"',
118 +            '"\\u01z2"',
119 +            '"\\u012z"',
120 +            '"\\u0x12"',
121 +            '"\\u0X12"',
122 +            '"\\ud834\\"',
123 +            '"\\ud834\\u"',
124 +            '"\\ud834\\ud"',
125 +            '"\\ud834\\udd"',
126 +            '"\\ud834\\udd2"',
127 +            '"\\ud834\\uzdd2"',
128 +            '"\\ud834\\udzd2"',
129 +            '"\\ud834\\uddz2"',
130 +            '"\\ud834\\udd2z"',
131 +            '"\\ud834\\u0x20"',
132 +            '"\\ud834\\u0X20"',
133 +        ]
134 +        for s in bad_escapes:
135 +            with self.assertRaises(ValueError):
136 +                scanstring(s, 1, None, True)
137 +
138      def test_issue3623(self):
139          self.assertRaises(ValueError, self.json.decoder.scanstring, b"xxx", 1,
140                            "xxx")
141 diff --git a/Modules/_json.c b/Modules/_json.c
142 index 7c925fd..56d9ee4 100644
143 --- a/Modules/_json.c
144 +++ b/Modules/_json.c
145 @@ -524,16 +524,10 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
146              }
147  #ifdef Py_UNICODE_WIDE
148              /* Surrogate pair */
149 -            if ((c & 0xfc00) == 0xd800) {
150 +            if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
151 +                buf[next++] == '\\' &&
152 +                buf[next++] == 'u') {
153                  Py_UNICODE c2 = 0;
154 -                if (end + 6 >= len) {
155 -                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
156 -                    goto bail;
157 -                }
158 -                if (buf[next++] != '\\' || buf[next++] != 'u') {
159 -                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
160 -                    goto bail;
161 -                }
162                  end += 6;
163                  /* Decode 4 hex digits */
164                  for (; next < end; next++) {
165 @@ -554,15 +548,10 @@ scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_s
166                              goto bail;
167                      }
168                  }
169 -                if ((c2 & 0xfc00) != 0xdc00) {
170 -                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
171 -                    goto bail;
172 -                }
173 -                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
174 -            }
175 -            else if ((c & 0xfc00) == 0xdc00) {
176 -                raise_errmsg("Unpaired low surrogate", pystr, end - 5);
177 -                goto bail;
178 +                if ((c2 & 0xfc00) == 0xdc00)
179 +                    c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
180 +                else
181 +                    end -= 6;
182              }
183  #endif
184          }
185 @@ -703,16 +692,9 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
186              }
187  #ifdef Py_UNICODE_WIDE
188              /* Surrogate pair */
189 -            if ((c & 0xfc00) == 0xd800) {
190 +            if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
191 +                buf[next++] == '\\' && buf[next++] == 'u') {
192                  Py_UNICODE c2 = 0;
193 -                if (end + 6 >= len) {
194 -                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
195 -                    goto bail;
196 -                }
197 -                if (buf[next++] != '\\' || buf[next++] != 'u') {
198 -                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
199 -                    goto bail;
200 -                }
201                  end += 6;
202                  /* Decode 4 hex digits */
203                  for (; next < end; next++) {
204 @@ -733,15 +715,10 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
205                              goto bail;
206                      }
207                  }
208 -                if ((c2 & 0xfc00) != 0xdc00) {
209 -                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
210 -                    goto bail;
211 -                }
212 -                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
213 -            }
214 -            else if ((c & 0xfc00) == 0xdc00) {
215 -                raise_errmsg("Unpaired low surrogate", pystr, end - 5);
216 -                goto bail;
217 +                if ((c2 & 0xfc00) == 0xdc00)
218 +                    c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
219 +                else
220 +                    end -= 6;
221              }
222  #endif
223          }
224 --
225 2.5.5
226