Blame SOURCES/00211-pep466-UTF-7-decoder-fix-illegal-unicode.patch

ae2451
ae2451
# HG changeset patch
ae2451
# User Serhiy Storchaka <storchaka@gmail.com>
ae2451
# Date 1382204269 -10800
ae2451
# Node ID 214c0aac7540947d88a38ff0061734547ef86710
ae2451
# Parent  c207ac413457a1b834e4b7dcf1a6836cd6e036e3
ae2451
Issue #19279: UTF-7 decoder no longer produces illegal unicode strings.
ae2451
ae2451
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
ae2451
--- a/Lib/test/test_codecs.py
ae2451
+++ b/Lib/test/test_codecs.py
ae2451
@@ -611,6 +611,35 @@ class UTF7Test(ReadTest):
ae2451
             ]
ae2451
         )
ae2451
 
ae2451
+    def test_errors(self):
ae2451
+        tests = [
ae2451
+            ('a\xffb', u'a\ufffdb'),
ae2451
+            ('a+IK', u'a\ufffd'),
ae2451
+            ('a+IK-b', u'a\ufffdb'),
ae2451
+            ('a+IK,b', u'a\ufffdb'),
ae2451
+            ('a+IKx', u'a\u20ac\ufffd'),
ae2451
+            ('a+IKx-b', u'a\u20ac\ufffdb'),
ae2451
+            ('a+IKwgr', u'a\u20ac\ufffd'),
ae2451
+            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
ae2451
+            ('a+IKwgr,', u'a\u20ac\ufffd'),
ae2451
+            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
ae2451
+            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
ae2451
+            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
ae2451
+            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
ae2451
+            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
ae2451
+            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
ae2451
+            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
ae2451
+        ]
ae2451
+        for raw, expected in tests:
ae2451
+            self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
ae2451
+                              raw, 'strict', True)
ae2451
+            self.assertEqual(raw.decode('utf-7', 'replace'), expected)
ae2451
+
ae2451
+    def test_nonbmp(self):
ae2451
+        self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
ae2451
+        self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
ae2451
+        self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
ae2451
+
ae2451
 class UTF16ExTest(unittest.TestCase):
ae2451
 
ae2451
     def test_errors(self):
ae2451
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
ae2451
--- a/Objects/unicodeobject.c
ae2451
+++ b/Objects/unicodeobject.c
ae2451
@@ -1671,6 +1671,7 @@ PyObject *PyUnicode_DecodeUTF7Stateful(c
ae2451
                                        (base64buffer >> (base64bits-16));
ae2451
                     base64bits -= 16;
ae2451
                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
ae2451
+                    assert(outCh <= 0xffff);
ae2451
                     if (surrogate) {
ae2451
                         /* expecting a second surrogate */
ae2451
                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
ae2451
@@ -1737,6 +1738,7 @@ PyObject *PyUnicode_DecodeUTF7Stateful(c
ae2451
                 inShift = 1;
ae2451
                 shiftOutStart = p;
ae2451
                 base64bits = 0;
ae2451
+                base64buffer = 0;
ae2451
             }
ae2451
         }
ae2451
         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
ae2451