Blame SOURCES/00324-disallow-control-chars-in-http-urls.patch

925e6b
diff --git a/Lib/httplib.py b/Lib/httplib.py
925e6b
index da2f346..fc8e895 100644
925e6b
--- a/Lib/httplib.py
925e6b
+++ b/Lib/httplib.py
925e6b
@@ -247,6 +247,15 @@ _MAXHEADERS = 100
925e6b
 _is_legal_header_name = re.compile(r'\A[^:\s][^:\r\n]*\Z').match
925e6b
 _is_illegal_header_value = re.compile(r'\n(?![ \t])|\r(?![ \t\n])').search
925e6b
 
925e6b
+# These characters are not allowed within HTTP URL paths.
925e6b
+#  See https://tools.ietf.org/html/rfc3986#section-3.3 and the
925e6b
+#  https://tools.ietf.org/html/rfc3986#appendix-A pchar definition.
925e6b
+# Prevents CVE-2019-9740.  Includes control characters such as \r\n.
925e6b
+# Restrict non-ASCII characters above \x7f (0x80-0xff).
925e6b
+_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f-\xff]')
925e6b
+# Arguably only these _should_ be allowed:
925e6b
+#  _is_allowed_url_pchars_re = re.compile(r"^[/!$&'()*+,;=:@%a-zA-Z0-9._~-]+$")
925e6b
+# We are more lenient for assumed real world compatibility purposes.
925e6b
 
925e6b
 class HTTPMessage(mimetools.Message):
925e6b
 
925e6b
@@ -926,6 +935,12 @@ class HTTPConnection:
925e6b
         self._method = method
925e6b
         if not url:
925e6b
             url = '/'
925e6b
+        # Prevent CVE-2019-9740.
925e6b
+        match = _contains_disallowed_url_pchar_re.search(url)
925e6b
+        if match:
925e6b
+            raise InvalidURL("URL can't contain control characters. %r "
925e6b
+                             "(found at least %r)"
925e6b
+                             % (url, match.group()))
925e6b
         hdr = '%s %s %s' % (method, url, self._http_vsn_str)
925e6b
 
925e6b
         self._output(hdr)
925e6b
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
925e6b
index 3845012..d2da0f8 100644
925e6b
--- a/Lib/test/test_urllib.py
925e6b
+++ b/Lib/test/test_urllib.py
925e6b
@@ -198,6 +198,31 @@ class urlopen_HttpTests(unittest.TestCase, FakeHTTPMixin):
925e6b
         finally:
925e6b
             self.unfakehttp()
925e6b
 
925e6b
+    def test_url_with_control_char_rejected(self):
925e6b
+        for char_no in range(0, 0x21) + range(0x7f, 0x100):
925e6b
+            char = chr(char_no)
925e6b
+            schemeless_url = "//localhost:7777/test%s/" % char
925e6b
+            self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.")
925e6b
+            try:
925e6b
+                # urllib quotes the URL so there is no injection.
925e6b
+                resp = urllib.urlopen("http:" + schemeless_url)
925e6b
+                self.assertNotIn(char, resp.geturl())
925e6b
+            finally:
925e6b
+                self.unfakehttp()
925e6b
+
925e6b
+    def test_url_with_newline_header_injection_rejected(self):
925e6b
+        self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.")
925e6b
+        host = "localhost:7777?a=1 HTTP/1.1\r\nX-injected: header\r\nTEST: 123"
925e6b
+        schemeless_url = "//" + host + ":8080/test/?test=a"
925e6b
+        try:
925e6b
+            # urllib quotes the URL so there is no injection.
925e6b
+            resp = urllib.urlopen("http:" + schemeless_url)
925e6b
+            self.assertNotIn(' ', resp.geturl())
925e6b
+            self.assertNotIn('\r', resp.geturl())
925e6b
+            self.assertNotIn('\n', resp.geturl())
925e6b
+        finally:
925e6b
+            self.unfakehttp()
925e6b
+
925e6b
     def test_read_bogus(self):
925e6b
         # urlopen() should raise IOError for many error codes.
925e6b
         self.fakehttp('''HTTP/1.1 401 Authentication Required
925e6b
@@ -786,6 +811,35 @@ class Pathname_Tests(unittest.TestCase):
925e6b
 class Utility_Tests(unittest.TestCase):
925e6b
     """Testcase to test the various utility functions in the urllib."""
925e6b
 
925e6b
+    def test_splithost(self):
925e6b
+        splithost = urllib.splithost
925e6b
+        self.assertEqual(splithost('//www.example.org:80/foo/bar/baz.html'),
925e6b
+                         ('www.example.org:80', '/foo/bar/baz.html'))
925e6b
+        self.assertEqual(splithost('//www.example.org:80'),
925e6b
+                         ('www.example.org:80', ''))
925e6b
+        self.assertEqual(splithost('/foo/bar/baz.html'),
925e6b
+                         (None, '/foo/bar/baz.html'))
925e6b
+
925e6b
+        # bpo-30500: # starts a fragment.
925e6b
+        self.assertEqual(splithost('//127.0.0.1#@host.com'),
925e6b
+                         ('127.0.0.1', '/#@host.com'))
925e6b
+        self.assertEqual(splithost('//127.0.0.1#@host.com:80'),
925e6b
+                         ('127.0.0.1', '/#@host.com:80'))
925e6b
+        self.assertEqual(splithost('//127.0.0.1:80#@host.com'),
925e6b
+                         ('127.0.0.1:80', '/#@host.com'))
925e6b
+
925e6b
+        # Empty host is returned as empty string.
925e6b
+        self.assertEqual(splithost("///file"),
925e6b
+                         ('', '/file'))
925e6b
+
925e6b
+        # Trailing semicolon, question mark and hash symbol are kept.
925e6b
+        self.assertEqual(splithost("//example.net/file;"),
925e6b
+                         ('example.net', '/file;'))
925e6b
+        self.assertEqual(splithost("//example.net/file?"),
925e6b
+                         ('example.net', '/file?'))
925e6b
+        self.assertEqual(splithost("//example.net/file#"),
925e6b
+                         ('example.net', '/file#'))
925e6b
+
925e6b
     def test_splitpasswd(self):
925e6b
         """Some of the password examples are not sensible, but it is added to
925e6b
         confirming to RFC2617 and addressing issue4675.
925e6b
diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py
925e6b
index c317b8d..63fefd6 100644
925e6b
--- a/Lib/test/test_urllib2.py
925e6b
+++ b/Lib/test/test_urllib2.py
925e6b
@@ -7,12 +7,16 @@ import StringIO
925e6b
 
925e6b
 import urllib2
925e6b
 from urllib2 import Request, OpenerDirector
925e6b
+import httplib
925e6b
 
925e6b
 try:
925e6b
     import ssl
925e6b
 except ImportError:
925e6b
     ssl = None
925e6b
 
925e6b
+from test.test_urllib import FakeHTTPMixin
925e6b
+
925e6b
+
925e6b
 # XXX
925e6b
 # Request
925e6b
 # CacheFTPHandler (hard to write)
925e6b
@@ -1243,7 +1247,7 @@ class HandlerTests(unittest.TestCase):
925e6b
         self.assertEqual(len(http_handler.requests), 1)
925e6b
         self.assertFalse(http_handler.requests[0].has_header(auth_header))
925e6b
 
925e6b
-class MiscTests(unittest.TestCase):
925e6b
+class MiscTests(unittest.TestCase, FakeHTTPMixin):
925e6b
 
925e6b
     def test_build_opener(self):
925e6b
         class MyHTTPHandler(urllib2.HTTPHandler): pass
925e6b
@@ -1289,6 +1293,53 @@ class MiscTests(unittest.TestCase):
925e6b
         else:
925e6b
             self.assertTrue(False)
925e6b
 
925e6b
+    @unittest.skipUnless(ssl, "ssl module required")
925e6b
+    def test_url_with_control_char_rejected(self):
925e6b
+        for char_no in range(0, 0x21) + range(0x7f, 0x100):
925e6b
+            char = chr(char_no)
925e6b
+            schemeless_url = "//localhost:7777/test%s/" % char
925e6b
+            self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.")
925e6b
+            try:
925e6b
+                # We explicitly test urllib2.urlopen() instead of the top
925e6b
+                # level 'def urlopen()' function defined in this... (quite ugly)
925e6b
+                # test suite.  They use different url opening codepaths.  Plain
925e6b
+                # urlopen uses FancyURLopener which goes via a codepath that
925e6b
+                # calls urllib.quote() on the URL which makes all of the
925e6b
+                # above attempts at injection within the url _path_ safe.
925e6b
+                escaped_char_repr = repr(char).replace('\\', r'\\')
925e6b
+                InvalidURL = httplib.InvalidURL
925e6b
+                with self.assertRaisesRegexp(
925e6b
+                    InvalidURL, "contain control.*" + escaped_char_repr):
925e6b
+                    urllib2.urlopen("http:" + schemeless_url)
925e6b
+                with self.assertRaisesRegexp(
925e6b
+                    InvalidURL, "contain control.*" + escaped_char_repr):
925e6b
+                    urllib2.urlopen("https:" + schemeless_url)
925e6b
+            finally:
925e6b
+                self.unfakehttp()
925e6b
+
925e6b
+    @unittest.skipUnless(ssl, "ssl module required")
925e6b
+    def test_url_with_newline_header_injection_rejected(self):
925e6b
+        self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.")
925e6b
+        host = "localhost:7777?a=1 HTTP/1.1\r\nX-injected: header\r\nTEST: 123"
925e6b
+        schemeless_url = "//" + host + ":8080/test/?test=a"
925e6b
+        try:
925e6b
+            # We explicitly test urllib2.urlopen() instead of the top
925e6b
+            # level 'def urlopen()' function defined in this... (quite ugly)
925e6b
+            # test suite.  They use different url opening codepaths.  Plain
925e6b
+            # urlopen uses FancyURLopener which goes via a codepath that
925e6b
+            # calls urllib.quote() on the URL which makes all of the
925e6b
+            # above attempts at injection within the url _path_ safe.
925e6b
+            InvalidURL = httplib.InvalidURL
925e6b
+            with self.assertRaisesRegexp(
925e6b
+                InvalidURL, r"contain control.*\\r.*(found at least . .)"):
925e6b
+                urllib2.urlopen("http:" + schemeless_url)
925e6b
+            with self.assertRaisesRegexp(InvalidURL, r"contain control.*\\n"):
925e6b
+                urllib2.urlopen("https:" + schemeless_url)
925e6b
+        finally:
925e6b
+            self.unfakehttp()
925e6b
+
925e6b
+
925e6b
+
925e6b
 class RequestTests(unittest.TestCase):
925e6b
 
925e6b
     def setUp(self):
925e6b
diff --git a/Lib/test/test_xmlrpc.py b/Lib/test/test_xmlrpc.py
925e6b
index 79e862a..347b494 100644
925e6b
--- a/Lib/test/test_xmlrpc.py
925e6b
+++ b/Lib/test/test_xmlrpc.py
925e6b
@@ -592,7 +592,13 @@ class SimpleServerTestCase(BaseServerTestCase):
925e6b
     def test_partial_post(self):
925e6b
         # Check that a partial POST doesn't make the server loop: issue #14001.
925e6b
         conn = httplib.HTTPConnection(ADDR, PORT)
925e6b
-        conn.request('POST', '/RPC2 HTTP/1.0\r\nContent-Length: 100\r\n\r\nbye')
925e6b
+        conn.send('POST /RPC2 HTTP/1.0\r\n'
925e6b
+                  'Content-Length: 100\r\n\r\n'
925e6b
+                  'bye HTTP/1.1\r\n'
925e6b
+                  'Host: %s:%s\r\n'
925e6b
+                  'Accept-Encoding: identity\r\n'
925e6b
+                  'Content-Length: 0\r\n\r\n'
925e6b
+                  % (ADDR, PORT))
925e6b
         conn.close()
925e6b
 
925e6b
 class MultiPathServerTestCase(BaseServerTestCase):
925e6b
diff --git a/Lib/urllib.py b/Lib/urllib.py
925e6b
index 9b31df1..2201e3e 100644
925e6b
--- a/Lib/urllib.py
925e6b
+++ b/Lib/urllib.py
925e6b
@@ -1079,8 +1079,7 @@ def splithost(url):
925e6b
     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
925e6b
     global _hostprog
925e6b
     if _hostprog is None:
925e6b
-        import re
925e6b
-        _hostprog = re.compile('^//([^/?]*)(.*)$')
925e6b
+        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
925e6b
 
925e6b
     match = _hostprog.match(url)
925e6b
     if match: