diff --git a/Lib/httplib.py b/Lib/httplib.py index da2f346..fc8e895 100644 --- a/Lib/httplib.py +++ b/Lib/httplib.py @@ -247,6 +247,15 @@ _MAXHEADERS = 100 _is_legal_header_name = re.compile(r'\A[^:\s][^:\r\n]*\Z').match _is_illegal_header_value = re.compile(r'\n(?![ \t])|\r(?![ \t\n])').search +# These characters are not allowed within HTTP URL paths. +# See https://tools.ietf.org/html/rfc3986#section-3.3 and the +# https://tools.ietf.org/html/rfc3986#appendix-A pchar definition. +# Prevents CVE-2019-9740. Includes control characters such as \r\n. +# Restrict non-ASCII characters above \x7f (0x80-0xff). +_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f-\xff]') +# Arguably only these _should_ allowed: +# _is_allowed_url_pchars_re = re.compile(r"^[/!$&'()*+,;=:@%a-zA-Z0-9._~-]+$") +# We are more lenient for assumed real world compatibility purposes. class HTTPMessage(mimetools.Message): @@ -926,6 +935,12 @@ class HTTPConnection: self._method = method if not url: url = '/' + # Prevent CVE-2019-9740. + match = _contains_disallowed_url_pchar_re.search(url) + if match: + raise InvalidURL("URL can't contain control characters. %r " + "(found at least %r)" + % (url, match.group())) hdr = '%s %s %s' % (method, url, self._http_vsn_str) self._output(hdr) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 3845012..d2da0f8 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -198,6 +198,31 @@ class urlopen_HttpTests(unittest.TestCase, FakeHTTPMixin): finally: self.unfakehttp() + def test_url_with_control_char_rejected(self): + for char_no in range(0, 0x21) + range(0x7f, 0x100): + char = chr(char_no) + schemeless_url = "//localhost:7777/test%s/" % char + self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.") + try: + # urllib quotes the URL so there is no injection. + resp = urllib.urlopen("http:" + schemeless_url) + self.assertNotIn(char, resp.geturl()) + finally: + self.unfakehttp() + + def test_url_with_newline_header_injection_rejected(self): + self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.") + host = "localhost:7777?a=1 HTTP/1.1\r\nX-injected: header\r\nTEST: 123" + schemeless_url = "//" + host + ":8080/test/?test=a" + try: + # urllib quotes the URL so there is no injection. + resp = urllib.urlopen("http:" + schemeless_url) + self.assertNotIn(' ', resp.geturl()) + self.assertNotIn('\r', resp.geturl()) + self.assertNotIn('\n', resp.geturl()) + finally: + self.unfakehttp() + def test_read_bogus(self): # urlopen() should raise IOError for many error codes. self.fakehttp('''HTTP/1.1 401 Authentication Required @@ -786,6 +811,35 @@ class Pathname_Tests(unittest.TestCase): class Utility_Tests(unittest.TestCase): """Testcase to test the various utility functions in the urllib.""" + def test_splithost(self): + splithost = urllib.splithost + self.assertEqual(splithost('//www.example.org:80/foo/bar/baz.html'), + ('www.example.org:80', '/foo/bar/baz.html')) + self.assertEqual(splithost('//www.example.org:80'), + ('www.example.org:80', '')) + self.assertEqual(splithost('/foo/bar/baz.html'), + (None, '/foo/bar/baz.html')) + + # bpo-30500: # starts a fragment. + self.assertEqual(splithost('//127.0.0.1#@host.com'), + ('127.0.0.1', '/#@host.com')) + self.assertEqual(splithost('//127.0.0.1#@host.com:80'), + ('127.0.0.1', '/#@host.com:80')) + self.assertEqual(splithost('//127.0.0.1:80#@host.com'), + ('127.0.0.1:80', '/#@host.com')) + + # Empty host is returned as empty string. + self.assertEqual(splithost("///file"), + ('', '/file')) + + # Trailing semicolon, question mark and hash symbol are kept. + self.assertEqual(splithost("//example.net/file;"), + ('example.net', '/file;')) + self.assertEqual(splithost("//example.net/file?"), + ('example.net', '/file?')) + self.assertEqual(splithost("//example.net/file#"), + ('example.net', '/file#')) + def test_splitpasswd(self): """Some of the password examples are not sensible, but it is added to confirming to RFC2617 and addressing issue4675. diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index c317b8d..63fefd6 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -7,12 +7,16 @@ import StringIO import urllib2 from urllib2 import Request, OpenerDirector +import httplib try: import ssl except ImportError: ssl = None +from test.test_urllib import FakeHTTPMixin + + # XXX # Request # CacheFTPHandler (hard to write) @@ -1243,7 +1247,7 @@ class HandlerTests(unittest.TestCase): self.assertEqual(len(http_handler.requests), 1) self.assertFalse(http_handler.requests[0].has_header(auth_header)) -class MiscTests(unittest.TestCase): +class MiscTests(unittest.TestCase, FakeHTTPMixin): def test_build_opener(self): class MyHTTPHandler(urllib2.HTTPHandler): pass @@ -1289,6 +1293,53 @@ class MiscTests(unittest.TestCase): else: self.assertTrue(False) + @unittest.skipUnless(ssl, "ssl module required") + def test_url_with_control_char_rejected(self): + for char_no in range(0, 0x21) + range(0x7f, 0x100): + char = chr(char_no) + schemeless_url = "//localhost:7777/test%s/" % char + self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.") + try: + # We explicitly test urllib.request.urlopen() instead of the top + # level 'def urlopen()' function defined in this... (quite ugly) + # test suite. They use different url opening codepaths. Plain + # urlopen uses FancyURLOpener which goes via a codepath that + # calls urllib.parse.quote() on the URL which makes all of the + # above attempts at injection within the url _path_ safe. + escaped_char_repr = repr(char).replace('\\', r'\\') + InvalidURL = httplib.InvalidURL + with self.assertRaisesRegexp( + InvalidURL, "contain control.*" + escaped_char_repr): + urllib2.urlopen("http:" + schemeless_url) + with self.assertRaisesRegexp( + InvalidURL, "contain control.*" + escaped_char_repr): + urllib2.urlopen("https:" + schemeless_url) + finally: + self.unfakehttp() + + @unittest.skipUnless(ssl, "ssl module required") + def test_url_with_newline_header_injection_rejected(self): + self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.") + host = "localhost:7777?a=1 HTTP/1.1\r\nX-injected: header\r\nTEST: 123" + schemeless_url = "//" + host + ":8080/test/?test=a" + try: + # We explicitly test urllib2.urlopen() instead of the top + # level 'def urlopen()' function defined in this... (quite ugly) + # test suite. They use different url opening codepaths. Plain + # urlopen uses FancyURLOpener which goes via a codepath that + # calls urllib.parse.quote() on the URL which makes all of the + # above attempts at injection within the url _path_ safe. + InvalidURL = httplib.InvalidURL + with self.assertRaisesRegexp( + InvalidURL, r"contain control.*\\r.*(found at least . .)"): + urllib2.urlopen("http:" + schemeless_url) + with self.assertRaisesRegexp(InvalidURL, r"contain control.*\\n"): + urllib2.urlopen("https:" + schemeless_url) + finally: + self.unfakehttp() + + + class RequestTests(unittest.TestCase): def setUp(self): diff --git a/Lib/test/test_xmlrpc.py b/Lib/test/test_xmlrpc.py index 79e862a..347b494 100644 --- a/Lib/test/test_xmlrpc.py +++ b/Lib/test/test_xmlrpc.py @@ -592,7 +592,13 @@ class SimpleServerTestCase(BaseServerTestCase): def test_partial_post(self): # Check that a partial POST doesn't make the server loop: issue #14001. conn = httplib.HTTPConnection(ADDR, PORT) - conn.request('POST', '/RPC2 HTTP/1.0\r\nContent-Length: 100\r\n\r\nbye') + conn.send('POST /RPC2 HTTP/1.0\r\n' + 'Content-Length: 100\r\n\r\n' + 'bye HTTP/1.1\r\n' + 'Host: %s:%s\r\n' + 'Accept-Encoding: identity\r\n' + 'Content-Length: 0\r\n\r\n' + % (ADDR, PORT)) conn.close() class MultiPathServerTestCase(BaseServerTestCase): diff --git a/Lib/urllib.py b/Lib/urllib.py index 9b31df1..2201e3e 100644 --- a/Lib/urllib.py +++ b/Lib/urllib.py @@ -1079,8 +1079,7 @@ def splithost(url): """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" global _hostprog if _hostprog is None: - import re - _hostprog = re.compile('^//([^/?]*)(.*)$') + _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL) match = _hostprog.match(url) if match: