Blob Blame History Raw
From c983b1ee6471d9835e94ff153dd40b6e7c1fef86 Mon Sep 17 00:00:00 2001
From: Lumir Balhar <lbalhar@redhat.com>
Date: Thu, 1 Apr 2021 08:18:07 +0200
Subject: [PATCH] CVE-2021-23336: Add `separator` argument to parse_qs; warn
 with default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Partially backports https://bugs.python.org/issue42967 : [security] Address a web cache-poisoning issue reported in urllib.parse.parse_qsl().
However, this solution is different than the upstream solution in Python 3.6.13.

An optional argument seperator is added to specify the separator.
It is recommended to set it to '&' or ';' to match the application or proxy in use.
The default can be set with an env variable of a config file.
If neither the argument, env var or config file specifies a separator, "&" is used
but a warning is raised if parse_qs is used on input that contains ';'.

Co-authors of the upstream change (who do not necessarily agree with this):
Co-authored-by: Adam Goldschmidt <adamgold7@gmail.com>
Co-authored-by: Ken Jin <28750310+Fidget-Spinner@users.noreply.github.com>
Co-authored-by: Éric Araujo <merwok@netwok.org>
---
 Doc/library/cgi.rst          |   2 +-
 Doc/library/urllib.parse.rst |  12 +++--
 Lib/cgi.py                   |   4 +-
 Lib/test/test_cgi.py         |  29 ++++++++++
 Lib/test/test_urlparse.py    | 102 ++++++++++++++++++++++++++++++++---
 Lib/urllib/parse.py          |  78 ++++++++++++++++++++++++---
 6 files changed, 209 insertions(+), 18 deletions(-)

diff --git a/Doc/library/cgi.rst b/Doc/library/cgi.rst
index 880074b..d8a6dc1 100644
--- a/Doc/library/cgi.rst
+++ b/Doc/library/cgi.rst
@@ -277,7 +277,7 @@ These are useful if you want more control, or if you want to employ some of the
 algorithms implemented in this module in other circumstances.
 
 
-.. function:: parse(fp=None, environ=os.environ, keep_blank_values=False, strict_parsing=False, separator="&")
+.. function:: parse(fp=None, environ=os.environ, keep_blank_values=False, strict_parsing=False, separator=None)
 
    Parse a query in the environment or from a file (the file defaults to
    ``sys.stdin``).  The *keep_blank_values*, *strict_parsing* and *separator* parameters are
diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst
index fcad707..9bcef69 100644
--- a/Doc/library/urllib.parse.rst
+++ b/Doc/library/urllib.parse.rst
@@ -165,7 +165,7 @@ or on combining URL components into a URL string.
       now raise :exc:`ValueError`.
 
 
-.. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None, separator='&')
+.. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None, separator=None)
 
    Parse a query string given as a string argument (data of type
    :mimetype:`application/x-www-form-urlencoded`).  Data are returned as a
@@ -191,7 +191,13 @@ or on combining URL components into a URL string.
    *max_num_fields* fields read.
 
    The optional argument *separator* is the symbol to use for separating the
-   query arguments. It defaults to ``&``.
+   query arguments. It is recommended to set it to ``'&'`` or ``';'``.
+   It defaults to ``'&'``; a warning is raised if this default is used.
+   This default may be changed with the following environment variable settings:
+
+   - ``PYTHON_URLLIB_QS_SEPARATOR='&'``: use only ``&`` as separator, without warning (as in Python 3.6.13+ or 3.10)
+   - ``PYTHON_URLLIB_QS_SEPARATOR=';'``: use only ``;`` as separator
+   - ``PYTHON_URLLIB_QS_SEPARATOR=legacy``: use both ``&`` and ``;`` (as in previous versions of Python)
 
    Use the :func:`urllib.parse.urlencode` function (with the ``doseq``
    parameter set to ``True``) to convert such dictionaries into query
@@ -236,7 +242,7 @@ or on combining URL components into a URL string.
    *max_num_fields* fields read.
 
    The optional argument *separator* is the symbol to use for separating the
-   query arguments. It defaults to ``&``.
+   query arguments. It works as in :py:func:`parse_qs`.
 
    Use the :func:`urllib.parse.urlencode` function to convert such lists of pairs into
    query strings.
diff --git a/Lib/cgi.py b/Lib/cgi.py
index 1e880e5..d7b994b 100755
--- a/Lib/cgi.py
+++ b/Lib/cgi.py
@@ -116,7 +116,7 @@ log = initlog           # The current logging function
 maxlen = 0
 
 def parse(fp=None, environ=os.environ, keep_blank_values=0,
-          strict_parsing=0, separator='&'):
+          strict_parsing=0, separator=None):
     """Parse a query in the environment or from a file (default stdin)
 
         Arguments, all optional:
@@ -319,7 +319,7 @@ class FieldStorage:
     def __init__(self, fp=None, headers=None, outerboundary=b'',
                  environ=os.environ, keep_blank_values=0, strict_parsing=0,
                  limit=None, encoding='utf-8', errors='replace',
-                 max_num_fields=None, separator='&'):
+                 max_num_fields=None, separator=None):
         """Constructor.  Read multipart/* until last part.
 
         Arguments, all optional:
diff --git a/Lib/test/test_cgi.py b/Lib/test/test_cgi.py
index 4e1506a..49b6926 100644
--- a/Lib/test/test_cgi.py
+++ b/Lib/test/test_cgi.py
@@ -180,6 +180,35 @@ Content-Length: 3
 
             env = {'QUERY_STRING': orig}
             fs = cgi.FieldStorage(environ=env)
+            if isinstance(expect, dict):
+                # test dict interface
+                self.assertEqual(len(expect), len(fs))
+                self.assertCountEqual(expect.keys(), fs.keys())
+                self.assertEqual(fs.getvalue("nonexistent field", "default"), "default")
+                # test individual fields
+                for key in expect.keys():
+                    expect_val = expect[key]
+                    self.assertIn(key, fs)
+                    if len(expect_val) > 1:
+                        self.assertEqual(fs.getvalue(key), expect_val)
+                    else:
+                        self.assertEqual(fs.getvalue(key), expect_val[0])
+
+    def test_separator(self):
+        parse_semicolon = [
+            ("x=1;y=2.0", {'x': ['1'], 'y': ['2.0']}),
+            ("x=1;y=2.0;z=2-3.%2b0", {'x': ['1'], 'y': ['2.0'], 'z': ['2-3.+0']}),
+            (";", ValueError("bad query field: ''")),
+            (";;", ValueError("bad query field: ''")),
+            ("=;a", ValueError("bad query field: 'a'")),
+            (";b=a", ValueError("bad query field: ''")),
+            ("b;=a", ValueError("bad query field: 'b'")),
+            ("a=a+b;b=b+c", {'a': ['a b'], 'b': ['b c']}),
+            ("a=a+b;a=b+a", {'a': ['a b', 'b a']}),
+        ]
+        for orig, expect in parse_semicolon:
+            env = {'QUERY_STRING': orig}
+            fs = cgi.FieldStorage(separator=';', environ=env)
             if isinstance(expect, dict):
                 # test dict interface
                 self.assertEqual(len(expect), len(fs))
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
index d2ec0da..bb64974 100644
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -2,6 +2,11 @@ import sys
 import unicodedata
 import unittest
 import urllib.parse
+from test.support import EnvironmentVarGuard
+from warnings import catch_warnings
+import tempfile
+import contextlib
+import os.path
 
 RFC1808_BASE = "http://a/b/c/d;p?q#f"
 RFC2396_BASE = "http://a/b/c/d;p?q"
@@ -32,10 +37,34 @@ parse_qsl_test_cases = [
     (b"&a=b", [(b'a', b'b')]),
     (b"a=a+b&b=b+c", [(b'a', b'a b'), (b'b', b'b c')]),
     (b"a=1&a=2", [(b'a', b'1'), (b'a', b'2')]),
+]
+
+parse_qsl_test_cases_semicolon = [
+    (";", []),
+    (";;", []),
+    (";a=b", [('a', 'b')]),
+    ("a=a+b;b=b+c", [('a', 'a b'), ('b', 'b c')]),
+    ("a=1;a=2", [('a', '1'), ('a', '2')]),
+    (b";", []),
+    (b";;", []),
+    (b";a=b", [(b'a', b'b')]),
+    (b"a=a+b;b=b+c", [(b'a', b'a b'), (b'b', b'b c')]),
+    (b"a=1;a=2", [(b'a', b'1'), (b'a', b'2')]),
+]
+
+parse_qsl_test_cases_legacy = [
+    (b"a=1;a=2&a=3", [(b'a', b'1'), (b'a', b'2'), (b'a', b'3')]),
+    (b"a=1;b=2&c=3", [(b'a', b'1'), (b'b', b'2'), (b'c', b'3')]),
+    (b"a=1&b=2&c=3;", [(b'a', b'1'), (b'b', b'2'), (b'c', b'3')]),
+]
+
+parse_qsl_test_cases_warn = [
     (";a=b", [(';a', 'b')]),
     ("a=a+b;b=b+c", [('a', 'a b;b=b c')]),
     (b";a=b", [(b';a', b'b')]),
     (b"a=a+b;b=b+c", [(b'a', b'a b;b=b c')]),
+    ("a=1;a=2&a=3", [('a', '1;a=2'), ('a', '3')]),
+    (b"a=1;a=2&a=3", [(b'a', b'1;a=2'), (b'a', b'3')]),
 ]
 
 # Each parse_qs testcase is a two-tuple that contains
@@ -62,10 +91,37 @@ parse_qs_test_cases = [
     (b"&a=b", {b'a': [b'b']}),
     (b"a=a+b&b=b+c", {b'a': [b'a b'], b'b': [b'b c']}),
     (b"a=1&a=2", {b'a': [b'1', b'2']}),
+]
+
+parse_qs_test_cases_semicolon = [
+    (";", {}),
+    (";;", {}),
+    (";a=b", {'a': ['b']}),
+    ("a=a+b;b=b+c", {'a': ['a b'], 'b': ['b c']}),
+    ("a=1;a=2", {'a': ['1', '2']}),
+    (b";", {}),
+    (b";;", {}),
+    (b";a=b", {b'a': [b'b']}),
+    (b"a=a+b;b=b+c", {b'a': [b'a b'], b'b': [b'b c']}),
+    (b"a=1;a=2", {b'a': [b'1', b'2']}),
+]
+
+parse_qs_test_cases_legacy = [
+    ("a=1;a=2&a=3", {'a': ['1', '2', '3']}),
+    ("a=1;b=2&c=3", {'a': ['1'], 'b': ['2'], 'c': ['3']}),
+    ("a=1&b=2&c=3;", {'a': ['1'], 'b': ['2'], 'c': ['3']}),
+    (b"a=1;a=2&a=3", {b'a': [b'1', b'2', b'3']}),
+    (b"a=1;b=2&c=3", {b'a': [b'1'], b'b': [b'2'], b'c': [b'3']}),
+    (b"a=1&b=2&c=3;", {b'a': [b'1'], b'b': [b'2'], b'c': [b'3']}),
+]
+
+parse_qs_test_cases_warn = [
     (";a=b", {';a': ['b']}),
     ("a=a+b;b=b+c", {'a': ['a b;b=b c']}),
     (b";a=b", {b';a': [b'b']}),
     (b"a=a+b;b=b+c", {b'a':[ b'a b;b=b c']}),
+    ("a=1;a=2&a=3", {'a': ['1;a=2', '3']}),
+    (b"a=1;a=2&a=3", {b'a': [b'1;a=2', b'3']}),
 ]
 
 class UrlParseTestCase(unittest.TestCase):
@@ -123,23 +179,57 @@ class UrlParseTestCase(unittest.TestCase):
 
     def test_qsl(self):
         for orig, expect in parse_qsl_test_cases:
-            result = urllib.parse.parse_qsl(orig, keep_blank_values=True)
+            result = urllib.parse.parse_qsl(orig, keep_blank_values=True, separator="&")
             self.assertEqual(result, expect, "Error parsing %r" % orig)
             expect_without_blanks = [v for v in expect if len(v[1])]
-            result = urllib.parse.parse_qsl(orig, keep_blank_values=False)
+            result = urllib.parse.parse_qsl(orig, keep_blank_values=False, separator="&")
             self.assertEqual(result, expect_without_blanks,
                             "Error parsing %r" % orig)
 
     def test_qs(self):
         for orig, expect in parse_qs_test_cases:
-            result = urllib.parse.parse_qs(orig, keep_blank_values=True)
+            result = urllib.parse.parse_qs(orig, keep_blank_values=True, separator="&")
             self.assertEqual(result, expect, "Error parsing %r" % orig)
             expect_without_blanks = {v: expect[v]
                                      for v in expect if len(expect[v][0])}
-            result = urllib.parse.parse_qs(orig, keep_blank_values=False)
+            result = urllib.parse.parse_qs(orig, keep_blank_values=False, separator="&")
             self.assertEqual(result, expect_without_blanks,
                             "Error parsing %r" % orig)
 
+    def test_qs_default_warn(self):
+        for orig, expect in parse_qs_test_cases_warn:
+            with self.subTest(orig=orig, expect=expect):
+                with catch_warnings(record=True) as w:
+                    result = urllib.parse.parse_qs(orig, keep_blank_values=True)
+                    self.assertEqual(result, expect, "Error parsing %r" % orig)
+                self.assertEqual(len(w), 1)
+                self.assertEqual(w[0].category, urllib.parse._QueryStringSeparatorWarning)
+
+    def test_qsl_default_warn(self):
+        for orig, expect in parse_qsl_test_cases_warn:
+            with self.subTest(orig=orig, expect=expect):
+                with catch_warnings(record=True) as w:
+                    result = urllib.parse.parse_qsl(orig, keep_blank_values=True)
+                    self.assertEqual(result, expect, "Error parsing %r" % orig)
+                self.assertEqual(len(w), 1)
+                self.assertEqual(w[0].category, urllib.parse._QueryStringSeparatorWarning)
+
+    def test_default_qs_no_warnings(self):
+        for orig, expect in parse_qs_test_cases:
+            with self.subTest(orig=orig, expect=expect):
+                with catch_warnings(record=True) as w:
+                    result = urllib.parse.parse_qs(orig, keep_blank_values=True)
+                    self.assertEqual(result, expect, "Error parsing %r" % orig)
+                self.assertEqual(len(w), 0)
+
+    def test_default_qsl_no_warnings(self):
+        for orig, expect in parse_qsl_test_cases:
+            with self.subTest(orig=orig, expect=expect):
+                with catch_warnings(record=True) as w:
+                    result = urllib.parse.parse_qsl(orig, keep_blank_values=True)
+                    self.assertEqual(result, expect, "Error parsing %r" % orig)
+                self.assertEqual(len(w), 0)
+
     def test_roundtrips(self):
         str_cases = [
             ('file:///tmp/junk.txt',
@@ -871,8 +961,8 @@ class UrlParseTestCase(unittest.TestCase):
 
     def test_parse_qsl_max_num_fields(self):
         with self.assertRaises(ValueError):
-            urllib.parse.parse_qs('&'.join(['a=a']*11), max_num_fields=10)
-        urllib.parse.parse_qs('&'.join(['a=a']*10), max_num_fields=10)
+            urllib.parse.parse_qs('&'.join(['a=a']*11), max_num_fields=10, separator='&')
+        urllib.parse.parse_qs('&'.join(['a=a']*10), max_num_fields=10, separator='&')
 
     def test_parse_qs_separator(self):
         parse_qs_semicolon_cases = [
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 36fd8fe..83638bb 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -28,6 +28,7 @@ test_urlparse.py provides a good indicator of parsing behavior.
 """
 
 import re
+import os
 import sys
 import collections
 import warnings
@@ -650,7 +651,7 @@ def unquote(string, encoding='utf-8', errors='replace'):
 
 
 def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
-             encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
+             encoding='utf-8', errors='replace', max_num_fields=None, separator=None):
     """Parse a query given as a string argument.
 
         Arguments:
@@ -690,9 +691,16 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             parsed_result[name] = [value]
     return parsed_result
 
+class _QueryStringSeparatorWarning(RuntimeWarning):
+    """Warning for using default `separator` in parse_qs or parse_qsl"""
+
+# The default "separator" for parse_qsl can be specified in a config file.
+# It's cached after first read.
+_QS_SEPARATOR_CONFIG_FILENAME = '/etc/python/urllib.cfg'
+_default_qs_separator = None
 
 def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
-              encoding='utf-8', errors='replace', max_num_fields=None, separator='&'):
+              encoding='utf-8', errors='replace', max_num_fields=None, separator=None):
     """Parse a query given as a string argument.
 
         Arguments:
@@ -721,20 +729,78 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
         Returns a list, as G-d intended.
     """
     qs, _coerce_result = _coerce_args(qs)
-    separator, _ = _coerce_args(separator)
 
-    if not separator or (not isinstance(separator, (str, bytes))):
+    if isinstance(separator, bytes):
+        separator = separator.decode('ascii')
+
+    if (not separator or (not isinstance(separator, (str, bytes)))) and separator is not None:
         raise ValueError("Separator must be of type string or bytes.")
 
+    # Used when both "&" and ";" act as separators. (Need a non-string value.)
+    _legacy = object()
+
+    if separator is None:
+        global _default_qs_separator
+        separator = _default_qs_separator
+        envvar_name = 'PYTHON_URLLIB_QS_SEPARATOR'
+        if separator is None:
+            # Set default separator from environment variable
+            separator = os.environ.get(envvar_name)
+            config_source = 'environment variable'
+        if separator is None:
+            # Set default separator from the configuration file
+            try:
+                file = open(_QS_SEPARATOR_CONFIG_FILENAME)
+            except FileNotFoundError:
+                pass
+            else:
+                with file:
+                    import configparser
+                    config = configparser.ConfigParser(
+                        interpolation=None,
+                        comment_prefixes=('#', ),
+                    )
+                    config.read_file(file)
+                    separator = config.get('parse_qs', envvar_name, fallback=None)
+                    _default_qs_separator = separator
+                config_source = _QS_SEPARATOR_CONFIG_FILENAME
+        if separator is None:
+            # The default is '&', but warn if not specified explicitly
+            if ';' in qs:
+                from warnings import warn
+                warn("The default separator of urllib.parse.parse_qsl and "
+                    + "parse_qs was changed to '&' to avoid a web cache "
+                    + "poisoning issue (CVE-2021-23336). "
+                    + "By default, semicolons no longer act as query field "
+                    + "separators. "
+                    + "See https://access.redhat.com/articles/5860431 for "
+                    + "more details.",
+                    _QueryStringSeparatorWarning, stacklevel=2)
+            separator = '&'
+        elif separator == 'legacy':
+            separator = _legacy
+        elif len(separator) != 1:
+            raise ValueError(
+                f'{envvar_name} (from {config_source}) must contain '
+                + '1 character, or "legacy". See '
+                + 'https://access.redhat.com/articles/5860431 for more details.'
+            )
+
     # If max_num_fields is defined then check that the number of fields
     # is less than max_num_fields. This prevents a memory exhaustion DOS
     # attack via post bodies with many fields.
     if max_num_fields is not None:
-        num_fields = 1 + qs.count(separator)
+        if separator is _legacy:
+            num_fields = 1 + qs.count('&') + qs.count(';')
+        else:
+            num_fields = 1 + qs.count(separator)
         if max_num_fields < num_fields:
             raise ValueError('Max number of fields exceeded')
 
-    pairs = [s1 for s1 in qs.split(separator)]
+    if separator is _legacy:
+        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
+    else:
+        pairs = [s1 for s1 in qs.split(separator)]
     r = []
     for name_value in pairs:
         if not name_value and not strict_parsing:
-- 
2.31.1