Blame SOURCES/expat-2.1.0-Protect-against-malicious-namespace-declarations.patch

4cc7ea
commit f9ba0addb8e60cb72f8a8524fb4379d741cd4e7d
4cc7ea
Author: Tomas Korbar <tkorbar@redhat.com>
4cc7ea
Date:   Mon Mar 21 12:42:50 2022 +0100
4cc7ea
4cc7ea
    Protect against malicious namespace declarations
4cc7ea
4cc7ea
diff --git a/lib/xmlparse.c b/lib/xmlparse.c
4cc7ea
index 87d1a98..4bfb860 100644
4cc7ea
--- a/lib/xmlparse.c
4cc7ea
+++ b/lib/xmlparse.c
4cc7ea
@@ -679,8 +679,7 @@ XML_ParserCreate(const XML_Char *encodingName)
4cc7ea
 XML_Parser XMLCALL
4cc7ea
 XML_ParserCreateNS(const XML_Char *encodingName, XML_Char nsSep)
4cc7ea
 {
4cc7ea
-  XML_Char tmp[2];
4cc7ea
-  *tmp = nsSep;
4cc7ea
+  XML_Char tmp[2] = {nsSep, 0};
4cc7ea
   return XML_ParserCreate_MM(encodingName, NULL, tmp);
4cc7ea
 }
4cc7ea
 
4cc7ea
@@ -1044,8 +1043,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser,
4cc7ea
      would be otherwise.
4cc7ea
   */
4cc7ea
   if (ns) {
4cc7ea
-    XML_Char tmp[2];
4cc7ea
-    *tmp = namespaceSeparator;
4cc7ea
+    XML_Char tmp[2] = {parser->m_namespaceSeparator, 0};
4cc7ea
     parser = parserCreate(encodingName, &parser->m_mem, tmp, newDtd);
4cc7ea
   }
4cc7ea
   else {
4cc7ea
@@ -3173,6 +3171,117 @@ storeAtts(XML_Parser parser, const ENCODING *enc,
4cc7ea
   return XML_ERROR_NONE;
4cc7ea
 }
4cc7ea
 
4cc7ea
+static XML_Bool
4cc7ea
+is_rfc3986_uri_char(XML_Char candidate) {
4cc7ea
+  // For the RFC 3986 ANBF grammar see
4cc7ea
+  // https://datatracker.ietf.org/doc/html/rfc3986#appendix-A
4cc7ea
+
4cc7ea
+  switch (candidate) {
4cc7ea
+  // From rule "ALPHA" (uppercase half)
4cc7ea
+  case 'A':
4cc7ea
+  case 'B':
4cc7ea
+  case 'C':
4cc7ea
+  case 'D':
4cc7ea
+  case 'E':
4cc7ea
+  case 'F':
4cc7ea
+  case 'G':
4cc7ea
+  case 'H':
4cc7ea
+  case 'I':
4cc7ea
+  case 'J':
4cc7ea
+  case 'K':
4cc7ea
+  case 'L':
4cc7ea
+  case 'M':
4cc7ea
+  case 'N':
4cc7ea
+  case 'O':
4cc7ea
+  case 'P':
4cc7ea
+  case 'Q':
4cc7ea
+  case 'R':
4cc7ea
+  case 'S':
4cc7ea
+  case 'T':
4cc7ea
+  case 'U':
4cc7ea
+  case 'V':
4cc7ea
+  case 'W':
4cc7ea
+  case 'X':
4cc7ea
+  case 'Y':
4cc7ea
+  case 'Z':
4cc7ea
+
4cc7ea
+  // From rule "ALPHA" (lowercase half)
4cc7ea
+  case 'a':
4cc7ea
+  case 'b':
4cc7ea
+  case 'c':
4cc7ea
+  case 'd':
4cc7ea
+  case 'e':
4cc7ea
+  case 'f':
4cc7ea
+  case 'g':
4cc7ea
+  case 'h':
4cc7ea
+  case 'i':
4cc7ea
+  case 'j':
4cc7ea
+  case 'k':
4cc7ea
+  case 'l':
4cc7ea
+  case 'm':
4cc7ea
+  case 'n':
4cc7ea
+  case 'o':
4cc7ea
+  case 'p':
4cc7ea
+  case 'q':
4cc7ea
+  case 'r':
4cc7ea
+  case 's':
4cc7ea
+  case 't':
4cc7ea
+  case 'u':
4cc7ea
+  case 'v':
4cc7ea
+  case 'w':
4cc7ea
+  case 'x':
4cc7ea
+  case 'y':
4cc7ea
+  case 'z':
4cc7ea
+
4cc7ea
+  // From rule "DIGIT"
4cc7ea
+  case '0':
4cc7ea
+  case '1':
4cc7ea
+  case '2':
4cc7ea
+  case '3':
4cc7ea
+  case '4':
4cc7ea
+  case '5':
4cc7ea
+  case '6':
4cc7ea
+  case '7':
4cc7ea
+  case '8':
4cc7ea
+  case '9':
4cc7ea
+
4cc7ea
+  // From rule "pct-encoded"
4cc7ea
+  case '%':
4cc7ea
+
4cc7ea
+  // From rule "unreserved"
4cc7ea
+  case '-':
4cc7ea
+  case '.':
4cc7ea
+  case '_':
4cc7ea
+  case '~':
4cc7ea
+
4cc7ea
+  // From rule "gen-delims"
4cc7ea
+  case ':':
4cc7ea
+  case '/':
4cc7ea
+  case '?':
4cc7ea
+  case '#':
4cc7ea
+  case '[':
4cc7ea
+  case ']':
4cc7ea
+  case '@':
4cc7ea
+
4cc7ea
+  // From rule "sub-delims"
4cc7ea
+  case '!':
4cc7ea
+  case '$':
4cc7ea
+  case '&':
4cc7ea
+  case '\'':
4cc7ea
+  case '(':
4cc7ea
+  case ')':
4cc7ea
+  case '*':
4cc7ea
+  case '+':
4cc7ea
+  case ',':
4cc7ea
+  case ';':
4cc7ea
+  case '=':
4cc7ea
+    return XML_TRUE;
4cc7ea
+
4cc7ea
+  default:
4cc7ea
+    return XML_FALSE;
4cc7ea
+  }
4cc7ea
+}
4cc7ea
+
4cc7ea
 /* addBinding() overwrites the value of prefix->binding without checking.
4cc7ea
    Therefore one must keep track of the old value outside of addBinding().
4cc7ea
 */
4cc7ea
@@ -3233,6 +3342,29 @@ addBinding(XML_Parser parser, PREFIX *prefix, const ATTRIBUTE_ID *attId,
4cc7ea
     if (!mustBeXML && isXMLNS
4cc7ea
         && (len > xmlnsLen || uri[len] != xmlnsNamespace[len]))
4cc7ea
       isXMLNS = XML_FALSE;
4cc7ea
+
4cc7ea
+    // NOTE: While Expat does not validate namespace URIs against RFC 3986
4cc7ea
+    //       today (and is not REQUIRED to do so with regard to the XML 1.0
4cc7ea
+    //       namespaces specification) we have to at least make sure, that
4cc7ea
+    //       the application on top of Expat (that is likely splitting expanded
4cc7ea
+    //       element names ("qualified names") of form
4cc7ea
+    //       "[uri sep] local [sep prefix] '\0'" back into 1, 2 or 3 pieces
4cc7ea
+    //       in its element handler code) cannot be confused by an attacker
4cc7ea
+    //       putting additional namespace separator characters into namespace
4cc7ea
+    //       declarations.  That would be ambiguous and not to be expected.
4cc7ea
+    //
4cc7ea
+    //       While the HTML API docs of function XML_ParserCreateNS have been
4cc7ea
+    //       advising against use of a namespace separator character that can
4cc7ea
+    //       appear in a URI for >20 years now, some widespread applications
4cc7ea
+    //       are using URI characters (':' (colon) in particular) for a
4cc7ea
+    //       namespace separator, in practice.  To keep these applications
4cc7ea
+    //       functional, we only reject namespaces URIs containing the
4cc7ea
+    //       application-chosen namespace separator if the chosen separator
4cc7ea
+    //       is a non-URI character with regard to RFC 3986.
4cc7ea
+    if (parser->m_ns && (uri[len] == parser->m_namespaceSeparator)
4cc7ea
+        && ! is_rfc3986_uri_char(uri[len])) {
4cc7ea
+      return XML_ERROR_SYNTAX;
4cc7ea
+    }
4cc7ea
   }
4cc7ea
   isXML = isXML && len == xmlLen;
4cc7ea
   isXMLNS = isXMLNS && len == xmlnsLen;
4cc7ea
diff --git a/tests/runtests.c b/tests/runtests.c
4cc7ea
index b99e375..86f8b18 100644
4cc7ea
--- a/tests/runtests.c
4cc7ea
+++ b/tests/runtests.c
4cc7ea
@@ -31,6 +31,13 @@
4cc7ea
 
4cc7ea
 static XML_Parser parser;
4cc7ea
 
4cc7ea
+#ifndef UNUSED_P
4cc7ea
+# ifdef __GNUC__
4cc7ea
+#  define UNUSED_P(p) UNUSED_ ## p __attribute__((__unused__))
4cc7ea
+# else
4cc7ea
+#  define UNUSED_P(p) UNUSED_ ## p
4cc7ea
+# endif
4cc7ea
+#endif
4cc7ea
 
4cc7ea
 static void
4cc7ea
 basic_setup(void)
4cc7ea
@@ -40,6 +47,10 @@ basic_setup(void)
4cc7ea
         fail("Parser not created.");
4cc7ea
 }
4cc7ea
 
4cc7ea
+static void
4cc7ea
+dummy_end_element(void *UNUSED_P(userData), const XML_Char *UNUSED_P(name))
4cc7ea
+{}
4cc7ea
+
4cc7ea
 static void
4cc7ea
 basic_teardown(void)
4cc7ea
 {
4cc7ea
@@ -1499,6 +1510,138 @@ START_TEST(test_ns_unbound_prefix_on_element)
4cc7ea
 }
4cc7ea
 END_TEST
4cc7ea
 
4cc7ea
+START_TEST(test_ns_separator_in_uri) {
4cc7ea
+  struct test_case {
4cc7ea
+    enum XML_Status expectedStatus;
4cc7ea
+    const char *doc;
4cc7ea
+    XML_Char namesep;
4cc7ea
+  };
4cc7ea
+  struct test_case cases[] = {
4cc7ea
+      {XML_STATUS_OK, "<doc xmlns='one_two' />", '\n'},
4cc7ea
+      {XML_STATUS_ERROR, "<doc xmlns='one
two' />", '\n'},
4cc7ea
+      {XML_STATUS_OK, "<doc xmlns='one:two' />", ':'},
4cc7ea
+  };
4cc7ea
+
4cc7ea
+  size_t i = 0;
4cc7ea
+  size_t failCount = 0;
4cc7ea
+  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
4cc7ea
+    XML_Parser parser = XML_ParserCreateNS(NULL, cases[i].namesep);
4cc7ea
+    XML_SetElementHandler(parser, dummy_start_element, dummy_end_element);
4cc7ea
+    if (XML_Parse(parser, cases[i].doc, (int)strlen(cases[i].doc),
4cc7ea
+                  /*isFinal*/ XML_TRUE)
4cc7ea
+        != cases[i].expectedStatus) {
4cc7ea
+      failCount++;
4cc7ea
+    }
4cc7ea
+    XML_ParserFree(parser);
4cc7ea
+  }
4cc7ea
+
4cc7ea
+  if (failCount) {
4cc7ea
+    fail("Namespace separator handling is broken");
4cc7ea
+  }
4cc7ea
+}
4cc7ea
+END_TEST
4cc7ea
+
4cc7ea
+
4cc7ea
+START_TEST(test_utf8_in_start_tags) {
4cc7ea
+  struct test_case {
4cc7ea
+    bool goodName;
4cc7ea
+    bool goodNameStart;
4cc7ea
+    const char *tagName;
4cc7ea
+  };
4cc7ea
+
4cc7ea
+  // The idea with the tests below is this:
4cc7ea
+  // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
4cc7ea
+  // go to isNever and are hence not a concern.
4cc7ea
+  //
4cc7ea
+  // We start with a character that is a valid name character
4cc7ea
+  // (or even name-start character, see XML 1.0r4 spec) and then we flip
4cc7ea
+  // single bits at places where (1) the result leaves the UTF-8 encoding space
4cc7ea
+  // and (2) we stay in the same n-byte sequence family.
4cc7ea
+  //
4cc7ea
+  // The flipped bits are highlighted in angle brackets in comments,
4cc7ea
+  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
4cc7ea
+  // the most significant bit to 1 to leave UTF-8 encoding space.
4cc7ea
+  struct test_case cases[] = {
4cc7ea
+      // 1-byte UTF-8: [0xxx xxxx]
4cc7ea
+      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':'
4cc7ea
+      {false, false, "\xBA"}, // [<1>011 1010]
4cc7ea
+      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9'
4cc7ea
+      {false, false, "\xB9"}, // [<1>011 1001]
4cc7ea
+
4cc7ea
+      // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
4cc7ea
+      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] =
4cc7ea
+                                  // Arabic small waw U+06E5
4cc7ea
+      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
4cc7ea
+      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
4cc7ea
+      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
4cc7ea
+      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] =
4cc7ea
+                                  // combining char U+0301
4cc7ea
+      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
4cc7ea
+      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
4cc7ea
+      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
4cc7ea
+
4cc7ea
+      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
4cc7ea
+      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] =
4cc7ea
+                                      // Devanagari Letter A U+0905
4cc7ea
+      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
4cc7ea
+      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
4cc7ea
+      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
4cc7ea
+      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
4cc7ea
+      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
4cc7ea
+      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] =
4cc7ea
+                                      // combining char U+0901
4cc7ea
+      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
4cc7ea
+      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
4cc7ea
+      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
4cc7ea
+      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
4cc7ea
+      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
4cc7ea
+  };
4cc7ea
+  const bool atNameStart[] = {true, false};
4cc7ea
+
4cc7ea
+  size_t i = 0;
4cc7ea
+  char doc[1024];
4cc7ea
+  size_t failCount = 0;
4cc7ea
+
4cc7ea
+  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
4cc7ea
+    size_t j = 0;
4cc7ea
+    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
4cc7ea
+      const bool expectedSuccess
4cc7ea
+          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
4cc7ea
+      sprintf(doc, "<%s%s>
4cc7ea
+      XML_Parser parser = XML_ParserCreate(NULL);
4cc7ea
+
4cc7ea
+      const enum XML_Status status
4cc7ea
+          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
4cc7ea
+
4cc7ea
+      bool success = true;
4cc7ea
+      if ((status == XML_STATUS_OK) != expectedSuccess) {
4cc7ea
+        success = false;
4cc7ea
+      }
4cc7ea
+      if ((status == XML_STATUS_ERROR)
4cc7ea
+          && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
4cc7ea
+        success = false;
4cc7ea
+      }
4cc7ea
+
4cc7ea
+      if (! success) {
4cc7ea
+        fprintf(
4cc7ea
+            stderr,
4cc7ea
+            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
4cc7ea
+            (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
4cc7ea
+            (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
4cc7ea
+        failCount++;
4cc7ea
+      }
4cc7ea
+
4cc7ea
+      XML_ParserFree(parser);
4cc7ea
+    
4cc7ea
+  }
4cc7ea
+
4cc7ea
+  if (failCount > 0) {
4cc7ea
+    fail("UTF-8 regression detected");
4cc7ea
+  }
4cc7ea
+}
4cc7ea
+}
4cc7ea
+END_TEST
4cc7ea
+
4cc7ea
 static Suite *
4cc7ea
 make_suite(void)
4cc7ea
 {
4cc7ea
@@ -1569,6 +1712,7 @@ make_suite(void)
4cc7ea
     tcase_add_test(tc_namespace, test_ns_duplicate_attrs_diff_prefixes);
4cc7ea
     tcase_add_test(tc_namespace, test_ns_unbound_prefix_on_attribute);
4cc7ea
     tcase_add_test(tc_namespace, test_ns_unbound_prefix_on_element);
4cc7ea
+    tcase_add_test(tc_namespace, test_ns_separator_in_uri);
4cc7ea
 
4cc7ea
 #ifdef XML_DTD
4cc7ea
     tcase_add_test(tc_basic,