Blob Blame History Raw
commit c8abeb6dbc14761866da2d3cf359d795f126b6d8
Author: Tomas Korbar <tkorbar@redhat.com>
Date:   Mon Mar 21 12:48:53 2022 +0100

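    Add missing validation of encoding

    Multi-byte characters were only checked against the name and
    name-start character tables.  Reject them with XML_TOK_INVALID when
    IS_INVALID_CHAR() reports an invalid sequence, before the
    IS_NAME_CHAR() / IS_NMSTRT_CHAR() checks in lib/xmltok_impl.c
    (including the lead-byte case in prologTok).  Remove the
    UTF8_GET_NAMING macro from lib/xmltok.c.

    Add test_bad_doctype_utf8, which expects XML_ERROR_INVALID_TOKEN for
    an invalid UTF-8 sequence in a DOCTYPE name, and register
    test_utf8_in_start_tags in the basic test case.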

diff --git a/lib/xmltok.c b/lib/xmltok.c
index cb98ce1..a080f59 100644
--- a/lib/xmltok.c
+++ b/lib/xmltok.c
@@ -71,13 +71,6 @@
                       + ((((byte)[2]) >> 5) & 1)] \
          & (1 << (((byte)[2]) & 0x1F)))
 
-#define UTF8_GET_NAMING(pages, p, n) \
-  ((n) == 2 \
-  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
-  : ((n) == 3 \
-     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
-     : 0))
-
 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
    with the additional restriction of not allowing the Unicode
diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c
index c93e2ac..a135514 100644
--- a/lib/xmltok_impl.c
+++ b/lib/xmltok_impl.c
@@ -34,7 +34,7 @@
    case BT_LEAD ## n: \
      if (end - ptr < n) \
        return XML_TOK_PARTIAL_CHAR; \
-     if (!IS_NAME_CHAR(enc, ptr, n)) { \
+     if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_INVALID; \
      } \
@@ -62,7 +62,7 @@
    case BT_LEAD ## n: \
      if (end - ptr < n) \
        return XML_TOK_PARTIAL_CHAR; \
-     if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
+     if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_INVALID; \
      } \
@@ -1097,6 +1097,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
+    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
+      *nextTokPtr = ptr;                                                       \
+      return XML_TOK_INVALID;                                                  \
+    }                                                                          \
     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
       ptr += n; \
       tok = XML_TOK_NAME; \
diff --git a/tests/runtests.c b/tests/runtests.c
index 86f8b18..c01f096 100644
--- a/tests/runtests.c
+++ b/tests/runtests.c
@@ -14,6 +14,7 @@
 #include <string.h>
 #include <stdint.h>
 #include <limits.h>
+#include <stdbool.h>
 
 #include "expat.h"
 #include "chardata.h"
@@ -82,7 +83,7 @@ _xml_failure(XML_Parser parser, const char *file, int line)
 
 static void
 _expect_failure(char *text, enum XML_Error errorCode, char *errorMessage,
-                char *file, int lineno)
+                const char *file, int lineno)
 {
     if (XML_Parse(parser, text, strlen(text), XML_TRUE) == XML_STATUS_OK)
         /* Hackish use of _fail_unless() macro, but let's us report
@@ -1541,6 +1542,13 @@ START_TEST(test_ns_separator_in_uri) {
 }
 END_TEST
 
+START_TEST(test_bad_doctype_utf8) {
+  char *text = "<!DOCTYPE \xDB\x25"
+                     "doc><doc/>"; // 0xDB [1101 1011] is a 2-byte lead; 0x25 [<0>010 0101] has bit 7 clear, so it is not a 10xxxxxx continuation byte
+  expect_failure(text, XML_ERROR_INVALID_TOKEN,
+                 "Invalid UTF-8 in DOCTYPE not faulted");
+}
+END_TEST
 
 START_TEST(test_utf8_in_start_tags) {
   struct test_case {
@@ -1695,6 +1703,8 @@ make_suite(void)
     tcase_add_test(tc_basic, test_ns_in_attribute_default_without_namespaces);
     tcase_add_test(tc_basic, test_stop_parser_between_char_data_calls);
     tcase_add_test(tc_basic, test_suspend_parser_between_char_data_calls);
+    tcase_add_test(tc_basic, test_utf8_in_start_tags);
+    tcase_add_test(tc_basic, test_bad_doctype_utf8);
 
     suite_add_tcase(s, tc_namespace);
     tcase_add_checked_fixture(tc_namespace,