commit c8abeb6dbc14761866da2d3cf359d795f126b6d8
Author: Tomas Korbar <tkorbar@redhat.com>
Date: Mon Mar 21 12:48:53 2022 +0100
Add missing validation of encoding
diff --git a/lib/xmltok.c b/lib/xmltok.c
index cb98ce1..a080f59 100644
--- a/lib/xmltok.c
+++ b/lib/xmltok.c
@@ -71,13 +71,6 @@
+ ((((byte)[2]) >> 5) & 1)] \
& (1 << (((byte)[2]) & 0x1F)))
-#define UTF8_GET_NAMING(pages, p, n) \
- ((n) == 2 \
- ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
- : ((n) == 3 \
- ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
- : 0))
-
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
with the additional restriction of not allowing the Unicode
diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c
index c93e2ac..a135514 100644
--- a/lib/xmltok_impl.c
+++ b/lib/xmltok_impl.c
@@ -34,7 +34,7 @@
case BT_LEAD ## n: \
if (end - ptr < n) \
return XML_TOK_PARTIAL_CHAR; \
- if (!IS_NAME_CHAR(enc, ptr, n)) { \
+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
*nextTokPtr = ptr; \
return XML_TOK_INVALID; \
} \
@@ -62,7 +62,7 @@
case BT_LEAD ## n: \
if (end - ptr < n) \
return XML_TOK_PARTIAL_CHAR; \
- if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
*nextTokPtr = ptr; \
return XML_TOK_INVALID; \
} \
@@ -1097,6 +1097,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
case BT_LEAD ## n: \
if (end - ptr < n) \
return XML_TOK_PARTIAL_CHAR; \
+ if (IS_INVALID_CHAR(enc, ptr, n)) { \
+ *nextTokPtr = ptr; \
+ return XML_TOK_INVALID; \
+ } \
if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
ptr += n; \
tok = XML_TOK_NAME; \
diff --git a/tests/runtests.c b/tests/runtests.c
index 86f8b18..c01f096 100644
--- a/tests/runtests.c
+++ b/tests/runtests.c
@@ -14,6 +14,7 @@
#include <string.h>
#include <stdint.h>
#include <limits.h>
+#include <stdbool.h>
#include "expat.h"
#include "chardata.h"
@@ -82,7 +83,7 @@ _xml_failure(XML_Parser parser, const char *file, int line)
static void
_expect_failure(char *text, enum XML_Error errorCode, char *errorMessage,
- char *file, int lineno)
+ const char *file, int lineno)
{
if (XML_Parse(parser, text, strlen(text), XML_TRUE) == XML_STATUS_OK)
/* Hackish use of _fail_unless() macro, but let's us report
@@ -1541,6 +1542,13 @@ START_TEST(test_ns_separator_in_uri) {
}
END_TEST
+START_TEST(test_bad_doctype_utf8) {
+ char *text = "<!DOCTYPE \xDB\x25"
+ "doc><doc/>"; // 0xDB [1101 1011] is a 2-byte UTF-8 lead byte; 0x25 [<0>010 0101] lacks the 10xxxxxx continuation prefix, so the DOCTYPE name is malformed
+ expect_failure(text, XML_ERROR_INVALID_TOKEN,
+ "Invalid UTF-8 in DOCTYPE not faulted");
+}
+END_TEST
START_TEST(test_utf8_in_start_tags) {
struct test_case {
@@ -1695,6 +1703,8 @@ make_suite(void)
tcase_add_test(tc_basic, test_ns_in_attribute_default_without_namespaces);
tcase_add_test(tc_basic, test_stop_parser_between_char_data_calls);
tcase_add_test(tc_basic, test_suspend_parser_between_char_data_calls);
+ tcase_add_test(tc_basic, test_utf8_in_start_tags);
+ tcase_add_test(tc_basic, test_bad_doctype_utf8);
suite_add_tcase(s, tc_namespace);
tcase_add_checked_fixture(tc_namespace,