commit c8abeb6dbc14761866da2d3cf359d795f126b6d8 Author: Tomas Korbar Date: Mon Mar 21 12:48:53 2022 +0100 Add missing validation of encoding diff --git a/lib/xmltok.c b/lib/xmltok.c index cb98ce1..a080f59 100644 --- a/lib/xmltok.c +++ b/lib/xmltok.c @@ -71,13 +71,6 @@ + ((((byte)[2]) >> 5) & 1)] \ & (1 << (((byte)[2]) & 0x1F))) -#define UTF8_GET_NAMING(pages, p, n) \ - ((n) == 2 \ - ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ - : ((n) == 3 \ - ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ - : 0)) - /* Detection of invalid UTF-8 sequences is based on Table 3.1B of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ with the additional restriction of not allowing the Unicode diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c index c93e2ac..a135514 100644 --- a/lib/xmltok_impl.c +++ b/lib/xmltok_impl.c @@ -34,7 +34,7 @@ case BT_LEAD ## n: \ if (end - ptr < n) \ return XML_TOK_PARTIAL_CHAR; \ - if (!IS_NAME_CHAR(enc, ptr, n)) { \ + if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \ *nextTokPtr = ptr; \ return XML_TOK_INVALID; \ } \ @@ -62,7 +62,7 @@ case BT_LEAD ## n: \ if (end - ptr < n) \ return XML_TOK_PARTIAL_CHAR; \ - if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \ + if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \ *nextTokPtr = ptr; \ return XML_TOK_INVALID; \ } \ @@ -1097,6 +1097,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, case BT_LEAD ## n: \ if (end - ptr < n) \ return XML_TOK_PARTIAL_CHAR; \ + if (IS_INVALID_CHAR(enc, ptr, n)) { \ + *nextTokPtr = ptr; \ + return XML_TOK_INVALID; \ + } \ if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ ptr += n; \ tok = XML_TOK_NAME; \ diff --git a/tests/runtests.c b/tests/runtests.c index 86f8b18..c01f096 100644 --- a/tests/runtests.c +++ b/tests/runtests.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "expat.h" #include "chardata.h" @@ -82,7 +83,7 @@ _xml_failure(XML_Parser parser, const char *file, int line) static void _expect_failure(char *text, enum XML_Error errorCode, char *errorMessage, - char *file, int lineno) + const char *file, int lineno) { if (XML_Parse(parser, text, strlen(text), XML_TRUE) == XML_STATUS_OK) /* Hackish use of _fail_unless() macro, but let's us report @@ -1541,6 +1542,13 @@ START_TEST(test_ns_separator_in_uri) { } END_TEST +START_TEST(test_bad_doctype_utf8) { + char *text = ""; // [1101 1011] [<0>010 0101] + expect_failure(text, XML_ERROR_INVALID_TOKEN, + "Invalid UTF-8 in DOCTYPE not faulted"); +} +END_TEST START_TEST(test_utf8_in_start_tags) { struct test_case { @@ -1695,6 +1703,8 @@ make_suite(void) tcase_add_test(tc_basic, test_ns_in_attribute_default_without_namespaces); tcase_add_test(tc_basic, test_stop_parser_between_char_data_calls); tcase_add_test(tc_basic, test_suspend_parser_between_char_data_calls); + tcase_add_test(tc_basic, test_utf8_in_start_tags); + tcase_add_test(tc_basic, test_bad_doctype_utf8); suite_add_tcase(s, tc_namespace); tcase_add_checked_fixture(tc_namespace,