From 29231d73407542051a287cab5e18546e5a622f4a Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Tue, 6 Feb 2018 14:50:48 -0700
Subject: [perl #132063]: Heap buffer overflow
The proximate cause is several places in regexec.c where the code
assumed that the input was valid UTF-8, when in fact the input was
shorter than the length its start byte claimed.
I grepped through the core for any other similar uses, and did not find
any.
---
regexec.c | 29 ++++++++++++++++-------------
t/lib/warnings/regexec | 7 +++++++
2 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/regexec.c b/regexec.c
index 5735b997fd..ea432c39d3 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1466,7 +1466,9 @@ Perl_re_intuit_start(pTHX_
? trie_utf8_fold \
: trie_latin_utf8_fold)))
-#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
+/* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
+ * 'foldbuf+sizeof(foldbuf)' */
+#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
STMT_START { \
STRLEN skiplen; \
U8 flags = FOLD_FLAGS_FULL; \
@@ -1474,7 +1476,7 @@ STMT_START {
case trie_flu8: \
_CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
- _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end - uc); \
} \
goto do_trie_utf8_fold; \
case trie_utf8_exactfa_fold: \
@@ -1483,7 +1485,7 @@ STMT_START {
case trie_utf8_fold: \
do_trie_utf8_fold: \
if ( foldlen>0 ) { \
- uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+ uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
foldlen -= len; \
uscan += len; \
len=0; \
@@ -1500,7 +1502,7 @@ STMT_START {
/* FALLTHROUGH */ \
case trie_latin_utf8_fold: \
if ( foldlen>0 ) { \
- uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
+ uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
foldlen -= len; \
uscan += len; \
len=0; \
@@ -1519,7 +1521,7 @@ STMT_START {
} \
/* FALLTHROUGH */ \
case trie_utf8: \
- uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
+ uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
break; \
case trie_plain: \
uvc = (UV)*uc; \
@@ -2599,10 +2601,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
points[pointpos++ % maxlen]= uc;
if (foldlen || uc < (U8*)strend) {
- REXEC_TRIE_READ_CHAR(trie_type, trie,
- widecharmap, uc,
- uscan, len, uvc, charid, foldlen,
- foldbuf, uniflags);
+ REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
+ (U8 *) strend, uscan, len, uvc,
+ charid, foldlen, foldbuf,
+ uniflags);
DEBUG_TRIE_EXECUTE_r({
dump_exec_pos( (char *)uc, c, strend,
real_start, s, utf8_target, 0);
@@ -5511,8 +5513,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
I32 offset;
REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
- uscan, len, uvc, charid, foldlen,
- foldbuf, uniflags);
+ (U8 *) reginfo->strend, uscan,
+ len, uvc, charid, foldlen,
+ foldbuf, uniflags);
charcount++;
if (foldlen>0)
ST.longfold = TRUE;
@@ -5642,8 +5645,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
while (foldlen) {
if (!--chars)
break;
- uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
- uniflags);
+ uvc = utf8n_to_uvchr(uscan, foldlen, &len,
+ uniflags);
uscan += len;
foldlen -= len;
}
diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec
index 900dd6ee7f..6635142dea 100644
--- a/t/lib/warnings/regexec
+++ b/t/lib/warnings/regexec
@@ -260,3 +260,10 @@ setlocale(&POSIX::LC_CTYPE, $utf8_locale);
"k" =~ /(?[ \N{KELVIN SIGN} ])/i;
":" =~ /(?[ \: ])/;
EXPECT
+########
+# NAME perl #132063, read beyond buffer end
+# OPTION fatal
+"\xff" =~ /(?il)\x{100}|\x{100}/;
+EXPECT
+Malformed UTF-8 character: \xff (too short; 1 byte available, need 13) in pattern match (m//) at - line 2.
+Malformed UTF-8 character (fatal) at - line 2.
--
2.11.0