From e3406ec06426fb9a7342541127d4c591d2446b6b Mon Sep 17 00:00:00 2001 From: ph10 Date: Fri, 5 Jul 2013 10:38:37 +0000 Subject: [PATCH 1/2] Fix checking whether a group could match an empty string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In UTF mode, the code for checking whether a group could match an empty string (which is used for indefinitely repeated groups to allow for breaking an infinite loop) was broken when the group contained a repeated negated single-character class with a character that occupied more than one data item and had a minimum repetition of zero (for example, [^\x{100}]* in UTF-8 mode). The effect was undefined: the group might or might not be deemed as matching an empty string, or the program might have crashed. Based on: commit 74d96caf6251eff2f6c6a3e879268ce2d2a6c9be Author: ph10 Date: Fri Jul 5 10:38:37 2013 +0000 Implement PCRE_INFO_MATCH_EMPTY and fix 2 bugs concerned with scanning for empty string matching. git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1348 2f5784b3-3f2a-0410-8824-cb99058d5e15 Ported to 8.32. Needed for CVE-2015-2328 (bug #1285399). Signed-off-by: Petr Písař --- pcre_compile.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 81 insertions(+), 18 deletions(-) diff --git a/pcre_compile.c b/pcre_compile.c index 0de3747..ce72527 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -2353,15 +2353,23 @@ Arguments: endcode points to where to stop utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode cd contains pointers to tables etc. 
+ recurses chain of recurse_check to catch mutual recursion Returns: TRUE if what is matched could be empty */ +typedef struct recurse_check { + struct recurse_check *prev; + const pcre_uchar *group; +} recurse_check; + static BOOL could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode, - BOOL utf, compile_data *cd) + BOOL utf, compile_data *cd, recurse_check *recurses) { register pcre_uchar c; +recurse_check this_recurse; + for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); code < endcode; code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE)) @@ -2369,7 +2377,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); const pcre_uchar *ccode; c = *code; - + /* Skip over forward assertions; the other assertions are skipped by first_significant_code() with a TRUE final argument. */ @@ -2389,25 +2397,50 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); if (c == OP_RECURSE) { - const pcre_uchar *scode; + const pcre_uchar *scode = cd->start_code + GET(code, 1); BOOL empty_branch; - /* Test for forward reference */ + /* Test for forward reference or uncompleted reference. This is disabled + when called to scan a completed pattern by setting cd->start_workspace to + NULL. */ - for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE) - if ((int)GET(scode, 0) == (int)(code + 1 - cd->start_code)) return TRUE; + if (cd->start_workspace != NULL) + { + const pcre_uchar *tcode; + for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE) + if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE; + if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ + } + + /* If we are scanning a completed pattern, there are no forward references + and all groups are complete. We need to detect whether this is a recursive + call, as otherwise there will be an infinite loop. If it is a recursion, + just skip over it. Simple recursions are easily detected. 
For mutual + recursions we keep a chain on the stack. */ + + else + { + recurse_check *r = recurses; + const pcre_uchar *endgroup = scode; + + do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); + if (code >= scode && code <= endgroup) continue; /* Simple recursion */ + + for (r = recurses; r != NULL; r = r->prev) + if (r->group == scode) break; + if (r != NULL) continue; /* Mutual recursion */ + } - /* Not a forward reference, test for completed backward reference */ + /* Completed reference; scan the referenced group, remembering it on the + stack chain to detect mutual recursions. */ empty_branch = FALSE; - scode = cd->start_code + GET(code, 1); - if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ - - /* Completed backwards reference */ - + this_recurse.prev = recurses; + this_recurse.group = scode; + do { - if (could_be_empty_branch(scode, endcode, utf, cd)) + if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse)) { empty_branch = TRUE; break; @@ -2463,7 +2496,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); empty_branch = FALSE; do { - if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd)) + if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, NULL)) empty_branch = TRUE; code += GET(code, 1); } @@ -2582,30 +2615,58 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); return TRUE; /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, - MINUPTO, and POSUPTO may be followed by a multibyte character */ + MINUPTO, and POSUPTO and their caseless and negative versions may be + followed by a multibyte character. 
*/ #if defined SUPPORT_UTF && !defined COMPILE_PCRE32 case OP_STAR: case OP_STARI: + case OP_NOTSTAR: + case OP_NOTSTARI: + case OP_MINSTAR: case OP_MINSTARI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: + case OP_POSSTAR: case OP_POSSTARI: + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: + case OP_QUERY: case OP_QUERYI: + case OP_NOTQUERY: + case OP_NOTQUERYI: + case OP_MINQUERY: case OP_MINQUERYI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: + case OP_POSQUERY: case OP_POSQUERYI: + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: + if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]); break; case OP_UPTO: case OP_UPTOI: + case OP_NOTUPTO: + case OP_NOTUPTOI: + case OP_MINUPTO: case OP_MINUPTOI: + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: + case OP_POSUPTO: case OP_POSUPTOI: + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: + if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]); break; #endif @@ -2662,7 +2723,7 @@ could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode, { while (bcptr != NULL && bcptr->current_branch >= code) { - if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd)) + if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL)) return FALSE; bcptr = bcptr->outer; } @@ -5416,7 +5477,7 @@ for (;; ptr++) pcre_uchar *scode = bracode; do { - if (could_be_empty_branch(scode, ketcode, utf, cd)) + if (could_be_empty_branch(scode, ketcode, utf, cd, NULL)) { *bracode += OP_SBRA - OP_BRA; break; @@ -8172,10 +8233,12 @@ if (cd->hwm > cd->start_workspace) } } -/* If the workspace had to be expanded, free the new memory. */ +/* If the workspace had to be expanded, free the new memory. Set the pointer to +NULL to indicate that forward references have been filled in. */ if (cd->workspace_size > COMPILE_WORK_SIZE) (PUBL(free))((void *)cd->start_workspace); +cd->start_workspace = NULL; /* Give an error if there's back reference to a non-existent capturing subpattern. */ -- 2.5.5