Blob Blame History Raw
From d6f7a97d2c1dfcedd728d8b2ac1a07e953922950 Mon Sep 17 00:00:00 2001
From: Hugo van der Sanden <hv@crypt.org>
Date: Fri, 18 Dec 2020 14:55:16 +0100
Subject: [PATCH] study_chunk: avoid mutating regexp program within GOSUB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gh16947 and gh17743: studying GOSUB may restudy in an inner call
(via a mix of recursion and enframing) something that an outer call
is in the middle of looking at.  Let the outer frame deal with it.

(CVE-2020-12723)

Jitka Plesnikova, Petr Pisar: Ported from
3f4ba871d2d397dcd4386ed75e05353c36135c29 to 5.16.3.

Signed-off-by: Petr Písař <ppisar@redhat.com>
---
 embed.fnc  |  2 +-
 embed.h    |  2 +-
 proto.h    |  2 +-
 regcomp.c  | 42 ++++++++++++++++++++++++++++++------------
 t/re/pat.t | 26 +++++++++++++++++++++++++-
 5 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 9546555..019afa9 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1951,7 +1951,7 @@ Es	|I32	|study_chunk	|NN struct RExC_state_t *pRExC_state \
 				|NULLOK struct scan_data_t *data \
 				|I32 stopparen|NULLOK U8* recursed \
 				|NULLOK struct regnode_charclass_class *and_withp \
-				|U32 flags|U32 depth
+				|U32 flags|U32 depth|bool was_mutate_ok
 EsRn	|U32	|add_data	|NN struct RExC_state_t *pRExC_state|U32 n \
 				|NN const char *s
 rs	|void	|re_croak2	|NN const char* pat1|NN const char* pat2|...
diff --git a/embed.h b/embed.h
index f7db1e0..75f326e 100644
--- a/embed.h
+++ b/embed.h
@@ -952,7 +952,7 @@
 #define scan_commit(a,b,c,d)	S_scan_commit(aTHX_ a,b,c,d)
 #define set_regclass_bit(a,b,c,d,e)	S_set_regclass_bit(aTHX_ a,b,c,d,e)
 #define set_regclass_bit_fold(a,b,c,d,e)	S_set_regclass_bit_fold(aTHX_ a,b,c,d,e)
-#define study_chunk(a,b,c,d,e,f,g,h,i,j,k)	S_study_chunk(aTHX_ a,b,c,d,e,f,g,h,i,j,k)
+#define study_chunk(a,b,c,d,e,f,g,h,i,j,k,l)	S_study_chunk(aTHX_ a,b,c,d,e,f,g,h,i,j,k,l)
 #  endif
 #  if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_UTF8_C)
 #define _core_swash_init(a,b,c,d,e,f,g,h)	Perl__core_swash_init(aTHX_ a,b,c,d,e,f,g,h)
diff --git a/proto.h b/proto.h
index 143eee0..1b2ece6 100644
--- a/proto.h
+++ b/proto.h
@@ -6597,7 +6597,7 @@ STATIC U8	S_set_regclass_bit_fold(pTHX_ struct RExC_state_t *pRExC_state, regnod
 #define PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD	\
 	assert(pRExC_state); assert(node); assert(invlist_ptr); assert(alternate_ptr)
 
-STATIC I32	S_study_chunk(pTHX_ struct RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, I32 *deltap, regnode *last, struct scan_data_t *data, I32 stopparen, U8* recursed, struct regnode_charclass_class *and_withp, U32 flags, U32 depth)
+STATIC I32	S_study_chunk(pTHX_ struct RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, I32 *deltap, regnode *last, struct scan_data_t *data, I32 stopparen, U8* recursed, struct regnode_charclass_class *and_withp, U32 flags, U32 depth, bool was_mutate_ok)
 			__attribute__nonnull__(pTHX_1)
 			__attribute__nonnull__(pTHX_2)
 			__attribute__nonnull__(pTHX_3)
diff --git a/regcomp.c b/regcomp.c
index 2254159..57505f3 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2934,6 +2934,7 @@ typedef struct scan_frame {
     regnode *next;  /* next node to process when last is reached */
     struct scan_frame *prev; /*previous frame*/
     I32 stop; /* what stopparen do we use */
+    bool in_gosub;              /* this or an outer frame is for GOSUB */
 } scan_frame;
 
 
@@ -2975,7 +2976,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 			I32 stopparen,
 			U8* recursed,
 			struct regnode_charclass_class *and_withp,
-			U32 flags, U32 depth)
+			U32 flags, U32 depth, bool was_mutate_ok)
 			/* scanp: Start here (read-write). */
 			/* deltap: Write maxlen-minlen here. */
 			/* last: Stop before this one. */
@@ -3016,6 +3017,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                    length to get a real minimum (because the
                                    folded version may be shorter) */
 	bool has_exactf_sharp_s = FALSE;
+        /* avoid mutating ops if we are anywhere within the recursed or
+         * enframed handling for a GOSUB: the outermost level will handle it.
+         */
+        bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
+
 	/* Peephole optimizer: */
 	DEBUG_STUDYDATA("Peep:", data,depth);
 	DEBUG_PEEP("Peep",scan,depth);
@@ -3023,7 +3029,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
         /* Its not clear to khw or hv why this is done here, and not in the
          * clauses that deal with EXACT nodes.  khw's guess is that it's
          * because of a previous design */
-        JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
+        if (mutate_ok)
+            JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
+
 
 	/* Follow the next-chain of the current node and optimize
 	   away all the NOTHINGs from it.  */
@@ -3100,7 +3108,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		    /* we suppose the run is continuous, last=next...*/
 		    minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
 					  next, &data_fake,
-					  stopparen, recursed, NULL, f,depth+1);
+					  stopparen, recursed, NULL, f,depth+1,
+					  mutate_ok);
 		    if (min1 > minnext)
 			min1 = minnext;
 		    if (max1 < minnext + deltanext)
@@ -3161,7 +3170,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		    }
 		}
 
-                if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
+                if (PERL_ENABLE_TRIE_OPTIMISATION
+		    && OP( startbranch ) == BRANCH 
+		    && mutate_ok) {
+
 		/* demq.
 
 		   Assuming this was/is a branch we are dealing with: 'scan' now
@@ -3481,6 +3493,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 	        newframe->last = last;
 	        newframe->stop = stopparen;
 	        newframe->prev = frame;
+                newframe->in_gosub = (
+		    (frame && frame->in_gosub) || (OP(scan) == GOSUB || OP(scan) == GOSTART)
+		);
 
 	        frame = newframe;
 	        scan =  start;
@@ -3792,7 +3807,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, 
 		                      last, data, stopparen, recursed, NULL,
 				      (mincount == 0
-					? (f & ~SCF_DO_SUBSTR) : f),depth+1);
+					? (f & ~SCF_DO_SUBSTR) : f),depth+1,mutate_ok);
 
 		if (flags & SCF_DO_STCLASS)
 		    data->start_class = oclass;
@@ -3844,7 +3859,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		if (  OP(oscan) == CURLYX && data
 		      && data->flags & SF_IN_PAR
 		      && !(data->flags & SF_HAS_EVAL)
-		      && !deltanext && minnext == 1 ) {
+		      && !deltanext && minnext == 1
+			       && mutate_ok
+			) {
 		    /* Try to optimize to CURLYN.  */
 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
 		    regnode * const nxt1 = nxt;
@@ -3890,6 +3907,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 		      && !(data->flags & SF_HAS_EVAL)
 		      && !deltanext	/* atom is fixed width */
 		      && minnext != 0	/* CURLYM can't handle zero width */
+		      && mutate_ok
 		) {
 		    /* XXXX How to optimize if data == 0? */
 		    /* Optimize to a simpler form.  */
@@ -3936,7 +3954,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 #endif
 			/* Optimize again: */
 			study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
-				    NULL, stopparen, recursed, NULL, 0,depth+1);
+				    NULL, stopparen, recursed, NULL, 0,depth+1,mutate_ok);
 		    }
 		    else
 			oscan->flags = 0;
@@ -4376,7 +4394,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 next = regnext(scan);
                 nscan = NEXTOPER(NEXTOPER(scan));
                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, 
-                    last, &data_fake, stopparen, recursed, NULL, f, depth+1);
+                    last, &data_fake, stopparen, recursed, NULL, f, depth+1,mutate_ok);
                 if (scan->flags) {
                     if (deltanext) {
 			FAIL("Variable length lookbehind not implemented");
@@ -4462,7 +4480,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 nscan = NEXTOPER(NEXTOPER(scan));
 
                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, 
-                    last, &data_fake, stopparen, recursed, NULL, f,depth+1);
+                    last, &data_fake, stopparen, recursed, NULL, f,depth+1,mutate_ok);
                 if (scan->flags) {
                     if (deltanext) {
 			FAIL("Variable length lookbehind not implemented");
@@ -4625,7 +4643,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                          */
                         minnext = study_chunk(pRExC_state, &scan, minlenp, 
                             &deltanext, (regnode *)nextbranch, &data_fake, 
-                            stopparen, recursed, NULL, f,depth+1);
+                            stopparen, recursed, NULL, f,depth+1,mutate_ok);
                     }
                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
                         nextbranch= regnext((regnode*)nextbranch);
@@ -5442,7 +5460,7 @@ reStudy:
         
 	minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
             &data, -1, NULL, NULL,
-            SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
+            SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0,TRUE);
 
 
         CHECK_RESTUDY_GOTO;
@@ -5627,7 +5645,7 @@ reStudy:
 
         
 	minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
-	    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
+	    &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0,TRUE);
         
         CHECK_RESTUDY_GOTO;
 
diff --git a/t/re/pat.t b/t/re/pat.t
index faddbc5..e328fbd 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -19,7 +19,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 472;  # Update this when adding/deleting tests.
+plan tests => 476;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1260,6 +1260,30 @@ EOP
         use re '/aa';
         unlike 'k', qr/(?i:\N{KELVIN SIGN})/, "(?i: shouldn't lose the passed in /aa";
     }
+    # gh16947: test regexp corruption (GOSUB)
+    {
+        fresh_perl_is(q{
+            'xy' =~ /x(?0)|x(?|y|y)/ && print 'ok'
+        }, 'ok', {}, 'gh16947: test regexp corruption (GOSUB)');
+    }
+    # gh16947: test fix doesn't break SUSPEND
+    {
+        fresh_perl_is(q{ 'sx' =~ m{ss++}i; print 'ok' },
+                'ok', {}, "gh16947: test fix doesn't break SUSPEND");
+    }
+
+    # gh17743: more regexp corruption via GOSUB
+    {
+        fresh_perl_is(q{
+            "0" =~ /((0(?0)|000(?|0000|0000)(?0))|)/; print "ok"
+        }, 'ok', {}, 'gh17743: test regexp corruption (1)');
+
+        fresh_perl_is(q{
+            "000000000000" =~ /(0(())(0((?0)())|000(?|\x{ef}\x{bf}\x{bd}|\x{ef}\x{bf}\x{bd}))|)/;
+            print "ok"
+        }, 'ok', {}, 'gh17743: test regexp corruption (2)');
+    }
+
 } # End of sub run_tests
 
 1;
-- 
2.26.2