683572
From 783ddef8fc74b00cde72898c2c3228853dc82d91 Mon Sep 17 00:00:00 2001
683572
From: Hugo van der Sanden <hv@crypt.org>
683572
Date: Sat, 11 Apr 2020 14:10:24 +0100
683572
Subject: [PATCH] study_chunk: avoid mutating regexp program within GOSUB
683572
MIME-Version: 1.0
683572
Content-Type: text/plain; charset=UTF-8
683572
Content-Transfer-Encoding: 8bit
683572
683572
gh16947 and gh17743: studying GOSUB may restudy in an inner call
683572
(via a mix of recursion and enframing) something that an outer call
683572
is in the middle of looking at.  Let the outer frame deal with it.
683572
683572
(CVE-2020-12723)
683572
683572
(cherry picked from commit c4033e740bd18d9fbe3456a9db2ec2053cdc5271)
683572
Petr Písař: Ported to 5.30.1 from
683572
66bbb51b93253a3f87d11c2695cfb7bdb782184a.
683572
Signed-off-by: Petr Písař <ppisar@redhat.com>
683572
---
683572
 embed.fnc  |  2 +-
683572
 embed.h    |  2 +-
683572
 proto.h    |  2 +-
683572
 regcomp.c  | 54 +++++++++++++++++++++++++++++++++++-------------------
683572
 t/re/pat.t | 26 +++++++++++++++++++++++++-
683572
 5 files changed, 63 insertions(+), 23 deletions(-)
683572
683572
diff --git a/embed.fnc b/embed.fnc
683572
index 1b9cf54..d0463e4 100644
683572
--- a/embed.fnc
683572
+++ b/embed.fnc
683572
@@ -2482,7 +2482,7 @@ Es	|SSize_t|study_chunk	|NN RExC_state_t *pRExC_state \
683572
 				|NULLOK struct scan_data_t *data \
683572
                                 |I32 stopparen|U32 recursed_depth \
683572
 				|NULLOK regnode_ssc *and_withp \
683572
-				|U32 flags|U32 depth
683572
+				|U32 flags|U32 depth|bool was_mutate_ok
683572
 Es	|void	|rck_elide_nothing|NN regnode *node
683572
 EsR	|SV *	|get_ANYOFM_contents|NN const regnode * n
683572
 EsRn	|U32	|add_data	|NN RExC_state_t* const pRExC_state \
683572
diff --git a/embed.h b/embed.h
683572
index cf44011..72c2a8e 100644
683572
--- a/embed.h
683572
+++ b/embed.h
683572
@@ -1239,7 +1239,7 @@
683572
 #define ssc_is_cp_posixl_init	S_ssc_is_cp_posixl_init
683572
 #define ssc_or(a,b,c)		S_ssc_or(aTHX_ a,b,c)
683572
 #define ssc_union(a,b,c)	S_ssc_union(aTHX_ a,b,c)
683572
-#define study_chunk(a,b,c,d,e,f,g,h,i,j,k)	S_study_chunk(aTHX_ a,b,c,d,e,f,g,h,i,j,k)
683572
+#define study_chunk(a,b,c,d,e,f,g,h,i,j,k,l)	S_study_chunk(aTHX_ a,b,c,d,e,f,g,h,i,j,k,l)
683572
 #  endif
683572
 #  if defined(PERL_IN_REGCOMP_C) || defined (PERL_IN_DUMP_C)
683572
 #define _invlist_dump(a,b,c,d)	Perl__invlist_dump(aTHX_ a,b,c,d)
683572
diff --git a/proto.h b/proto.h
683572
index ee74153..9a3ce27 100644
683572
--- a/proto.h
683572
+++ b/proto.h
683572
@@ -5671,7 +5671,7 @@ PERL_STATIC_INLINE void	S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, c
683572
 #define PERL_ARGS_ASSERT_SSC_UNION	\
683572
 	assert(ssc); assert(invlist)
683572
 #endif
683572
-STATIC SSize_t	S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SSize_t *minlenp, SSize_t *deltap, regnode *last, struct scan_data_t *data, I32 stopparen, U32 recursed_depth, regnode_ssc *and_withp, U32 flags, U32 depth);
683572
+STATIC SSize_t	S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SSize_t *minlenp, SSize_t *deltap, regnode *last, struct scan_data_t *data, I32 stopparen, U32 recursed_depth, regnode_ssc *and_withp, U32 flags, U32 depth, bool was_mutate_ok);
683572
 #define PERL_ARGS_ASSERT_STUDY_CHUNK	\
683572
 	assert(pRExC_state); assert(scanp); assert(minlenp); assert(deltap); assert(last)
683572
 #endif
683572
diff --git a/regcomp.c b/regcomp.c
683572
index b101752..b9ea2a0 100644
683572
--- a/regcomp.c
683572
+++ b/regcomp.c
683572
@@ -106,6 +106,7 @@ typedef struct scan_frame {
683572
     regnode *next_regnode;      /* next node to process when last is reached */
683572
     U32 prev_recursed_depth;
683572
     I32 stopparen;              /* what stopparen do we use */
683572
+    bool in_gosub;              /* this or an outer frame is for GOSUB */
683572
 
683572
     struct scan_frame *this_prev_frame; /* this previous frame */
683572
     struct scan_frame *prev_frame;      /* previous frame */
683572
@@ -4475,7 +4476,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
 			I32 stopparen,
683572
                         U32 recursed_depth,
683572
 			regnode_ssc *and_withp,
683572
-			U32 flags, U32 depth)
683572
+			U32 flags, U32 depth, bool was_mutate_ok)
683572
 			/* scanp: Start here (read-write). */
683572
 			/* deltap: Write maxlen-minlen here. */
683572
 			/* last: Stop before this one. */
683572
@@ -4554,6 +4555,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
                                    node length to get a real minimum (because
683572
                                    the folded version may be shorter) */
683572
 	bool unfolded_multi_char = FALSE;
683572
+        /* avoid mutating ops if we are anywhere within the recursed or
683572
+         * enframed handling for a GOSUB: the outermost level will handle it.
683572
+         */
683572
+        bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
683572
 	/* Peephole optimizer: */
683572
         DEBUG_STUDYDATA("Peep", data, depth, is_inf);
683572
         DEBUG_PEEP("Peep", scan, depth, flags);
683572
@@ -4564,7 +4569,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
          * parsing code, as each (?:..) is handled by a different invocation of
683572
          * reg() -- Yves
683572
          */
683572
-        JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
683572
+        if (mutate_ok)
683572
+            JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
683572
 
683572
         /* Follow the next-chain of the current node and optimize
683572
            away all the NOTHINGs from it.
683572
@@ -4596,7 +4602,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
             /* DEFINEP study_chunk() recursion */
683572
             (void)study_chunk(pRExC_state, &scan, &minlen,
683572
                               &deltanext, next, &data_fake, stopparen,
683572
-                              recursed_depth, NULL, f, depth+1);
683572
+                              recursed_depth, NULL, f, depth+1, mutate_ok);
683572
 
683572
             scan = next;
683572
         } else
683572
@@ -4664,7 +4670,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
                     /* recurse study_chunk() for each BRANCH in an alternation */
683572
 		    minnext = study_chunk(pRExC_state, &scan, minlenp,
683572
                                       &deltanext, next, &data_fake, stopparen,
683572
-                                      recursed_depth, NULL, f, depth+1);
683572
+                                      recursed_depth, NULL, f, depth+1,
683572
+                                      mutate_ok);
683572
 
683572
 		    if (min1 > minnext)
683572
 			min1 = minnext;
683572
@@ -4731,9 +4738,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
 		    }
683572
 		}
683572
 
683572
-                if (PERL_ENABLE_TRIE_OPTIMISATION &&
683572
-                        OP( startbranch ) == BRANCH )
683572
-                {
683572
+                if (PERL_ENABLE_TRIE_OPTIMISATION
683572
+                    && OP(startbranch) == BRANCH
683572
+                    && mutate_ok
683572
+                ) {
683572
 		/* demq.
683572
 
683572
                    Assuming this was/is a branch we are dealing with: 'scan'
683572
@@ -5188,6 +5196,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
                 newframe->stopparen = stopparen;
683572
                 newframe->prev_recursed_depth = recursed_depth;
683572
                 newframe->this_prev_frame= frame;
683572
+                newframe->in_gosub = (
683572
+                    (frame && frame->in_gosub) || OP(scan) == GOSUB
683572
+                );
683572
 
683572
                 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
683572
                 DEBUG_PEEP("fnew", scan, depth, flags);
683572
@@ -5345,7 +5356,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
 
683572
                 /* This temporary node can now be turned into EXACTFU, and
683572
                  * must, as regexec.c doesn't handle it */
683572
-                if (OP(next) == EXACTFU_S_EDGE) {
683572
+                if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
683572
                     OP(next) = EXACTFU;
683572
                 }
683572
 
683572
@@ -5353,8 +5364,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
                     &&   isALPHA_A(* STRING(next))
683572
                     && (         OP(next) == EXACTFAA
683572
                         || (     OP(next) == EXACTFU
683572
-                            && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
683572
-                {
683572
+                            && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
683572
+                    &&   mutate_ok
683572
+                ) {
683572
                     /* These differ in just one bit */
683572
                     U8 mask = ~ ('A' ^ 'a');
683572
 
683572
@@ -5441,7 +5453,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
                                   (mincount == 0
683572
                                    ? (f & ~SCF_DO_SUBSTR)
683572
                                    : f)
683572
-                                  ,depth+1);
683572
+                                  , depth+1, mutate_ok);
683572
 
683572
 		if (flags & SCF_DO_STCLASS)
683572
 		    data->start_class = oclass;
683572
@@ -5507,7 +5519,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
 		if (  OP(oscan) == CURLYX && data
683572
 		      && data->flags & SF_IN_PAR
683572
 		      && !(data->flags & SF_HAS_EVAL)
683572
-		      && !deltanext && minnext == 1 ) {
683572
+		      && !deltanext && minnext == 1
683572
+                      && mutate_ok
683572
+                ) {
683572
 		    /* Try to optimize to CURLYN.  */
683572
 		    regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
683572
 		    regnode * const nxt1 = nxt;
683572
@@ -5557,10 +5571,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
 		      && !(data->flags & SF_HAS_EVAL)
683572
 		      && !deltanext	/* atom is fixed width */
683572
 		      && minnext != 0	/* CURLYM can't handle zero width */
683572
-
683572
                          /* Nor characters whose fold at run-time may be
683572
                           * multi-character */
683572
                       && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
683572
+                      && mutate_ok
683572
 		) {
683572
 		    /* XXXX How to optimize if data == 0? */
683572
 		    /* Optimize to a simpler form.  */
683572
@@ -5613,7 +5627,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
683572
                         /* recurse study_chunk() on optimised CURLYX => CURLYM */
683572
 			study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
683572
                                     NULL, stopparen, recursed_depth, NULL, 0,
683572
-                                    depth+1);
683572
+                                    depth+1, mutate_ok);
683572
 		    }
683572
 		    else
683572
 			oscan->flags = 0;
683572
@@ -6018,7 +6032,8 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
683572
                 /* recurse study_chunk() for lookahead body */
683572
                 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
683572
                                       last, &data_fake, stopparen,
683572
-                                      recursed_depth, NULL, f, depth+1);
683572
+                                      recursed_depth, NULL, f, depth+1,
683572
+                                      mutate_ok);
683572
                 if (scan->flags) {
683572
                     if (   deltanext < 0
683572
                         || deltanext > (I32) U8_MAX
683572
@@ -6123,7 +6138,7 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
683572
                 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
683572
                                         &deltanext, last, &data_fake,
683572
                                         stopparen, recursed_depth, NULL,
683572
-                                        f, depth+1);
683572
+                                        f, depth+1, mutate_ok);
683572
                 if (scan->flags) {
683572
                     assert(0);  /* This code has never been tested since this
683572
                                    is normally not compiled */
683572
@@ -6291,7 +6306,8 @@ Perl_re_printf( aTHX_  "LHS=%" UVuf " RHS=%" UVuf "\n",
683572
                         /* optimise study_chunk() for TRIE */
683572
                         minnext = study_chunk(pRExC_state, &scan, minlenp,
683572
                             &deltanext, (regnode *)nextbranch, &data_fake,
683572
-                            stopparen, recursed_depth, NULL, f, depth+1);
683572
+                            stopparen, recursed_depth, NULL, f, depth+1,
683572
+                            mutate_ok);
683572
                     }
683572
                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
683572
                         nextbranch= regnext((regnode*)nextbranch);
683572
@@ -8084,7 +8100,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
683572
             &data, -1, 0, NULL,
683572
             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
683572
                           | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
683572
-            0);
683572
+            0, TRUE);
683572
 
683572
 
683572
         CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
683572
@@ -8213,7 +8229,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
683572
             SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
683572
                                                       ? SCF_TRIE_DOING_RESTUDY
683572
                                                       : 0),
683572
-            0);
683572
+            0, TRUE);
683572
 
683572
         CHECK_RESTUDY_GOTO_butfirst(NOOP);
683572
 
683572
diff --git a/t/re/pat.t b/t/re/pat.t
683572
index 6a868f4..ba89a58 100644
683572
--- a/t/re/pat.t
683572
+++ b/t/re/pat.t
683572
@@ -25,7 +25,7 @@ BEGIN {
683572
 skip_all('no re module') unless defined &DynaLoader::boot_DynaLoader;
683572
 skip_all_without_unicode_tables();
683572
 
683572
-plan tests => 864;  # Update this when adding/deleting tests.
683572
+plan tests => 868;  # Update this when adding/deleting tests.
683572
 
683572
 run_tests() unless caller;
683572
 
683572
@@ -2115,6 +2115,30 @@ x{0c!}\;\;îçÿ /0f/!F?/;îçÿù\Q?x?ÿÿÿÿù`x{0c!}?;ù\Q
683572
         like(runperl(prog => "$s", stderr => 1), qr/Unmatched \(/);
683572
    }
683572
 
683572
+    # gh16947: test regexp corruption (GOSUB)
683572
+    {
683572
+        fresh_perl_is(q{
683572
+            'xy' =~ /x(?0)|x(?|y|y)/ && print 'ok'
683572
+        }, 'ok', {}, 'gh16947: test regexp corruption (GOSUB)');
683572
+    }
683572
+    # gh16947: test fix doesn't break SUSPEND
683572
+    {
683572
+        fresh_perl_is(q{ 'sx' =~ m{ss++}i; print 'ok' },
683572
+                'ok', {}, "gh16947: test fix doesn't break SUSPEND");
683572
+    }
683572
+
683572
+    # gh17743: more regexp corruption via GOSUB
683572
+    {
683572
+        fresh_perl_is(q{
683572
+            "0" =~ /((0(?0)|000(?|0000|0000)(?0))|)/; print "ok"
683572
+        }, 'ok', {}, 'gh17743: test regexp corruption (1)');
683572
+
683572
+        fresh_perl_is(q{
683572
+            "000000000000" =~ /(0(())(0((?0)())|000(?|\x{ef}\x{bf}\x{bd}|\x{ef}\x{bf}\x{bd}))|)/;
683572
+            print "ok"
683572
+        }, 'ok', {}, 'gh17743: test regexp corruption (2)');
683572
+    }
683572
+
683572
 } # End of sub run_tests
683572
 
683572
 1;
683572
-- 
683572
2.25.4
683572