Blame SOURCES/perl-5.24.4-Fix-131649-extended-charclass-can-trigger-assert.patch

b69025
From 10ce49389ea9ee26a3b02b6494b0a3849d56c6fa Mon Sep 17 00:00:00 2001
b69025
From: Yves Orton <demerphq@gmail.com>
b69025
Date: Mon, 26 Jun 2017 13:19:55 +0200
b69025
Subject: [PATCH] fix #131649 - extended charclass can trigger assert
b69025
b69025
The extended charclass parser makes some assumptions during the
b69025
first pass which are only true on well structured input, and it
b69025
does not properly catch various errors. later on the code assumes
b69025
that things the first pass will let through are valid, when in
b69025
fact they should trigger errors.
b69025
b69025
(cherry picked from commit 19a498a461d7c81ae3507c450953d1148efecf4f)
b69025
---
b69025
 pod/perldiag.pod        | 27 ++++++++++++++++++++++++++-
b69025
 pod/perlrecharclass.pod |  4 ++--
b69025
 regcomp.c               | 28 ++++++++++++++++++----------
b69025
 t/lib/warnings/regcomp  |  6 +++---
b69025
 t/re/reg_mesg.t         | 29 ++++++++++++++++-------------
b69025
 t/re/regex_sets.t       |  6 +++---
b69025
 6 files changed, 68 insertions(+), 32 deletions(-)
b69025
b69025
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
b69025
index 106fe41121..c29925a2a4 100644
b69025
--- a/pod/perldiag.pod
b69025
+++ b/pod/perldiag.pod
b69025
@@ -5904,7 +5904,7 @@ yourself.
b69025
 a perl4 interpreter, especially if the next 2 tokens are "use strict"
b69025
 or "my $var" or "our $var".
b69025
 
b69025
-=item Syntax error in (?[...]) in regex m/%s/
b69025
+=item Syntax error in (?[...]) in regex; marked by <-- HERE in m/%s/
b69025
 
b69025
 (F) Perl could not figure out what you meant inside this construct; this
b69025
 notifies you that it is giving up trying.
b69025
@@ -6402,6 +6402,31 @@ to find out why that isn't happening.
b69025
 (F) The unexec() routine failed for some reason.  See your local FSF
b69025
 representative, who probably put it there in the first place.
b69025
 
b69025
+=item Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/%s/
b69025
+
b69025
+(F) While parsing an extended character class a ']' character was encountered
b69025
+at a point in the definition where the only legal use of ']' is to close the
b69025
+character class definition as part of a '])', you may have forgotten the close
b69025
+paren, or otherwise confused the parser.
b69025
+
b69025
+=item Expecting close paren for nested extended charclass in regex; marked by <-- HERE in m/%s/
b69025
+
b69025
+(F) While parsing a nested extended character class like:
b69025
+
b69025
+    (?[ ... (?flags:(?[ ... ])) ... ])
b69025
+                             ^
b69025
+
b69025
+we expected to see a close paren ')' (marked by ^) but did not.
b69025
+
b69025
+=item Expecting close paren for wrapper for nested extended charclass in regex; marked by <-- HERE in m/%s/
b69025
+
b69025
+(F) While parsing a nested extended character class like:
b69025
+
b69025
+    (?[ ... (?flags:(?[ ... ])) ... ])
b69025
+                              ^
b69025
+
b69025
+we expected to see a close paren ')' (marked by ^) but did not.
b69025
+
b69025
 =item Unexpected binary operator '%c' with no preceding operand in regex;
b69025
 marked by S<<-- HERE> in m/%s/
b69025
 
b69025
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
b69025
index 79480e4131..8c008507d1 100644
b69025
--- a/pod/perlrecharclass.pod
b69025
+++ b/pod/perlrecharclass.pod
b69025
@@ -1128,8 +1128,8 @@ hence both of the following work:
b69025
 Any contained POSIX character classes, including things like C<\w> and C<\D>
b69025
 respect the C<E<sol>a> (and C<E<sol>aa>) modifiers.
b69025
 
b69025
-C<< (?[ ]) >> is a regex-compile-time construct.  Any attempt to use
b69025
-something which isn't knowable at the time the containing regular
b69025
+Note that C<< (?[ ]) >> is a regex-compile-time construct.  Any attempt
b69025
+to use something which isn't knowable at the time the containing regular
b69025
 expression is compiled is a fatal error.  In practice, this means
b69025
 just three limitations:
b69025
 
b69025
diff --git a/regcomp.c b/regcomp.c
b69025
index 4ee48ede42..ddac290d2b 100644
b69025
--- a/regcomp.c
b69025
+++ b/regcomp.c
b69025
@@ -14840,8 +14840,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
b69025
                                     TRUE /* Force /x */ );
b69025
 
b69025
             switch (*RExC_parse) {
b69025
-                case '?':
b69025
-                    if (RExC_parse[1] == '[') depth++, RExC_parse++;
b69025
+                case '(':
b69025
+                    if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
b69025
+                        depth++, RExC_parse+=2;
b69025
                     /* FALLTHROUGH */
b69025
                 default:
b69025
                     break;
b69025
@@ -14898,9 +14899,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
b69025
                 }
b69025
 
b69025
                 case ']':
b69025
-                    if (depth--) break;
b69025
-                    RExC_parse++;
b69025
-                    if (*RExC_parse == ')') {
b69025
+                    if (RExC_parse[1] == ')') {
b69025
+                        RExC_parse++;
b69025
+                        if (depth--) break;
b69025
                         node = reganode(pRExC_state, ANYOF, 0);
b69025
                         RExC_size += ANYOF_SKIP;
b69025
                         nextchar(pRExC_state);
b69025
@@ -14912,20 +14913,25 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
b69025
 
b69025
                         return node;
b69025
                     }
b69025
-                    goto no_close;
b69025
+                    /* We output the messages even if warnings are off, because we'll fail
b69025
+                     * the very next thing, and these give a likely diagnosis for that */
b69025
+                    if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
b69025
+                        output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
b69025
+                    }
b69025
+                    RExC_parse++;
b69025
+                    vFAIL("Unexpected ']' with no following ')' in (?[...");
b69025
             }
b69025
 
b69025
             RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
b69025
         }
b69025
 
b69025
-      no_close:
b69025
         /* We output the messages even if warnings are off, because we'll fail
b69025
          * the very next thing, and these give a likely diagnosis for that */
b69025
         if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) {
b69025
             output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
b69025
         }
b69025
 
b69025
-        FAIL("Syntax error in (?[...])");
b69025
+        vFAIL("Syntax error in (?[...])");
b69025
     }
b69025
 
b69025
     /* Pass 2 only after this. */
b69025
@@ -15105,12 +15111,14 @@ redo_curchar:
b69025
                      * inversion list, and RExC_parse points to the trailing
b69025
                      * ']'; the next character should be the ')' */
b69025
                     RExC_parse++;
b69025
-                    assert(UCHARAT(RExC_parse) == ')');
b69025
+                    if (UCHARAT(RExC_parse) != ')')
b69025
+                        vFAIL("Expecting close paren for nested extended charclass");
b69025
 
b69025
                     /* Then the ')' matching the original '(' handled by this
b69025
                      * case: statement */
b69025
                     RExC_parse++;
b69025
-                    assert(UCHARAT(RExC_parse) == ')');
b69025
+                    if (UCHARAT(RExC_parse) != ')')
b69025
+                        vFAIL("Expecting close paren for wrapper for nested extended charclass");
b69025
 
b69025
                     RExC_parse++;
b69025
                     RExC_flags = save_flags;
b69025
diff --git a/t/lib/warnings/regcomp b/t/lib/warnings/regcomp
b69025
index 2b084c59b0..51ad57ccbe 100644
b69025
--- a/t/lib/warnings/regcomp
b69025
+++ b/t/lib/warnings/regcomp
b69025
@@ -59,21 +59,21 @@ Unmatched [ in regex; marked by <-- HERE in m/abc[ <-- HERE fi[.00./ at - line
b69025
 qr/(?[[[:word]]])/;
b69025
 EXPECT
b69025
 Assuming NOT a POSIX class since there is no terminating ':' in regex; marked by <-- HERE in m/(?[[[:word <-- HERE ]]])/ at - line 2.
b69025
-syntax error in (?[...]) in regex m/(?[[[:word]]])/ at - line 2.
b69025
+Unexpected ']' with no following ')' in (?[... in regex; marked by <-- HERE in m/(?[[[:word]] <-- HERE ])/ at - line 2.
b69025
 ########
b69025
 # NAME qr/(?[ [[:digit: ])/
b69025
 # OPTION fatal
b69025
 qr/(?[[[:digit: ])/;
b69025
 EXPECT
b69025
 Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[[:digit: ] <-- HERE )/ at - line 2.
b69025
-syntax error in (?[...]) in regex m/(?[[[:digit: ])/ at - line 2.
b69025
+syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[[:digit: ]) <-- HERE / at - line 2.
b69025
 ########
b69025
 # NAME qr/(?[ [:digit: ])/
b69025
 # OPTION fatal
b69025
 qr/(?[[:digit: ])/
b69025
 EXPECT
b69025
 Assuming NOT a POSIX class since no blanks are allowed in one in regex; marked by <-- HERE in m/(?[[:digit: ] <-- HERE )/ at - line 2.
b69025
-syntax error in (?[...]) in regex m/(?[[:digit: ])/ at - line 2.
b69025
+syntax error in (?[...]) in regex; marked by <-- HERE in m/(?[[:digit: ]) <-- HERE / at - line 2.
b69025
 ########
b69025
 # NAME [perl #126141]
b69025
 # OPTION fatal
b69025
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
b69025
index d26a7caf37..5194d93751 100644
b69025
--- a/t/re/reg_mesg.t
b69025
+++ b/t/re/reg_mesg.t
b69025
@@ -215,8 +215,9 @@ my @death =
b69025
  '/\b{gc}/' => "'gc' is an unknown bound type {#} m/\\b{gc{#}}/",
b69025
  '/\B{gc}/' => "'gc' is an unknown bound type {#} m/\\B{gc{#}}/",
b69025
 
b69025
- '/(?[[[::]]])/' => "Syntax error in (?[...]) in regex m/(?[[[::]]])/",
b69025
- '/(?[[[:w:]]])/' => "Syntax error in (?[...]) in regex m/(?[[[:w:]]])/",
b69025
+
b69025
+ '/(?[[[::]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[::]]{#}])/",
b69025
+ '/(?[[[:w:]]])/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[[[:w:]]{#}])/",
b69025
  '/(?[[:w:]])/' => "",
b69025
  '/[][[:alpha:]]' => "",    # [perl #127581]
b69025
  '/([.].*)[.]/'   => "",    # [perl #127582]
b69025
@@ -239,11 +240,12 @@ my @death =
b69025
  '/(?[ \p{foo} ])/' => 'Can\'t find Unicode property definition "foo" {#} m/(?[ \p{foo}{#} ])/',
b69025
  '/(?[ \p{ foo = bar } ])/' => 'Can\'t find Unicode property definition "foo = bar" {#} m/(?[ \p{ foo = bar }{#} ])/',
b69025
  '/(?[ \8 ])/' => 'Unrecognized escape \8 in character class {#} m/(?[ \8{#} ])/',
b69025
- '/(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ]/',
b69025
- '/(?[ [ \t ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ \t ]/',
b69025
- '/(?[ \t ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ \t ] ]/',
b69025
- '/(?[ [ ] ]/' => 'Syntax error in (?[...]) in regex m/(?[ [ ] ]/',
b69025
- '/(?[ \t + \e # This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # This was supposed to be a comment ])/',
b69025
+ '/(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#}/",
b69025
+ '/(?[ [ \t ]/' => "Syntax error in (?[...]) {#} m/(?[ [ \\t ]{#}/",
b69025
+ '/(?[ \t ] ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/(?[ \\t ]{#} ]/",
b69025
+ '/(?[ [ ] ]/' => "Syntax error in (?[...]) {#} m/(?[ [ ] ]{#}/",
b69025
+ '/(?[ \t + \e # This was supposed to be a comment ])/' =>
b69025
+    "Syntax error in (?[...]) {#} m/(?[ \\t + \\e # This was supposed to be a comment ]){#}/",
b69025
  '/(?[ ])/' => 'Incomplete expression within \'(?[ ])\' {#} m/(?[ {#}])/',
b69025
  'm/(?[[a-\d]])/' => 'False [] range "a-\d" {#} m/(?[[a-\d{#}]])/',
b69025
  'm/(?[[\w-x]])/' => 'False [] range "\w-" {#} m/(?[[\w-{#}x]])/',
b69025
@@ -431,10 +433,10 @@ my @death_utf8 = mark_as_utf8(
b69025
 
b69025
  '/ネ\p{}ネ/' => 'Empty \p{} {#} m/ネ\p{{#}}ネ/',
b69025
 
b69025
- '/ネ(?[[[:ネ]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ]]])ネ/",
b69025
- '/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ: ])ネ/",
b69025
- '/ネ(?[[[::]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[::]]])ネ/",
b69025
- '/ネ(?[[[:ネ:]]])ネ/' => "Syntax error in (?[...]) in regex m/ネ(?[[[:ネ:]]])ネ/",
b69025
+ '/ネ(?[[[:ネ]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ]]{#}])ネ/",
b69025
+ '/ネ(?[[[:ネ: ])ネ/' => "Syntax error in (?[...]) {#} m/ネ(?[[[:ネ: ])ネ{#}/",
b69025
+ '/ネ(?[[[::]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[::]]{#}])ネ/",
b69025
+ '/ネ(?[[[:ネ:]]])ネ/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[[[:ネ:]]{#}])ネ/",
b69025
  '/ネ(?[[:ネ:]])ネ/' => "",
b69025
  '/ネ(?[ネ])ネ/' =>  'Unexpected character {#} m/ネ(?[ネ{#}])ネ/',
b69025
  '/ネ(?[ + [ネ] ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ +{#} [ネ] ])/',
b69025
@@ -447,8 +449,9 @@ my @death_utf8 = mark_as_utf8(
b69025
  '/(?[ \x{ネ} ])ネ/' => 'Non-hex character {#} m/(?[ \x{ネ{#}} ])ネ/',
b69025
  '/(?[ \p{ネ} ])/' => 'Can\'t find Unicode property definition "ネ" {#} m/(?[ \p{ネ}{#} ])/',
b69025
  '/(?[ \p{ ネ = bar } ])/' => 'Can\'t find Unicode property definition "ネ = bar" {#} m/(?[ \p{ ネ = bar }{#} ])/',
b69025
- '/ネ(?[ \t ]/' => 'Syntax error in (?[...]) in regex m/ネ(?[ \t ]/',
b69025
- '/(?[ \t + \e # ネ This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # ネ This was supposed to be a comment ])/',
b69025
+ '/ネ(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[ \\t ]{#}/",
b69025
+ '/(?[ \t + \e # ネ This was supposed to be a comment ])/' =>
b69025
+    "Syntax error in (?[...]) {#} m/(?[ \\t + \\e # ネ This was supposed to be a comment ]){#}/",
b69025
  'm/(*ネ)ネ/' => q<Unknown verb pattern 'ネ' {#} m/(*ネ){#}ネ/>,
b69025
  '/\cネ/' => "Character following \"\\c\" must be printable ASCII",
b69025
  '/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/",
b69025
diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t
b69025
index 6a79f9d692..e9644bd4e6 100644
b69025
--- a/t/re/regex_sets.t
b69025
+++ b/t/re/regex_sets.t
b69025
@@ -158,13 +158,13 @@ for my $char ("٠", "٥", "٩") {
b69025
 	eval { $_ = '/(?[(\c]) /'; qr/$_/ };
b69025
 	like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
b69025
 	eval { $_ = '(?[\c#]' . "\n])"; qr/$_/ };
b69025
-	like($@, qr/^Syntax error/, '/(?[(\c]) / should not panic');
b69025
+	like($@, qr/^Unexpected/, '/(?[(\c]) / should not panic');
b69025
 	eval { $_ = '(?[(\c])'; qr/$_/ };
b69025
 	like($@, qr/^Syntax error/, '/(?[(\c])/ should be a syntax error');
b69025
 	eval { $_ = '(?[(\c]) ]\b'; qr/$_/ };
b69025
-	like($@, qr/^Syntax error/, '/(?[(\c]) ]\b/ should be a syntax error');
b69025
+	like($@, qr/^Unexpected/, '/(?[(\c]) ]\b/ should be a syntax error');
b69025
 	eval { $_ = '(?[\c[]](])'; qr/$_/ };
b69025
-	like($@, qr/^Syntax error/, '/(?[\c[]](])/ should be a syntax error');
b69025
+	like($@, qr/^Unexpected/, '/(?[\c[]](])/ should be a syntax error');
b69025
 	like("\c#", qr/(?[\c#])/, '\c# should match itself');
b69025
 	like("\c[", qr/(?[\c[])/, '\c[ should match itself');
b69025
 	like("\c\ ", qr/(?[\c\])/, '\c\ should match itself');
b69025
-- 
b69025
2.11.0
b69025