b8876f
From 1b90dad20879f0e7a3eced5da0e0aacda93708ed Mon Sep 17 00:00:00 2001
b8876f
From: Yves Orton <demerphq@gmail.com>
b8876f
Date: Thu, 27 Oct 2016 13:52:24 +0200
b8876f
Subject: [PATCH] regcomp.c: fix perl #129950 - fix firstchar bitmap under utf8
b8876f
 with prefix optimisation
b8876f
MIME-Version: 1.0
b8876f
Content-Type: text/plain; charset=UTF-8
b8876f
Content-Transfer-Encoding: 8bit
b8876f
b8876f
Ported to 5.24.0:
b8876f
b8876f
commit da42332b10691ba7af7550035ffc7f46c87e4e66
b8876f
Author: Yves Orton <demerphq@gmail.com>
b8876f
Date:   Thu Oct 27 13:52:24 2016 +0200
b8876f
b8876f
    regcomp.c: fix perl #129950 - fix firstchar bitmap under utf8 with prefix optimisation
b8876f
b8876f
    The trie code contains a number of sub optimisations, one of which
b8876f
    extracts common prefixes from alternations, and another which is a
b8876f
    bitmap of the possible matching first chars.
b8876f
b8876f
    The bitmap needs to contain the possible first octets of the string
b8876f
    which the trie can match, and for codepoints which might have a different
b8876f
    first octet under utf8 or non-utf8 we need to register BOTH octets.
b8876f
b8876f
    So for instance in the pattern (?:a|a\x{E4}) we should restructure this
b8876f
    as a(|\x{E4}), and the bitmap for the trie should contain both \x{E4} AND
b8876f
    \x{C3} as \x{C3} is the first byte of \x{E4} expressed as utf8.
b8876f
b8876f
Signed-off-by: Petr Písař <ppisar@redhat.com>
b8876f
---
b8876f
 regcomp.c  | 14 ++++++++++++++
b8876f
 t/re/pat.t |  9 ++++++++-
b8876f
 2 files changed, 22 insertions(+), 1 deletion(-)
b8876f
b8876f
diff --git a/regcomp.c b/regcomp.c
b8876f
index 7462885..bcb8db5 100644
b8876f
--- a/regcomp.c
b8876f
+++ b/regcomp.c
b8876f
@@ -3272,6 +3272,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
b8876f
                                     TRIE_BITMAP_SET(trie,*ch);
b8876f
                                     if ( folder )
b8876f
                                         TRIE_BITMAP_SET(trie, folder[ *ch ]);
b8876f
+                                    if ( !UTF ) {
b8876f
+                                        /* store first byte of utf8 representation of
b8876f
+                                           variant codepoints */
b8876f
+                                        if (! UVCHR_IS_INVARIANT(*ch)) {
b8876f
+                                            TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch));
b8876f
+                                        }
b8876f
+                                    }
b8876f
                                     DEBUG_OPTIMISE_r(
b8876f
                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
b8876f
                                     );
b8876f
@@ -3280,6 +3287,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
b8876f
 			    TRIE_BITMAP_SET(trie,*ch);
b8876f
 			    if ( folder )
b8876f
 				TRIE_BITMAP_SET(trie,folder[ *ch ]);
b8876f
+                            if ( !UTF ) {
b8876f
+                                /* store first byte of utf8 representation of
b8876f
+                                   variant codepoints */
b8876f
+                                if (! UVCHR_IS_INVARIANT(*ch)) {
b8876f
+                                    TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch));
b8876f
+                                }
b8876f
+                            }
b8876f
                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
b8876f
 			}
b8876f
                         idx = ofs;
b8876f
diff --git a/t/re/pat.t b/t/re/pat.t
b8876f
index 295a9f7..4aa77cf 100644
b8876f
--- a/t/re/pat.t
b8876f
+++ b/t/re/pat.t
b8876f
@@ -23,7 +23,7 @@ BEGIN {
b8876f
     skip_all_without_unicode_tables();
b8876f
 }
b8876f
 
b8876f
-plan tests => 789;  # Update this when adding/deleting tests.
b8876f
+plan tests => 791;  # Update this when adding/deleting tests.
b8876f
 
b8876f
 run_tests() unless caller;
b8876f
 
b8876f
@@ -1758,6 +1758,13 @@ EOP
b8876f
                 fresh_perl_is($code, $expect, {}, "$bug - $test_name" );
b8876f
             }
b8876f
         }
b8876f
+
b8876f
+        {
b8876f
+            my $str = "a\xE4";
b8876f
+            ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - latin1 case" );
b8876f
+            utf8::upgrade($str);
b8876f
+            ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - utf8 case" );
b8876f
+        }
b8876f
 } # End of sub run_tests
b8876f
 
b8876f
 1;
b8876f
-- 
b8876f
2.7.4
b8876f