|
|
b8876f |
From 1b90dad20879f0e7a3eced5da0e0aacda93708ed Mon Sep 17 00:00:00 2001
|
|
|
b8876f |
From: Yves Orton <demerphq@gmail.com>
|
|
|
b8876f |
Date: Thu, 27 Oct 2016 13:52:24 +0200
|
|
|
b8876f |
Subject: [PATCH] regcomp.c: fix perl #129950 - fix firstchar bitmap under utf8
|
|
|
b8876f |
with prefix optimisation
|
|
|
b8876f |
MIME-Version: 1.0
|
|
|
b8876f |
Content-Type: text/plain; charset=UTF-8
|
|
|
b8876f |
Content-Transfer-Encoding: 8bit
|
|
|
b8876f |
|
|
|
b8876f |
Ported to 5.24.0:
|
|
|
b8876f |
|
|
|
b8876f |
commit da42332b10691ba7af7550035ffc7f46c87e4e66
|
|
|
b8876f |
Author: Yves Orton <demerphq@gmail.com>
|
|
|
b8876f |
Date: Thu Oct 27 13:52:24 2016 +0200
|
|
|
b8876f |
|
|
|
b8876f |
regcomp.c: fix perl #129950 - fix firstchar bitmap under utf8 with prefix optimisation
|
|
|
b8876f |
|
|
|
b8876f |
The trie code contains a number of sub optimisations, one of which
|
|
|
b8876f |
extracts common prefixes from alternations, and another which is a
|
|
|
b8876f |
bitmap of the possible matching first chars.
|
|
|
b8876f |
|
|
|
b8876f |
The bitmap needs to contain the possible first octets of the string
|
|
|
b8876f |
which the trie can match, and for codepoints which might have a different
|
|
|
b8876f |
first octet under utf8 or non-utf8 we need to register BOTH octets.
|
|
|
b8876f |
|
|
|
b8876f |
So for instance in the pattern (?:a|a\x{E4}) we should restructure this
|
|
|
b8876f |
as a(|\x{E4}), and the bitmap for the trie should contain both \x{E4} AND
|
|
|
b8876f |
\x{C3} as \x{C3} is the first byte of \x{E4} expressed as utf8.
|
|
|
b8876f |
|
|
|
b8876f |
Signed-off-by: Petr Písař <ppisar@redhat.com>
|
|
|
b8876f |
---
|
|
|
b8876f |
regcomp.c | 14 ++++++++++++++
|
|
|
b8876f |
t/re/pat.t | 9 ++++++++-
|
|
|
b8876f |
2 files changed, 22 insertions(+), 1 deletion(-)
|
|
|
b8876f |
|
|
|
b8876f |
diff --git a/regcomp.c b/regcomp.c
|
|
|
b8876f |
index 7462885..bcb8db5 100644
|
|
|
b8876f |
--- a/regcomp.c
|
|
|
b8876f |
+++ b/regcomp.c
|
|
|
b8876f |
@@ -3272,6 +3272,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
|
|
|
b8876f |
TRIE_BITMAP_SET(trie,*ch);
|
|
|
b8876f |
if ( folder )
|
|
|
b8876f |
TRIE_BITMAP_SET(trie, folder[ *ch ]);
|
|
|
b8876f |
+ if ( !UTF ) {
|
|
|
b8876f |
+ /* store first byte of utf8 representation of
|
|
|
b8876f |
+ variant codepoints */
|
|
|
b8876f |
+ if (! UVCHR_IS_INVARIANT(*ch)) {
|
|
|
b8876f |
+ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch));
|
|
|
b8876f |
+ }
|
|
|
b8876f |
+ }
|
|
|
b8876f |
DEBUG_OPTIMISE_r(
|
|
|
b8876f |
Perl_re_printf( aTHX_ "%s", (char*)ch)
|
|
|
b8876f |
);
|
|
|
b8876f |
@@ -3280,6 +3287,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
|
|
|
b8876f |
TRIE_BITMAP_SET(trie,*ch);
|
|
|
b8876f |
if ( folder )
|
|
|
b8876f |
TRIE_BITMAP_SET(trie,folder[ *ch ]);
|
|
|
b8876f |
+ if ( !UTF ) {
|
|
|
b8876f |
+ /* store first byte of utf8 representation of
|
|
|
b8876f |
+ variant codepoints */
|
|
|
b8876f |
+ if (! UVCHR_IS_INVARIANT(*ch)) {
|
|
|
b8876f |
+ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(*ch));
|
|
|
b8876f |
+ }
|
|
|
b8876f |
+ }
|
|
|
b8876f |
DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
|
|
|
b8876f |
}
|
|
|
b8876f |
idx = ofs;
|
|
|
b8876f |
diff --git a/t/re/pat.t b/t/re/pat.t
|
|
|
b8876f |
index 295a9f7..4aa77cf 100644
|
|
|
b8876f |
--- a/t/re/pat.t
|
|
|
b8876f |
+++ b/t/re/pat.t
|
|
|
b8876f |
@@ -23,7 +23,7 @@ BEGIN {
|
|
|
b8876f |
skip_all_without_unicode_tables();
|
|
|
b8876f |
}
|
|
|
b8876f |
|
|
|
b8876f |
-plan tests => 789; # Update this when adding/deleting tests.
|
|
|
b8876f |
+plan tests => 791; # Update this when adding/deleting tests.
|
|
|
b8876f |
|
|
|
b8876f |
run_tests() unless caller;
|
|
|
b8876f |
|
|
|
b8876f |
@@ -1758,6 +1758,13 @@ EOP
|
|
|
b8876f |
fresh_perl_is($code, $expect, {}, "$bug - $test_name" );
|
|
|
b8876f |
}
|
|
|
b8876f |
}
|
|
|
b8876f |
+
|
|
|
b8876f |
+ {
|
|
|
b8876f |
+ my $str = "a\xE4";
|
|
|
b8876f |
+ ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - latin1 case" );
|
|
|
b8876f |
+ utf8::upgrade($str);
|
|
|
b8876f |
+ ok( $str =~ m{^(a|a\x{e4})$}, "fix [perl #129950] - utf8 case" );
|
|
|
b8876f |
+ }
|
|
|
b8876f |
} # End of sub run_tests
|
|
|
b8876f |
|
|
|
b8876f |
1;
|
|
|
b8876f |
--
|
|
|
b8876f |
2.7.4
|
|
|
b8876f |
|