683572
From 0c311b7c345769239f38d0139ea7738feec5ca4d Mon Sep 17 00:00:00 2001
683572
From: Karl Williamson <khw@cpan.org>
683572
Date: Sat, 2 Nov 2019 13:59:38 -0600
683572
Subject: [PATCH] toke.c: Fix bug tr/// upgrading to UTF-8 in middle
683572
MIME-Version: 1.0
683572
Content-Type: text/plain; charset=UTF-8
683572
Content-Transfer-Encoding: 8bit
683572
683572
Consider tr/\x{ff}-\x{100}/AB/.
683572
683572
While parsing, the code keeps an offset from the beginning of the output
683572
to the beginning of the second number in the range.  This is purely for
683572
speed so that it wouldn't have to re-find the beginning of that value,
683572
when it already knew it.
683572
683572
But the example above shows the folly of this shortcut.  The second
683572
number in the range causes the output to be upgraded to UTF-8, which
683572
makes that offset invalid in general.  So re-find the beginning instead.
683572
683572
Signed-off-by: Petr Písař <ppisar@redhat.com>
683572
---
683572
 t/op/tr.t | 12 +++++++++++-
683572
 toke.c    |  4 +++-
683572
 2 files changed, 14 insertions(+), 2 deletions(-)
683572
683572
diff --git a/t/op/tr.t b/t/op/tr.t
683572
index 47d603d4fd..25125c5bc7 100644
683572
--- a/t/op/tr.t
683572
+++ b/t/op/tr.t
683572
@@ -13,7 +13,7 @@ BEGIN {
683572
 
683572
 use utf8;
683572
 
683572
-plan tests => 301;
683572
+plan tests => 304;
683572
 
683572
 # Test this first before we extend the stack with other operations.
683572
 # This caused an asan failure due to a bad write past the end of the stack.
683572
@@ -1145,4 +1145,14 @@ for ("", nullrocow) {
683572
                     'RT #133880 illegal \N{}');
683572
 }
683572
 
683572
+{
683572
+    my $c = "\xff";
683572
+    my $d = "\x{104}";
683572
+    eval '$c =~ tr/\x{ff}-\x{104}/\x{100}-\x{105}/';
683572
+    is($@, "", 'tr/\x{ff}-\x{104}/\x{100}-\x{105}/ compiled');
683572
+    is($c, "\x{100}", 'ff -> 100');
683572
+    eval '$d =~ tr/\x{ff}-\x{104}/\x{100}-\x{105}/';
683572
+    is($d, "\x{105}", '104 -> 105');
683572
+}
683572
+
683572
 1;
683572
diff --git a/toke.c b/toke.c
683572
index 2995737af2..28f305c62c 100644
683572
--- a/toke.c
683572
+++ b/toke.c
683572
@@ -3044,7 +3044,7 @@ S_scan_const(pTHX_ char *start)
683572
                  * 'offset_to_max' is the offset in 'sv' at which the character
683572
                  *      (the range's maximum end point) before 'd'  begins.
683572
                  */
683572
-                char * max_ptr = SvPVX(sv) + offset_to_max;
683572
+                char * max_ptr;
683572
                 char * min_ptr;
683572
                 IV range_min;
683572
 		IV range_max;	/* last character in range */
683572
@@ -3056,6 +3056,8 @@ S_scan_const(pTHX_ char *start)
683572
                 IV real_range_max = 0;
683572
 #endif
683572
                 /* Get the code point values of the range ends. */
683572
+                max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
683572
+                offset_to_max = max_ptr - SvPVX_const(sv);
683572
                 if (d_is_utf8) {
683572
                     /* We know the utf8 is valid, because we just constructed
683572
                      * it ourselves in previous loop iterations */
683572
-- 
683572
2.21.0
683572