04bfb0
From 0c311b7c345769239f38d0139ea7738feec5ca4d Mon Sep 17 00:00:00 2001
04bfb0
From: Karl Williamson <khw@cpan.org>
04bfb0
Date: Sat, 2 Nov 2019 13:59:38 -0600
04bfb0
Subject: [PATCH] toke.c: Fix bug tr/// upgrading to UTF-8 in middle
04bfb0
MIME-Version: 1.0
04bfb0
Content-Type: text/plain; charset=UTF-8
04bfb0
Content-Transfer-Encoding: 8bit
04bfb0
04bfb0
Consider tr/\x{ff}-\x{100}/AB/.
04bfb0
04bfb0
While parsing, the code keeps an offset from the beginning of the output
04bfb0
to the beginning of the second number in the range.  This is purely for
04bfb0
speed, so that it doesn't have to re-find the beginning of that value
04bfb0
when it already knows it.
04bfb0
04bfb0
But the example above shows the folly of this shortcut.  The second
04bfb0
number in the range causes the output to be upgraded to UTF-8, which
04bfb0
makes that offset invalid in general.  Change the code to re-find the beginning.
04bfb0
04bfb0
Signed-off-by: Petr Písař <ppisar@redhat.com>
04bfb0
---
04bfb0
 t/op/tr.t | 12 +++++++++++-
04bfb0
 toke.c    |  4 +++-
04bfb0
 2 files changed, 14 insertions(+), 2 deletions(-)
04bfb0
04bfb0
diff --git a/t/op/tr.t b/t/op/tr.t
04bfb0
index 47d603d4fd..25125c5bc7 100644
04bfb0
--- a/t/op/tr.t
04bfb0
+++ b/t/op/tr.t
04bfb0
@@ -13,7 +13,7 @@ BEGIN {
04bfb0
 
04bfb0
 use utf8;
04bfb0
 
04bfb0
-plan tests => 301;
04bfb0
+plan tests => 304;
04bfb0
 
04bfb0
 # Test this first before we extend the stack with other operations.
04bfb0
 # This caused an asan failure due to a bad write past the end of the stack.
04bfb0
@@ -1145,4 +1145,14 @@ for ("", nullrocow) {
04bfb0
                     'RT #133880 illegal \N{}');
04bfb0
 }
04bfb0
 
04bfb0
+{
04bfb0
+    my $c = "\xff";
04bfb0
+    my $d = "\x{104}";
04bfb0
+    eval '$c =~ tr/\x{ff}-\x{104}/\x{100}-\x{105}/';
04bfb0
+    is($@, "", 'tr/\x{ff}-\x{104}/\x{100}-\x{105}/ compiled');
04bfb0
+    is($c, "\x{100}", 'ff -> 100');
04bfb0
+    eval '$d =~ tr/\x{ff}-\x{104}/\x{100}-\x{105}/';
04bfb0
+    is($d, "\x{105}", '104 -> 105');
04bfb0
+}
04bfb0
+
04bfb0
 1;
04bfb0
diff --git a/toke.c b/toke.c
04bfb0
index 2995737af2..28f305c62c 100644
04bfb0
--- a/toke.c
04bfb0
+++ b/toke.c
04bfb0
@@ -3044,7 +3044,7 @@ S_scan_const(pTHX_ char *start)
04bfb0
                  * 'offset_to_max' is the offset in 'sv' at which the character
04bfb0
                  *      (the range's maximum end point) before 'd'  begins.
04bfb0
                  */
04bfb0
-                char * max_ptr = SvPVX(sv) + offset_to_max;
04bfb0
+                char * max_ptr;
04bfb0
                 char * min_ptr;
04bfb0
                 IV range_min;
04bfb0
 		IV range_max;	/* last character in range */
04bfb0
@@ -3056,6 +3056,8 @@ S_scan_const(pTHX_ char *start)
04bfb0
                 IV real_range_max = 0;
04bfb0
 #endif
04bfb0
                 /* Get the code point values of the range ends. */
04bfb0
+                max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
04bfb0
+                offset_to_max = max_ptr - SvPVX_const(sv);
04bfb0
                 if (d_is_utf8) {
04bfb0
                     /* We know the utf8 is valid, because we just constructed
04bfb0
                      * it ourselves in previous loop iterations */
04bfb0
-- 
04bfb0
2.21.0
04bfb0