Blame SOURCES/perl-5.27.1-RT-130907-Fix-the-Unicode-Bug-in-split.patch

243a19
From 5aca16e032861ea3dfcc96ad417ea87e2b1552e5 Mon Sep 17 00:00:00 2001
243a19
From: Aaron Crane <arc@cpan.org>
243a19
Date: Sat, 4 Mar 2017 12:50:58 +0000
243a19
Subject: [PATCH] RT #130907: Fix the Unicode Bug in split " "
243a19
MIME-Version: 1.0
243a19
Content-Type: text/plain; charset=UTF-8
243a19
Content-Transfer-Encoding: 8bit
243a19
243a19
Ported to 5.26.0:
243a19
243a19
commit 20ae58f7a9bbf84d043d6e90f5988b6e3ca4ee3d
243a19
Author: Aaron Crane <arc@cpan.org>
243a19
Date:   Sat Mar 4 12:50:58 2017 +0000
243a19
243a19
    RT #130907: Fix the Unicode Bug in split " "
243a19
243a19
Signed-off-by: Petr Písař <ppisar@redhat.com>
243a19
---
243a19
 lib/feature.pm       |  5 +++--
243a19
 pod/perldelta.pod    |  9 +++++++++
243a19
 pod/perlfunc.pod     |  8 ++++++++
243a19
 pod/perlunicode.pod  | 11 +++++++++++
243a19
 pod/perluniintro.pod |  5 +++--
243a19
 pp.c                 | 13 +++++++++++++
243a19
 regen/feature.pl     |  5 +++--
243a19
 t/op/split.t         | 20 +++++++++++++++++++-
243a19
 8 files changed, 69 insertions(+), 7 deletions(-)
243a19
243a19
diff --git a/lib/feature.pm b/lib/feature.pm
243a19
index ed13273..93e020b 100644
243a19
--- a/lib/feature.pm
243a19
+++ b/lib/feature.pm
243a19
@@ -175,8 +175,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
243a19
 
243a19
 This feature is available starting with Perl 5.12; was almost fully
243a19
 implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
243a19
-and extended further in Perl 5.26 to cover L<the range
243a19
-operator|perlop/Range Operators>.
243a19
+was extended further in Perl 5.26 to cover L<the range
243a19
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
243a19
+cover L<special-cased whitespace splitting|perlfunc/split>.
243a19
 
243a19
 =head2 The 'unicode_eval' and 'evalbytes' features
243a19
 
243a19
#diff --git a/pod/perldelta.pod b/pod/perldelta.pod
243a19
#index 06dcd1d..d31335f 100644
243a19
#--- a/pod/perldelta.pod
243a19
#+++ b/pod/perldelta.pod
243a19
#@@ -3206,6 +3206,15 @@ calls.
243a19
# Parsing bad POSIX charclasses no longer leaks memory.
243a19
# L<[perl #128313]|https://rt.perl.org/Public/Bug/Display.html?id=128313>
243a19
# 
243a19
#+=item *
243a19
#+
243a19
#+C<split ' '> now correctly handles the argument being split when in the
243a19
#+scope of the L<< C<unicode_strings>|feature/"The 'unicode_strings' feature"
243a19
#+>> feature. Previously, when a string using the single-byte internal
243a19
#+representation contained characters that are whitespace by Unicode rules but
243a19
#+not by ASCII rules, it treated those characters as part of fields rather
243a19
#+than as field separators.  [perl #130907]
243a19
#+
243a19
# =back
243a19
# 
243a19
# =head1 Known Problems
243a19
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
243a19
index b8dca6e..9abadf4 100644
243a19
--- a/pod/perlfunc.pod
243a19
+++ b/pod/perlfunc.pod
243a19
@@ -7616,6 +7616,14 @@ special case was restricted to the use of a plain S<C<" ">> as the
243a19
 pattern argument to split; in Perl 5.18.0 and later this special case is
243a19
 triggered by any expression which evaluates to the simple string S<C<" ">>.
243a19
 
243a19
+As of Perl 5.28, this special-cased whitespace splitting works as expected in
243a19
+the scope of L<< S<C<"use feature 'unicode_strings'">>|feature/The
243a19
+'unicode_strings' feature >>. In previous versions, and outside the scope of
243a19
+that feature, it exhibits L<perlunicode/The "Unicode Bug">: characters that are
243a19
+whitespace according to Unicode rules but not according to ASCII rules can be
243a19
+treated as part of fields rather than as field separators, depending on the
243a19
+string's internal encoding.
243a19
+
243a19
 If omitted, PATTERN defaults to a single space, S<C<" ">>, triggering
243a19
 the previously described I<awk> emulation.
243a19
 
243a19
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
243a19
index 9c13c35..2e84e95 100644
243a19
--- a/pod/perlunicode.pod
243a19
+++ b/pod/perlunicode.pod
243a19
@@ -1835,6 +1835,17 @@ outside its scope, it could produce strings whose length in characters
243a19
 exceeded that of the right-hand side, where the right-hand side took up more
243a19
 bytes than the correct range endpoint.
243a19
 
243a19
+=item *
243a19
+
243a19
+In L<< C<split>'s special-case whitespace splitting|perlfunc/split >>.
243a19
+
243a19
+Starting in Perl 5.28.0, the C<split> function with a pattern specified as
243a19
+a string containing a single space handles whitespace characters consistently
243a19
+within the scope of C<unicode_strings>. Prior to that, or outside its scope,
243a19
+characters that are whitespace according to Unicode rules but not according to
243a19
+ASCII rules were treated as field contents rather than field separators when
243a19
+they appear in byte-encoded strings.
243a19
+
243a19
 =back
243a19
 
243a19
 You can see from the above that the effect of C<unicode_strings>
243a19
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod
243a19
index d35de34..595ec46 100644
243a19
--- a/pod/perluniintro.pod
243a19
+++ b/pod/perluniintro.pod
243a19
@@ -151,11 +151,12 @@ serious Unicode work.  The maintenance release 5.6.1 fixed many of the
243a19
 problems of the initial Unicode implementation, but for example
243a19
 regular expressions still do not work with Unicode in 5.6.1.
243a19
 Perl v5.14.0 is the first release where Unicode support is
243a19
-(almost) seamlessly integrable without some gotchas. (There are two
243a19
+(almost) seamlessly integrable without some gotchas. (There are a few
243a19
 exceptions. Firstly, some differences in L<quotemeta|perlfunc/quotemeta>
243a19
 were fixed starting in Perl 5.16.0. Secondly, some differences in
243a19
 L<the range operator|perlop/Range Operators> were fixed starting in
243a19
-Perl 5.26.0.)
243a19
+Perl 5.26.0. Thirdly, some differences in L<split|perlfunc/split> were fixed
243a19
+starting in Perl 5.28.0.)
243a19
 
243a19
 To enable this
243a19
 seamless support, you should C<use feature 'unicode_strings'> (which is
243a19
diff --git a/pp.c b/pp.c
243a19
index cc4cb59..d9dd005 100644
243a19
--- a/pp.c
243a19
+++ b/pp.c
243a19
@@ -5740,6 +5740,7 @@ PP(pp_split)
243a19
     STRLEN len;
243a19
     const char *s = SvPV_const(sv, len);
243a19
     const bool do_utf8 = DO_UTF8(sv);
243a19
+    const bool in_uni_8_bit = IN_UNI_8_BIT;
243a19
     const char *strend = s + len;
243a19
     PMOP *pm = cPMOPx(PL_op);
243a19
     REGEXP *rx;
243a19
@@ -5826,6 +5827,10 @@ PP(pp_split)
243a19
 	    while (s < strend && isSPACE_LC(*s))
243a19
 		s++;
243a19
 	}
243a19
+        else if (in_uni_8_bit) {
243a19
+            while (s < strend && isSPACE_L1(*s))
243a19
+                s++;
243a19
+        }
243a19
 	else {
243a19
 	    while (s < strend && isSPACE(*s))
243a19
 		s++;
243a19
@@ -5857,6 +5862,10 @@ PP(pp_split)
243a19
             {
243a19
 	        while (m < strend && !isSPACE_LC(*m))
243a19
 		    ++m;
243a19
+            }
243a19
+            else if (in_uni_8_bit) {
243a19
+                while (m < strend && !isSPACE_L1(*m))
243a19
+                    ++m;
243a19
             } else {
243a19
                 while (m < strend && !isSPACE(*m))
243a19
                     ++m;
243a19
@@ -5891,6 +5900,10 @@ PP(pp_split)
243a19
             {
243a19
 	        while (s < strend && isSPACE_LC(*s))
243a19
 		    ++s;
243a19
+            }
243a19
+            else if (in_uni_8_bit) {
243a19
+                while (s < strend && isSPACE_L1(*s))
243a19
+                    ++s;
243a19
             } else {
243a19
                 while (s < strend && isSPACE(*s))
243a19
                     ++s;
243a19
diff --git a/regen/feature.pl b/regen/feature.pl
243a19
index 579120e..8a4ce63 100755
243a19
--- a/regen/feature.pl
243a19
+++ b/regen/feature.pl
243a19
@@ -485,8 +485,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
243a19
 
243a19
 This feature is available starting with Perl 5.12; was almost fully
243a19
 implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
243a19
-and extended further in Perl 5.26 to cover L<the range
243a19
-operator|perlop/Range Operators>.
243a19
+was extended further in Perl 5.26 to cover L<the range
243a19
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
243a19
+cover L<special-cased whitespace splitting|perlfunc/split>.
243a19
 
243a19
 =head2 The 'unicode_eval' and 'evalbytes' features
243a19
 
243a19
diff --git a/t/op/split.t b/t/op/split.t
243a19
index d60bcaf..038c5d7 100644
243a19
--- a/t/op/split.t
243a19
+++ b/t/op/split.t
243a19
@@ -7,7 +7,7 @@ BEGIN {
243a19
     set_up_inc('../lib');
243a19
 }
243a19
 
243a19
-plan tests => 163;
243a19
+plan tests => 172;
243a19
 
243a19
 $FS = ':';
243a19
 
243a19
@@ -480,6 +480,24 @@ is($cnt, scalar(@ary));
243a19
         qq{split(\$cond ? qr/ / : " ", "$exp") behaves as expected over repeated similar patterns};
243a19
 }
243a19
 
243a19
+SKIP: {
243a19
+    # RT #130907: unicode_strings feature doesn't work with split ' '
243a19
+
243a19
+    my ($sp) = grep /\s/u, map chr, reverse 128 .. 255 # prefer \xA0 over \x85
243a19
+        or skip 'no unicode whitespace found in high-8-bit range', 9;
243a19
+
243a19
+    for (["$sp$sp. /", "leading unicode whitespace"],
243a19
+         [".$sp$sp/",  "unicode whitespace separator"],
243a19
+         [". /$sp$sp", "trailing unicode whitespace"]) {
243a19
+        my ($str, $desc) = @$_;
243a19
+        use feature "unicode_strings";
243a19
+        my @got = split " ", $str;
243a19
+        is @got, 2, "whitespace split: $desc: field count";
243a19
+        is $got[0], '.', "whitespace split: $desc: field 0";
243a19
+        is $got[1], '/', "whitespace split: $desc: field 1";
243a19
+    }
243a19
+}
243a19
+
243a19
 {
243a19
     # 'RT #116086: split "\x20" does not work as documented';
243a19
     my @results;
243a19
-- 
243a19
2.9.4
243a19