b8c914
From 5aca16e032861ea3dfcc96ad417ea87e2b1552e5 Mon Sep 17 00:00:00 2001
b8c914
From: Aaron Crane <arc@cpan.org>
b8c914
Date: Sat, 4 Mar 2017 12:50:58 +0000
b8c914
Subject: [PATCH] RT #130907: Fix the Unicode Bug in split " "
b8c914
MIME-Version: 1.0
b8c914
Content-Type: text/plain; charset=UTF-8
b8c914
Content-Transfer-Encoding: 8bit
b8c914
b8c914
Ported to 5.26.0:
b8c914
b8c914
commit 20ae58f7a9bbf84d043d6e90f5988b6e3ca4ee3d
b8c914
Author: Aaron Crane <arc@cpan.org>
b8c914
Date:   Sat Mar 4 12:50:58 2017 +0000
b8c914
b8c914
    RT #130907: Fix the Unicode Bug in split " "
b8c914
b8c914
Signed-off-by: Petr Písař <ppisar@redhat.com>
b8c914
---
b8c914
 lib/feature.pm       |  5 +++--
b8c914
 pod/perldelta.pod    |  9 +++++++++
b8c914
 pod/perlfunc.pod     |  8 ++++++++
b8c914
 pod/perlunicode.pod  | 11 +++++++++++
b8c914
 pod/perluniintro.pod |  5 +++--
b8c914
 pp.c                 | 13 +++++++++++++
b8c914
 regen/feature.pl     |  5 +++--
b8c914
 t/op/split.t         | 20 +++++++++++++++++++-
b8c914
 8 files changed, 69 insertions(+), 7 deletions(-)
b8c914
b8c914
diff --git a/lib/feature.pm b/lib/feature.pm
b8c914
index ed13273..93e020b 100644
b8c914
--- a/lib/feature.pm
b8c914
+++ b/lib/feature.pm
b8c914
@@ -175,8 +175,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
b8c914
 
b8c914
 This feature is available starting with Perl 5.12; was almost fully
b8c914
 implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
b8c914
-and extended further in Perl 5.26 to cover L<the range
b8c914
-operator|perlop/Range Operators>.
b8c914
+was extended further in Perl 5.26 to cover L<the range
b8c914
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
b8c914
+cover L<special-cased whitespace splitting|perlfunc/split>.
b8c914
 
b8c914
 =head2 The 'unicode_eval' and 'evalbytes' features
b8c914
 
b8c914
#diff --git a/pod/perldelta.pod b/pod/perldelta.pod
b8c914
#index 06dcd1d..d31335f 100644
b8c914
#--- a/pod/perldelta.pod
b8c914
#+++ b/pod/perldelta.pod
b8c914
#@@ -3206,6 +3206,15 @@ calls.
b8c914
# Parsing bad POSIX charclasses no longer leaks memory.
b8c914
# L<[perl #128313]|https://rt.perl.org/Public/Bug/Display.html?id=128313>
b8c914
# 
b8c914
#+=item *
b8c914
#+
b8c914
#+C<split ' '> now correctly handles the argument being split when in the
b8c914
#+scope of the L<< C<unicode_strings>|feature/"The 'unicode_strings' feature"
b8c914
#+>> feature. Previously, when a string using the single-byte internal
b8c914
#+representation contained characters that are whitespace by Unicode rules but
b8c914
#+not by ASCII rules, it treated those characters as part of fields rather
b8c914
#+than as field separators.  [perl #130907]
b8c914
#+
b8c914
# =back
b8c914
# 
b8c914
# =head1 Known Problems
b8c914
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
b8c914
index b8dca6e..9abadf4 100644
b8c914
--- a/pod/perlfunc.pod
b8c914
+++ b/pod/perlfunc.pod
b8c914
@@ -7616,6 +7616,14 @@ special case was restricted to the use of a plain S<C<" ">> as the
b8c914
 pattern argument to split; in Perl 5.18.0 and later this special case is
b8c914
 triggered by any expression which evaluates to the simple string S<C<" ">>.
b8c914
 
b8c914
+As of Perl 5.28, this special-cased whitespace splitting works as expected in
b8c914
+the scope of L<< S<C<"use feature 'unicode_strings'">>|feature/The
b8c914
+'unicode_strings' feature >>. In previous versions, and outside the scope of
b8c914
+that feature, it exhibits L<perlunicode/The "Unicode Bug">: characters that are
b8c914
+whitespace according to Unicode rules but not according to ASCII rules can be
b8c914
+treated as part of fields rather than as field separators, depending on the
b8c914
+string's internal encoding.
b8c914
+
b8c914
 If omitted, PATTERN defaults to a single space, S<C<" ">>, triggering
b8c914
 the previously described I<awk> emulation.
b8c914
 
b8c914
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
b8c914
index 9c13c35..2e84e95 100644
b8c914
--- a/pod/perlunicode.pod
b8c914
+++ b/pod/perlunicode.pod
b8c914
@@ -1835,6 +1835,17 @@ outside its scope, it could produce strings whose length in characters
b8c914
 exceeded that of the right-hand side, where the right-hand side took up more
b8c914
 bytes than the correct range endpoint.
b8c914
 
b8c914
+=item *
b8c914
+
b8c914
+In L<< C<split>'s special-case whitespace splitting|perlfunc/split >>.
b8c914
+
b8c914
+Starting in Perl 5.28.0, the C<split> function with a pattern specified as
b8c914
+a string containing a single space handles whitespace characters consistently
b8c914
+within the scope of C<unicode_strings>. Prior to that, or outside its scope,
b8c914
+characters that are whitespace according to Unicode rules but not according to
b8c914
+ASCII rules were treated as field contents rather than field separators when
b8c914
+they appeared in byte-encoded strings.
b8c914
+
b8c914
 =back
b8c914
 
b8c914
 You can see from the above that the effect of C<unicode_strings>
b8c914
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod
b8c914
index d35de34..595ec46 100644
b8c914
--- a/pod/perluniintro.pod
b8c914
+++ b/pod/perluniintro.pod
b8c914
@@ -151,11 +151,12 @@ serious Unicode work.  The maintenance release 5.6.1 fixed many of the
b8c914
 problems of the initial Unicode implementation, but for example
b8c914
 regular expressions still do not work with Unicode in 5.6.1.
b8c914
 Perl v5.14.0 is the first release where Unicode support is
b8c914
-(almost) seamlessly integrable without some gotchas. (There are two
b8c914
+(almost) seamlessly integrable without some gotchas. (There are a few
b8c914
 exceptions. Firstly, some differences in L<quotemeta|perlfunc/quotemeta>
b8c914
 were fixed starting in Perl 5.16.0. Secondly, some differences in
b8c914
 L<the range operator|perlop/Range Operators> were fixed starting in
b8c914
-Perl 5.26.0.)
b8c914
+Perl 5.26.0. Thirdly, some differences in L<split|perlfunc/split> were fixed
b8c914
+starting in Perl 5.28.0.)
b8c914
 
b8c914
 To enable this
b8c914
 seamless support, you should C<use feature 'unicode_strings'> (which is
b8c914
diff --git a/pp.c b/pp.c
b8c914
index cc4cb59..d9dd005 100644
b8c914
--- a/pp.c
b8c914
+++ b/pp.c
b8c914
@@ -5740,6 +5740,7 @@ PP(pp_split)
b8c914
     STRLEN len;
b8c914
     const char *s = SvPV_const(sv, len);
b8c914
     const bool do_utf8 = DO_UTF8(sv);
b8c914
+    const bool in_uni_8_bit = IN_UNI_8_BIT;
b8c914
     const char *strend = s + len;
b8c914
     PMOP *pm = cPMOPx(PL_op);
b8c914
     REGEXP *rx;
b8c914
@@ -5826,6 +5827,10 @@ PP(pp_split)
b8c914
 	    while (s < strend && isSPACE_LC(*s))
b8c914
 		s++;
b8c914
 	}
b8c914
+        else if (in_uni_8_bit) {
b8c914
+            while (s < strend && isSPACE_L1(*s))
b8c914
+                s++;
b8c914
+        }
b8c914
 	else {
b8c914
 	    while (s < strend && isSPACE(*s))
b8c914
 		s++;
b8c914
@@ -5857,6 +5862,10 @@ PP(pp_split)
b8c914
             {
b8c914
 	        while (m < strend && !isSPACE_LC(*m))
b8c914
 		    ++m;
b8c914
+            }
b8c914
+            else if (in_uni_8_bit) {
b8c914
+                while (m < strend && !isSPACE_L1(*m))
b8c914
+                    ++m;
b8c914
             } else {
b8c914
                 while (m < strend && !isSPACE(*m))
b8c914
                     ++m;
b8c914
@@ -5891,6 +5900,10 @@ PP(pp_split)
b8c914
             {
b8c914
 	        while (s < strend && isSPACE_LC(*s))
b8c914
 		    ++s;
b8c914
+            }
b8c914
+            else if (in_uni_8_bit) {
b8c914
+                while (s < strend && isSPACE_L1(*s))
b8c914
+                    ++s;
b8c914
             } else {
b8c914
                 while (s < strend && isSPACE(*s))
b8c914
                     ++s;
b8c914
diff --git a/regen/feature.pl b/regen/feature.pl
b8c914
index 579120e..8a4ce63 100755
b8c914
--- a/regen/feature.pl
b8c914
+++ b/regen/feature.pl
b8c914
@@ -485,8 +485,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
b8c914
 
b8c914
 This feature is available starting with Perl 5.12; was almost fully
b8c914
 implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
b8c914
-and extended further in Perl 5.26 to cover L<the range
b8c914
-operator|perlop/Range Operators>.
b8c914
+was extended further in Perl 5.26 to cover L<the range
b8c914
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
b8c914
+cover L<special-cased whitespace splitting|perlfunc/split>.
b8c914
 
b8c914
 =head2 The 'unicode_eval' and 'evalbytes' features
b8c914
 
b8c914
diff --git a/t/op/split.t b/t/op/split.t
b8c914
index d60bcaf..038c5d7 100644
b8c914
--- a/t/op/split.t
b8c914
+++ b/t/op/split.t
b8c914
@@ -7,7 +7,7 @@ BEGIN {
b8c914
     set_up_inc('../lib');
b8c914
 }
b8c914
 
b8c914
-plan tests => 163;
b8c914
+plan tests => 172;
b8c914
 
b8c914
 $FS = ':';
b8c914
 
b8c914
@@ -480,6 +480,24 @@ is($cnt, scalar(@ary));
b8c914
         qq{split(\$cond ? qr/ / : " ", "$exp") behaves as expected over repeated similar patterns};
b8c914
 }
b8c914
 
b8c914
+SKIP: {
b8c914
+    # RT #130907: unicode_strings feature doesn't work with split ' '
b8c914
+
b8c914
+    my ($sp) = grep /\s/u, map chr, reverse 128 .. 255 # prefer \xA0 over \x85
b8c914
+        or skip 'no unicode whitespace found in high-8-bit range', 9;
b8c914
+
b8c914
+    for (["$sp$sp. /", "leading unicode whitespace"],
b8c914
+         [".$sp$sp/",  "unicode whitespace separator"],
b8c914
+         [". /$sp$sp", "trailing unicode whitespace"]) {
b8c914
+        my ($str, $desc) = @$_;
b8c914
+        use feature "unicode_strings";
b8c914
+        my @got = split " ", $str;
b8c914
+        is @got, 2, "whitespace split: $desc: field count";
b8c914
+        is $got[0], '.', "whitespace split: $desc: field 0";
b8c914
+        is $got[1], '/', "whitespace split: $desc: field 1";
b8c914
+    }
b8c914
+}
b8c914
+
b8c914
 {
b8c914
     # 'RT #116086: split "\x20" does not work as documented';
b8c914
     my @results;
b8c914
-- 
b8c914
2.9.4
b8c914