f6ea51
From 5aca16e032861ea3dfcc96ad417ea87e2b1552e5 Mon Sep 17 00:00:00 2001
f6ea51
From: Aaron Crane <arc@cpan.org>
f6ea51
Date: Sat, 4 Mar 2017 12:50:58 +0000
f6ea51
Subject: [PATCH] RT #130907: Fix the Unicode Bug in split " "
f6ea51
MIME-Version: 1.0
f6ea51
Content-Type: text/plain; charset=UTF-8
f6ea51
Content-Transfer-Encoding: 8bit
f6ea51
f6ea51
Ported to 5.26.0:
f6ea51
f6ea51
commit 20ae58f7a9bbf84d043d6e90f5988b6e3ca4ee3d
f6ea51
Author: Aaron Crane <arc@cpan.org>
f6ea51
Date:   Sat Mar 4 12:50:58 2017 +0000
f6ea51
f6ea51
    RT #130907: Fix the Unicode Bug in split " "
f6ea51
f6ea51
Signed-off-by: Petr Písař <ppisar@redhat.com>
f6ea51
---
f6ea51
 lib/feature.pm       |  5 +++--
f6ea51
 pod/perldelta.pod    |  9 +++++++++
f6ea51
 pod/perlfunc.pod     |  8 ++++++++
f6ea51
 pod/perlunicode.pod  | 11 +++++++++++
f6ea51
 pod/perluniintro.pod |  5 +++--
f6ea51
 pp.c                 | 13 +++++++++++++
f6ea51
 regen/feature.pl     |  5 +++--
f6ea51
 t/op/split.t         | 20 +++++++++++++++++++-
f6ea51
 8 files changed, 69 insertions(+), 7 deletions(-)
f6ea51
f6ea51
diff --git a/lib/feature.pm b/lib/feature.pm
f6ea51
index ed13273..93e020b 100644
f6ea51
--- a/lib/feature.pm
f6ea51
+++ b/lib/feature.pm
f6ea51
@@ -175,8 +175,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
f6ea51
 
f6ea51
 This feature is available starting with Perl 5.12; was almost fully
f6ea51
 implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
f6ea51
-and extended further in Perl 5.26 to cover L<the range
f6ea51
-operator|perlop/Range Operators>.
f6ea51
+was extended further in Perl 5.26 to cover L<the range
f6ea51
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
f6ea51
+cover L<special-cased whitespace splitting|perlfunc/split>.
f6ea51
 
f6ea51
 =head2 The 'unicode_eval' and 'evalbytes' features
f6ea51
 
f6ea51
#diff --git a/pod/perldelta.pod b/pod/perldelta.pod
f6ea51
#index 06dcd1d..d31335f 100644
f6ea51
#--- a/pod/perldelta.pod
f6ea51
#+++ b/pod/perldelta.pod
f6ea51
#@@ -3206,6 +3206,15 @@ calls.
f6ea51
# Parsing bad POSIX charclasses no longer leaks memory.
f6ea51
# L<[perl #128313]|https://rt.perl.org/Public/Bug/Display.html?id=128313>
f6ea51
# 
f6ea51
#+=item *
f6ea51
#+
f6ea51
#+C<split ' '> now correctly handles the argument being split when in the
f6ea51
#+scope of the L<< C<unicode_strings>|feature/"The 'unicode_strings' feature"
f6ea51
#+>> feature. Previously, when a string using the single-byte internal
f6ea51
#+representation contained characters that are whitespace by Unicode rules but
f6ea51
#+not by ASCII rules, it treated those characters as part of fields rather
f6ea51
#+than as field separators.  [perl #130907]
f6ea51
#+
f6ea51
# =back
f6ea51
# 
f6ea51
# =head1 Known Problems
f6ea51
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
f6ea51
index b8dca6e..9abadf4 100644
f6ea51
--- a/pod/perlfunc.pod
f6ea51
+++ b/pod/perlfunc.pod
f6ea51
@@ -7616,6 +7616,14 @@ special case was restricted to the use of a plain S<C<" ">> as the
f6ea51
 pattern argument to split; in Perl 5.18.0 and later this special case is
f6ea51
 triggered by any expression which evaluates to the simple string S<C<" ">>.
f6ea51
 
f6ea51
+As of Perl 5.28, this special-cased whitespace splitting works as expected in
f6ea51
+the scope of L<< S<C<"use feature 'unicode_strings'">>|feature/The
f6ea51
+'unicode_strings' feature >>. In previous versions, and outside the scope of
f6ea51
+that feature, it exhibits L<perlunicode/The "Unicode Bug">: characters that are
f6ea51
+whitespace according to Unicode rules but not according to ASCII rules can be
f6ea51
+treated as part of fields rather than as field separators, depending on the
f6ea51
+string's internal encoding.
f6ea51
+
f6ea51
 If omitted, PATTERN defaults to a single space, S<C<" ">>, triggering
f6ea51
 the previously described I<awk> emulation.
f6ea51
 
f6ea51
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
f6ea51
index 9c13c35..2e84e95 100644
f6ea51
--- a/pod/perlunicode.pod
f6ea51
+++ b/pod/perlunicode.pod
f6ea51
@@ -1835,6 +1835,17 @@ outside its scope, it could produce strings whose length in characters
f6ea51
 exceeded that of the right-hand side, where the right-hand side took up more
f6ea51
 bytes than the correct range endpoint.
f6ea51
 
f6ea51
+=item *
f6ea51
+
f6ea51
+In L<< C<split>'s special-case whitespace splitting|perlfunc/split >>.
f6ea51
+
f6ea51
+Starting in Perl 5.28.0, the C<split> function with a pattern specified as
f6ea51
+a string containing a single space handles whitespace characters consistently
f6ea51
+within the scope of C<unicode_strings>. Prior to that, or outside its scope,
f6ea51
+characters that are whitespace according to Unicode rules but not according to
f6ea51
+ASCII rules were treated as field contents rather than field separators when
f6ea51
+they appear in byte-encoded strings.
f6ea51
+
f6ea51
 =back
f6ea51
 
f6ea51
 You can see from the above that the effect of C<unicode_strings>
f6ea51
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod
f6ea51
index d35de34..595ec46 100644
f6ea51
--- a/pod/perluniintro.pod
f6ea51
+++ b/pod/perluniintro.pod
f6ea51
@@ -151,11 +151,12 @@ serious Unicode work.  The maintenance release 5.6.1 fixed many of the
f6ea51
 problems of the initial Unicode implementation, but for example
f6ea51
 regular expressions still do not work with Unicode in 5.6.1.
f6ea51
 Perl v5.14.0 is the first release where Unicode support is
f6ea51
-(almost) seamlessly integrable without some gotchas. (There are two
f6ea51
+(almost) seamlessly integrable without some gotchas. (There are a few
f6ea51
 exceptions. Firstly, some differences in L<quotemeta|perlfunc/quotemeta>
f6ea51
 were fixed starting in Perl 5.16.0. Secondly, some differences in
f6ea51
 L<the range operator|perlop/Range Operators> were fixed starting in
f6ea51
-Perl 5.26.0.)
f6ea51
+Perl 5.26.0. Thirdly, some differences in L<split|perlfunc/split> were fixed
f6ea51
+starting in Perl 5.28.0.)
f6ea51
 
f6ea51
 To enable this
f6ea51
 seamless support, you should C<use feature 'unicode_strings'> (which is
f6ea51
diff --git a/pp.c b/pp.c
f6ea51
index cc4cb59..d9dd005 100644
f6ea51
--- a/pp.c
f6ea51
+++ b/pp.c
f6ea51
@@ -5740,6 +5740,7 @@ PP(pp_split)
f6ea51
     STRLEN len;
f6ea51
     const char *s = SvPV_const(sv, len);
f6ea51
     const bool do_utf8 = DO_UTF8(sv);
f6ea51
+    const bool in_uni_8_bit = IN_UNI_8_BIT;
f6ea51
     const char *strend = s + len;
f6ea51
     PMOP *pm = cPMOPx(PL_op);
f6ea51
     REGEXP *rx;
f6ea51
@@ -5826,6 +5827,10 @@ PP(pp_split)
f6ea51
 	    while (s < strend && isSPACE_LC(*s))
f6ea51
 		s++;
f6ea51
 	}
f6ea51
+        else if (in_uni_8_bit) {
f6ea51
+            while (s < strend && isSPACE_L1(*s))
f6ea51
+                s++;
f6ea51
+        }
f6ea51
 	else {
f6ea51
 	    while (s < strend && isSPACE(*s))
f6ea51
 		s++;
f6ea51
@@ -5857,6 +5862,10 @@ PP(pp_split)
f6ea51
             {
f6ea51
 	        while (m < strend && !isSPACE_LC(*m))
f6ea51
 		    ++m;
f6ea51
+            }
f6ea51
+            else if (in_uni_8_bit) {
f6ea51
+                while (m < strend && !isSPACE_L1(*m))
f6ea51
+                    ++m;
f6ea51
             } else {
f6ea51
                 while (m < strend && !isSPACE(*m))
f6ea51
                     ++m;
f6ea51
@@ -5891,6 +5900,10 @@ PP(pp_split)
f6ea51
             {
f6ea51
 	        while (s < strend && isSPACE_LC(*s))
f6ea51
 		    ++s;
f6ea51
+            }
f6ea51
+            else if (in_uni_8_bit) {
f6ea51
+                while (s < strend && isSPACE_L1(*s))
f6ea51
+                    ++s;
f6ea51
             } else {
f6ea51
                 while (s < strend && isSPACE(*s))
f6ea51
                     ++s;
f6ea51
diff --git a/regen/feature.pl b/regen/feature.pl
f6ea51
index 579120e..8a4ce63 100755
f6ea51
--- a/regen/feature.pl
f6ea51
+++ b/regen/feature.pl
f6ea51
@@ -485,8 +485,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
f6ea51
 
f6ea51
 This feature is available starting with Perl 5.12; was almost fully
f6ea51
 implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
f6ea51
-and extended further in Perl 5.26 to cover L<the range
f6ea51
-operator|perlop/Range Operators>.
f6ea51
+was extended further in Perl 5.26 to cover L<the range
f6ea51
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
f6ea51
+cover L<special-cased whitespace splitting|perlfunc/split>.
f6ea51
 
f6ea51
 =head2 The 'unicode_eval' and 'evalbytes' features
f6ea51
 
f6ea51
diff --git a/t/op/split.t b/t/op/split.t
f6ea51
index d60bcaf..038c5d7 100644
f6ea51
--- a/t/op/split.t
f6ea51
+++ b/t/op/split.t
f6ea51
@@ -7,7 +7,7 @@ BEGIN {
f6ea51
     set_up_inc('../lib');
f6ea51
 }
f6ea51
 
f6ea51
-plan tests => 163;
f6ea51
+plan tests => 172;
f6ea51
 
f6ea51
 $FS = ':';
f6ea51
 
f6ea51
@@ -480,6 +480,24 @@ is($cnt, scalar(@ary));
f6ea51
         qq{split(\$cond ? qr/ / : " ", "$exp") behaves as expected over repeated similar patterns};
f6ea51
 }
f6ea51
 
f6ea51
+SKIP: {
f6ea51
+    # RT #130907: unicode_strings feature doesn't work with split ' '
f6ea51
+
f6ea51
+    my ($sp) = grep /\s/u, map chr, reverse 128 .. 255 # prefer \xA0 over \x85
f6ea51
+        or skip 'no unicode whitespace found in high-8-bit range', 9;
f6ea51
+
f6ea51
+    for (["$sp$sp. /", "leading unicode whitespace"],
f6ea51
+         [".$sp$sp/",  "unicode whitespace separator"],
f6ea51
+         [". /$sp$sp", "trailing unicode whitespace"]) {
f6ea51
+        my ($str, $desc) = @$_;
f6ea51
+        use feature "unicode_strings";
f6ea51
+        my @got = split " ", $str;
f6ea51
+        is @got, 2, "whitespace split: $desc: field count";
f6ea51
+        is $got[0], '.', "whitespace split: $desc: field 0";
f6ea51
+        is $got[1], '/', "whitespace split: $desc: field 1";
f6ea51
+    }
f6ea51
+}
f6ea51
+
f6ea51
 {
f6ea51
     # 'RT #116086: split "\x20" does not work as documented';
f6ea51
     my @results;
f6ea51
-- 
f6ea51
2.9.4
f6ea51