a4ac56
From 5aca16e032861ea3dfcc96ad417ea87e2b1552e5 Mon Sep 17 00:00:00 2001
a4ac56
From: Aaron Crane <arc@cpan.org>
a4ac56
Date: Sat, 4 Mar 2017 12:50:58 +0000
a4ac56
Subject: [PATCH] RT #130907: Fix the Unicode Bug in split " "
a4ac56
MIME-Version: 1.0
a4ac56
Content-Type: text/plain; charset=UTF-8
a4ac56
Content-Transfer-Encoding: 8bit
a4ac56
a4ac56
Ported to 5.26.0:
a4ac56
a4ac56
commit 20ae58f7a9bbf84d043d6e90f5988b6e3ca4ee3d
a4ac56
Author: Aaron Crane <arc@cpan.org>
a4ac56
Date:   Sat Mar 4 12:50:58 2017 +0000
a4ac56
a4ac56
    RT #130907: Fix the Unicode Bug in split " "
a4ac56
a4ac56
Signed-off-by: Petr Písař <ppisar@redhat.com>
a4ac56
---
a4ac56
 lib/feature.pm       |  5 +++--
a4ac56
 pod/perldelta.pod    |  9 +++++++++
a4ac56
 pod/perlfunc.pod     |  8 ++++++++
a4ac56
 pod/perlunicode.pod  | 11 +++++++++++
a4ac56
 pod/perluniintro.pod |  5 +++--
a4ac56
 pp.c                 | 13 +++++++++++++
a4ac56
 regen/feature.pl     |  5 +++--
a4ac56
 t/op/split.t         | 20 +++++++++++++++++++-
a4ac56
 8 files changed, 69 insertions(+), 7 deletions(-)
a4ac56
a4ac56
diff --git a/lib/feature.pm b/lib/feature.pm
a4ac56
index ed13273..93e020b 100644
a4ac56
--- a/lib/feature.pm
a4ac56
+++ b/lib/feature.pm
a4ac56
@@ -175,8 +175,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
a4ac56
 
a4ac56
 This feature is available starting with Perl 5.12; was almost fully
a4ac56
 implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
a4ac56
-and extended further in Perl 5.26 to cover L<the range
a4ac56
-operator|perlop/Range Operators>.
a4ac56
+was extended further in Perl 5.26 to cover L<the range
a4ac56
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
a4ac56
+cover L<special-cased whitespace splitting|perlfunc/split>.
a4ac56
 
a4ac56
 =head2 The 'unicode_eval' and 'evalbytes' features
a4ac56
 
a4ac56
#diff --git a/pod/perldelta.pod b/pod/perldelta.pod
a4ac56
#index 06dcd1d..d31335f 100644
a4ac56
#--- a/pod/perldelta.pod
a4ac56
#+++ b/pod/perldelta.pod
a4ac56
#@@ -3206,6 +3206,15 @@ calls.
a4ac56
# Parsing bad POSIX charclasses no longer leaks memory.
a4ac56
# L<[perl #128313]|https://rt.perl.org/Public/Bug/Display.html?id=128313>
a4ac56
# 
a4ac56
#+=item *
a4ac56
#+
a4ac56
#+C<split ' '> now correctly handles the argument being split when in the
a4ac56
#+scope of the L<< C<unicode_strings>|feature/"The 'unicode_strings' feature"
a4ac56
#+>> feature. Previously, when a string using the single-byte internal
a4ac56
#+representation contained characters that are whitespace by Unicode rules but
a4ac56
#+not by ASCII rules, it treated those characters as part of fields rather
a4ac56
#+than as field separators.  [perl #130907]
a4ac56
#+
a4ac56
# =back
a4ac56
# 
a4ac56
# =head1 Known Problems
a4ac56
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
a4ac56
index b8dca6e..9abadf4 100644
a4ac56
--- a/pod/perlfunc.pod
a4ac56
+++ b/pod/perlfunc.pod
a4ac56
@@ -7616,6 +7616,14 @@ special case was restricted to the use of a plain S<C<" ">> as the
a4ac56
 pattern argument to split; in Perl 5.18.0 and later this special case is
a4ac56
 triggered by any expression which evaluates to the simple string S<C<" ">>.
a4ac56
 
a4ac56
+As of Perl 5.28, this special-cased whitespace splitting works as expected in
a4ac56
+the scope of L<< S<C<"use feature 'unicode_strings'">>|feature/The
a4ac56
+'unicode_strings' feature >>. In previous versions, and outside the scope of
a4ac56
+that feature, it exhibits L<perlunicode/The "Unicode Bug">: characters that are
a4ac56
+whitespace according to Unicode rules but not according to ASCII rules can be
a4ac56
+treated as part of fields rather than as field separators, depending on the
a4ac56
+string's internal encoding.
a4ac56
+
a4ac56
 If omitted, PATTERN defaults to a single space, S<C<" ">>, triggering
a4ac56
 the previously described I<awk> emulation.
a4ac56
 
a4ac56
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
a4ac56
index 9c13c35..2e84e95 100644
a4ac56
--- a/pod/perlunicode.pod
a4ac56
+++ b/pod/perlunicode.pod
a4ac56
@@ -1835,6 +1835,17 @@ outside its scope, it could produce strings whose length in characters
a4ac56
 exceeded that of the right-hand side, where the right-hand side took up more
a4ac56
 bytes than the correct range endpoint.
a4ac56
 
a4ac56
+=item *
a4ac56
+
a4ac56
+In L<< C<split>'s special-case whitespace splitting|perlfunc/split >>.
a4ac56
+
a4ac56
+Starting in Perl 5.28.0, the C<split> function with a pattern specified as
a4ac56
+a string containing a single space handles whitespace characters consistently
a4ac56
+within the scope of C<unicode_strings>. Prior to that, or outside its scope,
a4ac56
+characters that are whitespace according to Unicode rules but not according to
a4ac56
+ASCII rules were treated as field contents rather than field separators when
a4ac56
+they appear in byte-encoded strings.
a4ac56
+
a4ac56
 =back
a4ac56
 
a4ac56
 You can see from the above that the effect of C<unicode_strings>
a4ac56
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod
a4ac56
index d35de34..595ec46 100644
a4ac56
--- a/pod/perluniintro.pod
a4ac56
+++ b/pod/perluniintro.pod
a4ac56
@@ -151,11 +151,12 @@ serious Unicode work.  The maintenance release 5.6.1 fixed many of the
a4ac56
 problems of the initial Unicode implementation, but for example
a4ac56
 regular expressions still do not work with Unicode in 5.6.1.
a4ac56
 Perl v5.14.0 is the first release where Unicode support is
a4ac56
-(almost) seamlessly integrable without some gotchas. (There are two
a4ac56
+(almost) seamlessly integrable without some gotchas. (There are a few
a4ac56
 exceptions. Firstly, some differences in L<quotemeta|perlfunc/quotemeta>
a4ac56
 were fixed starting in Perl 5.16.0. Secondly, some differences in
a4ac56
 L<the range operator|perlop/Range Operators> were fixed starting in
a4ac56
-Perl 5.26.0.)
a4ac56
+Perl 5.26.0. Thirdly, some differences in L<split|perlfunc/split> were fixed
a4ac56
+starting in Perl 5.28.0.)
a4ac56
 
a4ac56
 To enable this
a4ac56
 seamless support, you should C<use feature 'unicode_strings'> (which is
a4ac56
diff --git a/pp.c b/pp.c
a4ac56
index cc4cb59..d9dd005 100644
a4ac56
--- a/pp.c
a4ac56
+++ b/pp.c
a4ac56
@@ -5740,6 +5740,7 @@ PP(pp_split)
a4ac56
     STRLEN len;
a4ac56
     const char *s = SvPV_const(sv, len);
a4ac56
     const bool do_utf8 = DO_UTF8(sv);
a4ac56
+    const bool in_uni_8_bit = IN_UNI_8_BIT;
a4ac56
     const char *strend = s + len;
a4ac56
     PMOP *pm = cPMOPx(PL_op);
a4ac56
     REGEXP *rx;
a4ac56
@@ -5826,6 +5827,10 @@ PP(pp_split)
a4ac56
 	    while (s < strend && isSPACE_LC(*s))
a4ac56
 		s++;
a4ac56
 	}
a4ac56
+        else if (in_uni_8_bit) {
a4ac56
+            while (s < strend && isSPACE_L1(*s))
a4ac56
+                s++;
a4ac56
+        }
a4ac56
 	else {
a4ac56
 	    while (s < strend && isSPACE(*s))
a4ac56
 		s++;
a4ac56
@@ -5857,6 +5862,10 @@ PP(pp_split)
a4ac56
             {
a4ac56
 	        while (m < strend && !isSPACE_LC(*m))
a4ac56
 		    ++m;
a4ac56
+            }
a4ac56
+            else if (in_uni_8_bit) {
a4ac56
+                while (m < strend && !isSPACE_L1(*m))
a4ac56
+                    ++m;
a4ac56
             } else {
a4ac56
                 while (m < strend && !isSPACE(*m))
a4ac56
                     ++m;
a4ac56
@@ -5891,6 +5900,10 @@ PP(pp_split)
a4ac56
             {
a4ac56
 	        while (s < strend && isSPACE_LC(*s))
a4ac56
 		    ++s;
a4ac56
+            }
a4ac56
+            else if (in_uni_8_bit) {
a4ac56
+                while (s < strend && isSPACE_L1(*s))
a4ac56
+                    ++s;
a4ac56
             } else {
a4ac56
                 while (s < strend && isSPACE(*s))
a4ac56
                     ++s;
a4ac56
diff --git a/regen/feature.pl b/regen/feature.pl
a4ac56
index 579120e..8a4ce63 100755
a4ac56
--- a/regen/feature.pl
a4ac56
+++ b/regen/feature.pl
a4ac56
@@ -485,8 +485,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
a4ac56
 
a4ac56
 This feature is available starting with Perl 5.12; was almost fully
a4ac56
 implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
a4ac56
-and extended further in Perl 5.26 to cover L<the range
a4ac56
-operator|perlop/Range Operators>.
a4ac56
+was extended further in Perl 5.26 to cover L<the range
a4ac56
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
a4ac56
+cover L<special-cased whitespace splitting|perlfunc/split>.
a4ac56
 
a4ac56
 =head2 The 'unicode_eval' and 'evalbytes' features
a4ac56
 
a4ac56
diff --git a/t/op/split.t b/t/op/split.t
a4ac56
index d60bcaf..038c5d7 100644
a4ac56
--- a/t/op/split.t
a4ac56
+++ b/t/op/split.t
a4ac56
@@ -7,7 +7,7 @@ BEGIN {
a4ac56
     set_up_inc('../lib');
a4ac56
 }
a4ac56
 
a4ac56
-plan tests => 163;
a4ac56
+plan tests => 172;
a4ac56
 
a4ac56
 $FS = ':';
a4ac56
 
a4ac56
@@ -480,6 +480,24 @@ is($cnt, scalar(@ary));
a4ac56
         qq{split(\$cond ? qr/ / : " ", "$exp") behaves as expected over repeated similar patterns};
a4ac56
 }
a4ac56
 
a4ac56
+SKIP: {
a4ac56
+    # RT #130907: unicode_strings feature doesn't work with split ' '
a4ac56
+
a4ac56
+    my ($sp) = grep /\s/u, map chr, reverse 128 .. 255 # prefer \xA0 over \x85
a4ac56
+        or skip 'no unicode whitespace found in high-8-bit range', 9;
a4ac56
+
a4ac56
+    for (["$sp$sp. /", "leading unicode whitespace"],
a4ac56
+         [".$sp$sp/",  "unicode whitespace separator"],
a4ac56
+         [". /$sp$sp", "trailing unicode whitespace"]) {
a4ac56
+        my ($str, $desc) = @$_;
a4ac56
+        use feature "unicode_strings";
a4ac56
+        my @got = split " ", $str;
a4ac56
+        is @got, 2, "whitespace split: $desc: field count";
a4ac56
+        is $got[0], '.', "whitespace split: $desc: field 0";
a4ac56
+        is $got[1], '/', "whitespace split: $desc: field 1";
a4ac56
+    }
a4ac56
+}
a4ac56
+
a4ac56
 {
a4ac56
     # 'RT #116086: split "\x20" does not work as documented';
a4ac56
     my @results;
a4ac56
-- 
a4ac56
2.9.4
a4ac56