34484a
From 5aca16e032861ea3dfcc96ad417ea87e2b1552e5 Mon Sep 17 00:00:00 2001
34484a
From: Aaron Crane <arc@cpan.org>
34484a
Date: Sat, 4 Mar 2017 12:50:58 +0000
34484a
Subject: [PATCH] RT #130907: Fix the Unicode Bug in split " "
34484a
MIME-Version: 1.0
34484a
Content-Type: text/plain; charset=UTF-8
34484a
Content-Transfer-Encoding: 8bit
34484a
34484a
Ported to 5.26.0:
34484a
34484a
commit 20ae58f7a9bbf84d043d6e90f5988b6e3ca4ee3d
34484a
Author: Aaron Crane <arc@cpan.org>
34484a
Date:   Sat Mar 4 12:50:58 2017 +0000
34484a
34484a
    RT #130907: Fix the Unicode Bug in split " "
34484a
34484a
Signed-off-by: Petr Písař <ppisar@redhat.com>
34484a
---
34484a
 lib/feature.pm       |  5 +++--
34484a
 pod/perldelta.pod    |  9 +++++++++
34484a
 pod/perlfunc.pod     |  8 ++++++++
34484a
 pod/perlunicode.pod  | 11 +++++++++++
34484a
 pod/perluniintro.pod |  5 +++--
34484a
 pp.c                 | 13 +++++++++++++
34484a
 regen/feature.pl     |  5 +++--
34484a
 t/op/split.t         | 20 +++++++++++++++++++-
34484a
 8 files changed, 69 insertions(+), 7 deletions(-)
34484a
34484a
diff --git a/lib/feature.pm b/lib/feature.pm
34484a
index ed13273..93e020b 100644
34484a
--- a/lib/feature.pm
34484a
+++ b/lib/feature.pm
34484a
@@ -175,8 +175,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
34484a
 
34484a
 This feature is available starting with Perl 5.12; was almost fully
34484a
 implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
34484a
-and extended further in Perl 5.26 to cover L<the range
34484a
-operator|perlop/Range Operators>.
34484a
+was extended further in Perl 5.26 to cover L<the range
34484a
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
34484a
+cover L<special-cased whitespace splitting|perlfunc/split>.
34484a
 
34484a
 =head2 The 'unicode_eval' and 'evalbytes' features
34484a
 
34484a
#diff --git a/pod/perldelta.pod b/pod/perldelta.pod
34484a
#index 06dcd1d..d31335f 100644
34484a
#--- a/pod/perldelta.pod
34484a
#+++ b/pod/perldelta.pod
34484a
#@@ -3206,6 +3206,15 @@ calls.
34484a
# Parsing bad POSIX charclasses no longer leaks memory.
34484a
# L<[perl #128313]|https://rt.perl.org/Public/Bug/Display.html?id=128313>
34484a
# 
34484a
#+=item *
34484a
#+
34484a
#+C<split ' '> now correctly handles the argument being split when in the
34484a
#+scope of the L<< C<unicode_strings>|feature/"The 'unicode_strings' feature"
34484a
#+>> feature. Previously, when a string using the single-byte internal
34484a
#+representation contained characters that are whitespace by Unicode rules but
34484a
#+not by ASCII rules, it treated those characters as part of fields rather
34484a
#+than as field separators.  [perl #130907]
34484a
#+
34484a
# =back
34484a
# 
34484a
# =head1 Known Problems
34484a
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
34484a
index b8dca6e..9abadf4 100644
34484a
--- a/pod/perlfunc.pod
34484a
+++ b/pod/perlfunc.pod
34484a
@@ -7616,6 +7616,14 @@ special case was restricted to the use of a plain S<C<" ">> as the
34484a
 pattern argument to split; in Perl 5.18.0 and later this special case is
34484a
 triggered by any expression which evaluates to the simple string S<C<" ">>.
34484a
 
34484a
+As of Perl 5.28, this special-cased whitespace splitting works as expected in
34484a
+the scope of L<< S<C<"use feature 'unicode_strings'">>|feature/The
34484a
+'unicode_strings' feature >>. In previous versions, and outside the scope of
34484a
+that feature, it exhibits L<perlunicode/The "Unicode Bug">: characters that are
34484a
+whitespace according to Unicode rules but not according to ASCII rules can be
34484a
+treated as part of fields rather than as field separators, depending on the
34484a
+string's internal encoding.
34484a
+
34484a
 If omitted, PATTERN defaults to a single space, S<C<" ">>, triggering
34484a
 the previously described I<awk> emulation.
34484a
 
34484a
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
34484a
index 9c13c35..2e84e95 100644
34484a
--- a/pod/perlunicode.pod
34484a
+++ b/pod/perlunicode.pod
34484a
@@ -1835,6 +1835,17 @@ outside its scope, it could produce strings whose length in characters
34484a
 exceeded that of the right-hand side, where the right-hand side took up more
34484a
 bytes than the correct range endpoint.
34484a
 
34484a
+=item *
34484a
+
34484a
+In L<< C<split>'s special-case whitespace splitting|perlfunc/split >>.
34484a
+
34484a
+Starting in Perl 5.28.0, the C<split> function with a pattern specified as
34484a
+a string containing a single space handles whitespace characters consistently
34484a
+within the scope of C<unicode_strings>. Prior to that, or outside its scope,
34484a
+characters that are whitespace according to Unicode rules but not according to
34484a
+ASCII rules were treated as field contents rather than field separators when
34484a
+they appear in byte-encoded strings.
34484a
+
34484a
 =back
34484a
 
34484a
 You can see from the above that the effect of C<unicode_strings>
34484a
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod
34484a
index d35de34..595ec46 100644
34484a
--- a/pod/perluniintro.pod
34484a
+++ b/pod/perluniintro.pod
34484a
@@ -151,11 +151,12 @@ serious Unicode work.  The maintenance release 5.6.1 fixed many of the
34484a
 problems of the initial Unicode implementation, but for example
34484a
 regular expressions still do not work with Unicode in 5.6.1.
34484a
 Perl v5.14.0 is the first release where Unicode support is
34484a
-(almost) seamlessly integrable without some gotchas. (There are two
34484a
+(almost) seamlessly integrable without some gotchas. (There are a few
34484a
 exceptions. Firstly, some differences in L<quotemeta|perlfunc/quotemeta>
34484a
 were fixed starting in Perl 5.16.0. Secondly, some differences in
34484a
 L<the range operator|perlop/Range Operators> were fixed starting in
34484a
-Perl 5.26.0.)
34484a
+Perl 5.26.0. Thirdly, some differences in L<split|perlfunc/split> were fixed
34484a
+starting in Perl 5.28.0.)
34484a
 
34484a
 To enable this
34484a
 seamless support, you should C<use feature 'unicode_strings'> (which is
34484a
diff --git a/pp.c b/pp.c
34484a
index cc4cb59..d9dd005 100644
34484a
--- a/pp.c
34484a
+++ b/pp.c
34484a
@@ -5740,6 +5740,7 @@ PP(pp_split)
34484a
     STRLEN len;
34484a
     const char *s = SvPV_const(sv, len);
34484a
     const bool do_utf8 = DO_UTF8(sv);
34484a
+    const bool in_uni_8_bit = IN_UNI_8_BIT;
34484a
     const char *strend = s + len;
34484a
     PMOP *pm = cPMOPx(PL_op);
34484a
     REGEXP *rx;
34484a
@@ -5826,6 +5827,10 @@ PP(pp_split)
34484a
 	    while (s < strend && isSPACE_LC(*s))
34484a
 		s++;
34484a
 	}
34484a
+        else if (in_uni_8_bit) {
34484a
+            while (s < strend && isSPACE_L1(*s))
34484a
+                s++;
34484a
+        }
34484a
 	else {
34484a
 	    while (s < strend && isSPACE(*s))
34484a
 		s++;
34484a
@@ -5857,6 +5862,10 @@ PP(pp_split)
34484a
             {
34484a
 	        while (m < strend && !isSPACE_LC(*m))
34484a
 		    ++m;
34484a
+            }
34484a
+            else if (in_uni_8_bit) {
34484a
+                while (m < strend && !isSPACE_L1(*m))
34484a
+                    ++m;
34484a
             } else {
34484a
                 while (m < strend && !isSPACE(*m))
34484a
                     ++m;
34484a
@@ -5891,6 +5900,10 @@ PP(pp_split)
34484a
             {
34484a
 	        while (s < strend && isSPACE_LC(*s))
34484a
 		    ++s;
34484a
+            }
34484a
+            else if (in_uni_8_bit) {
34484a
+                while (s < strend && isSPACE_L1(*s))
34484a
+                    ++s;
34484a
             } else {
34484a
                 while (s < strend && isSPACE(*s))
34484a
                     ++s;
34484a
diff --git a/regen/feature.pl b/regen/feature.pl
34484a
index 579120e..8a4ce63 100755
34484a
--- a/regen/feature.pl
34484a
+++ b/regen/feature.pl
34484a
@@ -485,8 +485,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
34484a
 
34484a
 This feature is available starting with Perl 5.12; was almost fully
34484a
 implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
34484a
-and extended further in Perl 5.26 to cover L<the range
34484a
-operator|perlop/Range Operators>.
34484a
+was extended further in Perl 5.26 to cover L<the range
34484a
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
34484a
+cover L<special-cased whitespace splitting|perlfunc/split>.
34484a
 
34484a
 =head2 The 'unicode_eval' and 'evalbytes' features
34484a
 
34484a
diff --git a/t/op/split.t b/t/op/split.t
34484a
index d60bcaf..038c5d7 100644
34484a
--- a/t/op/split.t
34484a
+++ b/t/op/split.t
34484a
@@ -7,7 +7,7 @@ BEGIN {
34484a
     set_up_inc('../lib');
34484a
 }
34484a
 
34484a
-plan tests => 163;
34484a
+plan tests => 172;
34484a
 
34484a
 $FS = ':';
34484a
 
34484a
@@ -480,6 +480,24 @@ is($cnt, scalar(@ary));
34484a
         qq{split(\$cond ? qr/ / : " ", "$exp") behaves as expected over repeated similar patterns};
34484a
 }
34484a
 
34484a
+SKIP: {
34484a
+    # RT #130907: unicode_strings feature doesn't work with split ' '
34484a
+
34484a
+    my ($sp) = grep /\s/u, map chr, reverse 128 .. 255 # prefer \xA0 over \x85
34484a
+        or skip 'no unicode whitespace found in high-8-bit range', 9;
34484a
+
34484a
+    for (["$sp$sp. /", "leading unicode whitespace"],
34484a
+         [".$sp$sp/",  "unicode whitespace separator"],
34484a
+         [". /$sp$sp", "trailing unicode whitespace"]) {
34484a
+        my ($str, $desc) = @$_;
34484a
+        use feature "unicode_strings";
34484a
+        my @got = split " ", $str;
34484a
+        is @got, 2, "whitespace split: $desc: field count";
34484a
+        is $got[0], '.', "whitespace split: $desc: field 0";
34484a
+        is $got[1], '/', "whitespace split: $desc: field 1";
34484a
+    }
34484a
+}
34484a
+
34484a
 {
34484a
     # 'RT #116086: split "\x20" does not work as documented';
34484a
     my @results;
34484a
-- 
34484a
2.9.4
34484a