commit a23bd00f9d810c28d9e83ce1d7cf53968375937d
Author: Paul E. Murphy <murphyp@linux.vnet.ibm.com>
Date:   Mon May 18 11:16:06 2020 -0500

    powerpc64le: add optimized strlen for P9

    This started as a trivial change to Anton's rawmemchr.  I got
    carried away.  This is a hybrid between P8's asymptotically
    faster 64B checks and extremely efficient small string checks,
    e.g. <64B (and sometimes a little bit more depending on alignment).

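    To illustrate the small-string path, here is a scalar C model of the
    intended effect of the lvx/lvsr/vperm entry sequence (a minimal
    sketch only; it does not model the LE permute mechanics, and the
    helper name is invented for illustration):

        #include <stddef.h>
        #include <stdint.h>

        /* Model of the first-16B check.  Like the asm, it inspects the
           whole aligned 16B block containing s, so a C caller would have
           to guarantee the buffer is readable up to the block's end.  */
        static int
        check_first_block (const char *s, size_t *len)
        {
          size_t off = (uintptr_t) s & 15;  /* Offset of s in its block.  */
          unsigned char lane[16];

          /* lvx + lvsr + vperm: string bytes land in the low lanes; the
             off lanes with no string byte get 0xff from v19, so they can
             never compare equal to NUL.  */
          for (size_t i = 0; i < 16; i++)
            lane[i] = i < 16 - off ? (unsigned char) s[i] : 0xff;

          /* vcmpequb. + vctzlsbb: index of the first NUL lane, if any.  */
          for (size_t i = 0; i < 16; i++)
            if (lane[i] == 0)
              {
                *len = i;
                return 1;
              }
          return 0;
        }

    The real code needs no per-lane branching: vcmpequb. sets cr6 and
    vctzlsbb extracts the matching lane index directly into r3.
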
    The second trick is to align to 64B by running a 48B checking loop
    16B at a time until we naturally align to 64B (i.e. checking 48/96/144
    bytes/iteration based on the alignment after the first 5 comparisons).
    This alleviates the need to check page boundaries.

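    To see why this works (a small standalone check, not part of the
    commit): since 48 == -16 (mod 64), each 48B iteration steps the
    16B-aligned address back one 16B slot modulo 64, so running
    ((addr >> 4) & 3) iterations after the first 64B check always lands
    on a 64B boundary:

        #include <assert.h>
        #include <stdint.h>

        int
        main (void)
        {
          for (uintptr_t addr = 0; addr < 1024; addr += 16)
            {
              uintptr_t p = addr + 64;            /* First 64B checked.  */
              uintptr_t iters = (addr >> 4) & 3;  /* rldicl. r5,r4,60,62  */
              while (iters--)
                p += 48;                          /* One 48B iteration.  */
              assert (p % 64 == 0);               /* Now 64B aligned.  */
            }
          return 0;
        }

    The "first 5 comparisons" are the 16B entry check plus the four 16B
    checks covering the first 64B.
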
    Finally, explicitly use the P7 strlen with the runtime loader when
    building for P9.  We need to be cautious about vector/vsx extensions
    here on P9-only builds.
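
    For reference, the bit-compression in L(vmx_zero) below can be
    modeled in scalar C (a sketch of the data flow only; the LE lane
    numbering of the real vbpermq/vsldoi sequence is glossed over, and
    the function name is invented for illustration):

        #include <stddef.h>
        #include <stdint.h>

        /* Offset of the first NUL in a 64B block known to contain one
           (word ends up nonzero, as guaranteed by the branch into
           L(vmx_zero)).  Each quadword yields a 16-bit mask (vcmpequb +
           vbpermq); the masks are merged into one doubleword (vsldoi by
           2/4/6 bytes, vor, mfvrd) and cnttzd finds the first set bit.  */
        static size_t
        first_nul_in_block (const unsigned char block[64])
        {
          uint64_t word = 0;
          for (int q = 0; q < 4; q++)
            {
              uint64_t mask = 0;
              for (int k = 0; k < 16; k++)
                if (block[q * 16 + k] == 0)
                  mask |= (uint64_t) 1 << k;
              word |= mask << (16 * q);
            }
          return (size_t) __builtin_ctzll (word);  /* cnttzd  */
        }

    The final length is then (block base - s) + offset, which is what
    the subf/add pair after cnttzd computes.
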
diff --git a/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S b/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
new file mode 100644
index 0000000000000000..e9d83323acacfbca
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/powerpc64/power7/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strlen.S b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
new file mode 100644
index 0000000000000000..66a9b79647eebbd8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
@@ -0,0 +1,213 @@
+/* Optimized strlen implementation for PowerPC64/POWER9.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRLEN
+# define STRLEN __strlen
+# define DEFINE_STRLEN_HIDDEN_DEF 1
+#endif
+
+/* Implements the function
+
+   int [r3] strlen (const void *s [r3])
+
+   The implementation can load bytes past a matching byte, but only
+   up to the next 64B boundary, so it never crosses a page.  */
+
+.machine power9
+ENTRY_TOCLESS (STRLEN, 4)
+	CALL_MCOUNT 2
+
+	vspltisb  v18,0
+	vspltisb  v19,-1
+
+	neg	  r5,r3
+	rldicl	  r9,r5,0,60   /* How many bytes to get source 16B aligned?  */
+
+	/* Align data and fill bytes not loaded with non matching char.  */
+	lvx	  v0,0,r3
+	lvsr	  v1,0,r3
+	vperm	  v0,v19,v0,v1
+
+	vcmpequb. v6,v0,v18
+	beq	  cr6,L(aligned)
+
+	vctzlsbb  r3,v6
+	blr
+
+	/* Test 64B, 16B at a time.  The 64B vector loop is optimized for
+	   longer strings.  Likewise, we check a multiple of 64B to avoid
+	   breaking the alignment calculation below.  */
+L(aligned):
+	add	  r4,r3,r9
+	rldicl.	  r5,r4,60,62  /* Determine the number of 48B loops needed for
+                                  alignment to 64B.  And test for zero.  */
+
+	lxv	  v0+32,0(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail1)
+
+	lxv	  v0+32,16(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail2)
+
+	lxv	  v0+32,32(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail3)
+
+	lxv	  v0+32,48(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail4)
+	addi	  r4,r4,64
+
+	/* Speculatively generate a fake 16B aligned address to generate the
+	   vector byte constant 0,1,..,15 using lvsl during reduction.  */
+	li	  r0,0
+
+	/* Skip the alignment if already 64B aligned.  */
+	beq	  L(loop_64b)
+	mtctr	  r5
+
+	/* Test 48B per iteration until 64B aligned.  */
+	.p2align  5
+L(loop):
+	lxv	  v0+32,0(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail1)
+
+	lxv	  v0+32,16(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail2)
+
+	lxv	  v0+32,32(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail3)
+
+	addi	  r4,r4,48
+	bdnz	  L(loop)
+
+	.p2align  5
+L(loop_64b):
+	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
+	lxv	  v2+32,16(r4)
+	lxv	  v3+32,32(r4)
+	lxv	  v4+32,48(r4)
+	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
+	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
+	bne	  cr6,L(vmx_zero)
+
+	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
+	lxv	  v2+32,16(r4)
+	lxv	  v3+32,32(r4)
+	lxv	  v4+32,48(r4)
+	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
+	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
+	bne	  cr6,L(vmx_zero)
+
+	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
+	lxv	  v2+32,16(r4)
+	lxv	  v3+32,32(r4)
+	lxv	  v4+32,48(r4)
+	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
+	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
+	beq	  cr6,L(loop_64b)
+
+L(vmx_zero):
+	/* OK, we found a null byte.  Let's look for it in the current 64-byte
+	   block and mark it in its corresponding VR.  */
+	vcmpequb  v1,v1,v18
+	vcmpequb  v2,v2,v18
+	vcmpequb  v3,v3,v18
+	vcmpequb  v4,v4,v18
+
+	/* We will now 'compress' the result into a single doubleword, so it
+	   can be moved to a GPR for the final calculation.  First, we
+	   generate an appropriate mask for vbpermq, so we can permute bits into
+	   the first halfword.  */
+	vspltisb  v10,3
+	lvsl	  v11,0,r0
+	vslb	  v10,v11,v10
+
+	/* Permute the first bit of each byte into bits 48-63.  */
+	vbpermq	  v1,v1,v10
+	vbpermq	  v2,v2,v10
+	vbpermq	  v3,v3,v10
+	vbpermq	  v4,v4,v10
+
+	/* Shift each component into its correct position for merging.  */
+	vsldoi	  v2,v2,v2,2
+	vsldoi	  v3,v3,v3,4
+	vsldoi	  v4,v4,v4,6
+
+	/* Merge the results and move to a GPR.  */
+	vor	  v1,v2,v1
+	vor	  v2,v3,v4
+	vor	  v4,v1,v2
+	mfvrd	  r10,v4
+
+	/* Adjust address to the beginning of the current 64-byte block.  */
+	addi	  r4,r4,-64
+
+	cnttzd	  r0,r10           /* Count trailing zeros before the match.  */
+	subf	  r5,r3,r4
+	add	  r3,r5,r0         /* Compute final length.  */
+	blr
+
+L(tail1):
+	vctzlsbb  r0,v6
+	add	  r4,r4,r0
+	subf	  r3,r3,r4
+	blr
+
+L(tail2):
+	vctzlsbb  r0,v6
+	add	  r4,r4,r0
+	addi	  r4,r4,16
+	subf	  r3,r3,r4
+	blr
+
+L(tail3):
+	vctzlsbb  r0,v6
+	add	  r4,r4,r0
+	addi	  r4,r4,32
+	subf	  r3,r3,r4
+	blr
+
+L(tail4):
+	vctzlsbb  r0,v6
+	add	  r4,r4,r0
+	addi	  r4,r4,48
+	subf	  r3,r3,r4
+	blr
+
+END (STRLEN)
+
+#ifdef DEFINE_STRLEN_HIDDEN_DEF
+weak_alias (__strlen, strlen)
+libc_hidden_builtin_def (strlen)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 1a8ef5fb73c3b0db..6d5661d08257b7a0 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9
+		   rawmemchr-power9 strlen-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 297935863e44c0e1..daa30d3907395680 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -111,6 +111,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __strlen_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strlen_power8)
 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
new file mode 100644
index 0000000000000000..68c8d54b5f5876a2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
@@ -0,0 +1,2 @@
+#define STRLEN __strlen_power9
+#include <sysdeps/powerpc/powerpc64/le/power9/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
index 74810dab9929d505..b7f0fbb13fb97783 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -30,8 +30,13 @@ extern __typeof (__redirect_strlen) __libc_strlen;
 extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden;
 
 libc_ifunc (__libc_strlen,
+# ifdef __LITTLE_ENDIAN__
+	  (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+	  ? __strlen_power9 :
+# endif
 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 	    ? __strlen_power8 :
 	      (hwcap & PPC_FEATURE_HAS_VSX)