commit a23bd00f9d810c28d9e83ce1d7cf53968375937d
Author: Paul E. Murphy <murphyp@linux.vnet.ibm.com>
Date:   Mon May 18 11:16:06 2020 -0500

    powerpc64le: add optimized strlen for P9

    This started as a trivial change to Anton's rawmemchr.  I got
    carried away.  This is a hybrid of P8's asymptotically faster
    64B checks and extremely efficient small-string checks, e.g.
    <64B (and sometimes a little bit more, depending on alignment).

    The second trick is to align to 64B by running a 48B checking loop
    16B at a time until we naturally align to 64B (i.e. checking 48/96/144
    bytes per iteration based on the alignment after the first 5 comparisons).
    This alleviates the need to check page boundaries.

    Finally, explicitly use the P7 strlen with the runtime loader when
    building for P9.  We need to be cautious about vector/VSX extensions
    here on P9-only builds.
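
    As an illustration (not part of the patch), the 48B alignment trick
    can be modeled in C.  From a 16B-aligned address, the entry code
    computes n = (addr >> 4) & 3 (the "rldicl. r5,r4,60,62" below) and
    runs the 48B loop n times; since 48 == -16 (mod 64), n iterations
    land exactly on a 64B boundary.  A minimal, hypothetical sketch
    (function names invented for illustration):

        #include <assert.h>
        #include <stdint.h>

        /* Number of 48B iterations run after the first five 16B
           comparisons (which cover alignment plus the first 64B).  */
        static unsigned
        loops_needed (uintptr_t aligned_addr)
        {
          return (aligned_addr >> 4) & 3;   /* rldicl. r5,r4,60,62  */
        }

        int
        main (void)
        {
          for (uintptr_t addr = 0; addr < 1024; addr += 16)
            {
              uintptr_t p = addr + 64;      /* first 64B already checked  */
              for (unsigned n = loops_needed (addr); n > 0; n--)
                p += 48;                    /* one 48B checking iteration  */
              assert (p % 64 == 0);         /* now 64B aligned  */
            }
          return 0;
        }
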
diff --git a/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S b/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
new file mode 100644
index 0000000000000000..e9d83323acacfbca
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/rtld-strlen.S
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/powerpc64/power7/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strlen.S b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
new file mode 100644
index 0000000000000000..66a9b79647eebbd8
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strlen.S
@@ -0,0 +1,213 @@
+/* Optimized strlen implementation for PowerPC64/POWER9.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRLEN
+# define STRLEN __strlen
+# define DEFINE_STRLEN_HIDDEN_DEF 1
+#endif
+
+/* Implements the function
+
+   int [r3] strlen (const void *s [r3])
+
+   The implementation can load bytes past a matching byte, but only
+   up to the next 64B boundary, so it never crosses a page.  */
+
+.machine power9
+ENTRY_TOCLESS (STRLEN, 4)
+	CALL_MCOUNT 2
+
+	vspltisb  v18,0
+	vspltisb  v19,-1
+
+	neg	  r5,r3
+	rldicl	  r9,r5,0,60   /* How many bytes to get source 16B aligned?  */
+
+	/* Align data and fill bytes not loaded with a non-matching char.  */
+	lvx	  v0,0,r3
+	lvsr	  v1,0,r3
+	vperm	  v0,v19,v0,v1
+
+	vcmpequb. v6,v0,v18
+	beq	  cr6,L(aligned)
+
+	vctzlsbb  r3,v6
+	blr
+
+	/* Test 64B 16B at a time.  The 64B vector loop is optimized for
+	   longer strings.  Likewise, we check a multiple of 64B to avoid
+	   breaking the alignment calculation below.  */
+L(aligned):
+	add	  r4,r3,r9
+	rldicl.	  r5,r4,60,62  /* Determine the number of 48B loops needed for
+                                  alignment to 64B.  And test for zero.  */
+
+	lxv	  v0+32,0(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail1)
+
+	lxv	  v0+32,16(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail2)
+
+	lxv	  v0+32,32(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail3)
+
+	lxv	  v0+32,48(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail4)
+	addi	  r4,r4,64
+
+	/* Speculatively generate a fake 16B aligned address to generate the
+	   vector byte constant 0,1,..,15 using lvsl during reduction.  */
+	li	  r0,0
+
+	/* Skip the alignment if already 64B aligned.  */
+	beq	  L(loop_64b)
+	mtctr	  r5
+
+	/* Test 48B per iteration until 64B aligned.  */
+	.p2align  5
+L(loop):
+	lxv	  v0+32,0(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail1)
+
+	lxv	  v0+32,16(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail2)
+
+	lxv	  v0+32,32(r4)
+	vcmpequb. v6,v0,v18
+	bne	  cr6,L(tail3)
+
+	addi	  r4,r4,48
+	bdnz	  L(loop)
+
+	.p2align  5
+L(loop_64b):
+	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
+	lxv	  v2+32,16(r4)
+	lxv	  v3+32,32(r4)
+	lxv	  v4+32,48(r4)
+	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
+	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
+	bne	  cr6,L(vmx_zero)
+
+	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
+	lxv	  v2+32,16(r4)
+	lxv	  v3+32,32(r4)
+	lxv	  v4+32,48(r4)
+	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
+	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
+	bne	  cr6,L(vmx_zero)
+
+	lxv	  v1+32,0(r4)     /* Load 4 quadwords.  */
+	lxv	  v2+32,16(r4)
+	lxv	  v3+32,32(r4)
+	lxv	  v4+32,48(r4)
+	vminub	  v5,v1,v2        /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v18       /* Check for NULLs.  */
+	addi	  r4,r4,64        /* Adjust address for the next iteration.  */
+	beq	  cr6,L(loop_64b)
+
+L(vmx_zero):
+	/* OK, we found a null byte.  Let's look for it in the current 64-byte
+	   block and mark it in its corresponding VR.  */
+	vcmpequb  v1,v1,v18
+	vcmpequb  v2,v2,v18
+	vcmpequb  v3,v3,v18
+	vcmpequb  v4,v4,v18
+
+	/* We will now 'compress' the result into a single doubleword, so it
+	   can be moved to a GPR for the final calculation.  First, we
+	   generate an appropriate mask for vbpermq, so we can permute bits into
+	   the first halfword.  */
+	vspltisb  v10,3
+	lvsl	  v11,0,r0
+	vslb	  v10,v11,v10
+
+	/* Permute the first bit of each byte into bits 48-63.  */
+	vbpermq	  v1,v1,v10
+	vbpermq	  v2,v2,v10
+	vbpermq	  v3,v3,v10
+	vbpermq	  v4,v4,v10
+
+	/* Shift each component into its correct position for merging.  */
+	vsldoi	  v2,v2,v2,2
+	vsldoi	  v3,v3,v3,4
+	vsldoi	  v4,v4,v4,6
+
+	/* Merge the results and move to a GPR.  */
+	vor	  v1,v2,v1
+	vor	  v2,v3,v4
+	vor	  v4,v1,v2
+	mfvrd	  r10,v4
+
+	/* Adjust address to the beginning of the current 64-byte block.  */
+	addi	  r4,r4,-64
+
+	cnttzd	  r0,r10           /* Count trailing zeros before the match.  */
+	subf	  r5,r3,r4
+	add	  r3,r5,r0         /* Compute final length.  */
+	blr
+
+L(tail1):
+	vctzlsbb  r0,v6
+	add	  r4,r4,r0
+	subf	  r3,r3,r4
+	blr
+
+L(tail2):
+	vctzlsbb  r0,v6
+	add	  r4,r4,r0
+	addi	  r4,r4,16
+	subf	  r3,r3,r4
+	blr
+
+L(tail3):
+	vctzlsbb  r0,v6
+	add	  r4,r4,r0
+	addi	  r4,r4,32
+	subf	  r3,r3,r4
+	blr
+
+L(tail4):
+	vctzlsbb  r0,v6
+	add	  r4,r4,r0
+	addi	  r4,r4,48
+	subf	  r3,r3,r4
+	blr
+
+END (STRLEN)
+
+#ifdef DEFINE_STRLEN_HIDDEN_DEF
+weak_alias (__strlen, strlen)
+libc_hidden_builtin_def (strlen)
+#endif
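
As an illustration (not part of the patch), the L(vmx_zero) reduction
above amounts to the following scalar computation: build a 64-bit mask
with bit i set iff byte i of the 64-byte block is NUL, then count
trailing zeros to locate the first NUL.  The asm achieves this with
vbpermq/vsldoi/vor/mfvrd/cnttzd; this hypothetical C model ignores the
exact vector lane ordering:

        #include <stdint.h>

        /* Scalar model of L(vmx_zero): find the offset of the first
           NUL within a 64-byte block known to contain one.  */
        static unsigned
        first_nul_in_64b (const unsigned char block[64])
        {
          uint64_t mask = 0;
          for (unsigned i = 0; i < 64; i++)
            if (block[i] == 0)                  /* vcmpequb, per byte  */
              mask |= (uint64_t) 1 << i;        /* vbpermq/vsldoi/vor  */
          /* mask is non-zero: a NUL was seen in this block.  */
          return (unsigned) __builtin_ctzll (mask);   /* cnttzd  */
        }
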
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 1a8ef5fb73c3b0db..6d5661d08257b7a0 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9
+		   rawmemchr-power9 strlen-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 297935863e44c0e1..daa30d3907395680 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -111,6 +111,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __strlen_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strlen_power8)
 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
new file mode 100644
index 0000000000000000..68c8d54b5f5876a2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power9.S
@@ -0,0 +1,2 @@
+#define STRLEN __strlen_power9
+#include <sysdeps/powerpc/powerpc64/le/power9/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
index 74810dab9929d505..b7f0fbb13fb97783 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -30,8 +30,13 @@ extern __typeof (__redirect_strlen) __libc_strlen;
 extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden;
 
 libc_ifunc (__libc_strlen,
+# ifdef __LITTLE_ENDIAN__
+	  (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+	  ? __strlen_power9 :
+# endif
 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 	    ? __strlen_power8 :
 	      (hwcap & PPC_FEATURE_HAS_VSX)
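
As an illustration (not part of the patch), the libc_ifunc chain after
this change selects implementations in the following order; the
__strlen_power7/__strlen_ppc tail is outside the hunk above but follows
from the pre-existing chain and the extern declarations:

        /* Hypothetical sketch of the resolver's selection logic;
           hwcap/hwcap2 come from AT_HWCAP/AT_HWCAP2.  */
        static __typeof (__strlen_ppc) *
        select_strlen (unsigned long hwcap, unsigned long hwcap2)
        {
        #ifdef __LITTLE_ENDIAN__
          if (hwcap2 & PPC_FEATURE2_ARCH_3_00)  /* POWER9 (ISA 3.0)  */
            return __strlen_power9;
        #endif
          if (hwcap2 & PPC_FEATURE2_ARCH_2_07)  /* POWER8  */
            return __strlen_power8;
          if (hwcap & PPC_FEATURE_HAS_VSX)      /* POWER7 / VSX  */
            return __strlen_power7;
          return __strlen_ppc;                  /* generic fallback  */
        }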