8a984d
commit 1a594aa986ffe28657a03baa5c53c0a0e7dc2ecd
8a984d
Author: Matheus Castanho <msc@linux.ibm.com>
8a984d
Date:   Tue May 11 17:53:07 2021 -0300
8a984d
8a984d
    powerpc: Add optimized rawmemchr for POWER10
8a984d
    
8a984d
    Reuse code for optimized strlen to implement a faster version of rawmemchr.
8a984d
    This takes advantage of the same benefits provided by the strlen implementation,
8a984d
    but needs some extra steps. __strlen_power10 code should be unchanged after this
8a984d
    change.
8a984d
    
8a984d
    rawmemchr returns a pointer to the char found, while strlen returns only the
8a984d
    length, so we have to take that into account when preparing the return value.
8a984d
    
8a984d
    To quickly check 64B, the loop on __strlen_power10 merges the whole block into
8a984d
    16B by using unsigned minimum vector operations (vminub) and checks if there are
8a984d
    any \0 on the resulting vector. The same code is used by rawmemchr if the char c
8a984d
    is 0. However, this approach does not work when c != 0.  We first need to
8a984d
    subtract c from each byte, so that the value we are looking for is converted to a
8a984d
    0, then taking the minimum and checking for nulls works again.
8a984d
    
8a984d
    The new code branches after it has compared ~256 bytes and chooses which of the
8a984d
    two strategies above will be used in the main loop, based on the char c. This
8a984d
    extra branch adds some overhead (~5%) for length ~256, but is quickly amortized
8a984d
    by the faster loop for larger sizes.
8a984d
    
8a984d
    Compared to __rawmemchr_power9, this version is ~20% faster for length < 256.
8a984d
    Because of the optimized main loop, the improvement becomes ~35% for c != 0
8a984d
    and ~50% for c = 0 for strings longer than 256.
8a984d
    
8a984d
    Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
8a984d
    Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
8a984d
8a984d
diff --git a/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
8a984d
new file mode 100644
8a984d
index 0000000000000000..5351c2634f6086bf
8a984d
--- /dev/null
8a984d
+++ b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
8a984d
@@ -0,0 +1,22 @@
8a984d
+/* Optimized rawmemchr implementation for POWER10 LE.
8a984d
+   Copyright (C) 2021 Free Software Foundation, Inc.
8a984d
+   This file is part of the GNU C Library.
8a984d
+
8a984d
+   The GNU C Library is free software; you can redistribute it and/or
8a984d
+   modify it under the terms of the GNU Lesser General Public
8a984d
+   License as published by the Free Software Foundation; either
8a984d
+   version 2.1 of the License, or (at your option) any later version.
8a984d
+
8a984d
+   The GNU C Library is distributed in the hope that it will be useful,
8a984d
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
8a984d
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
8a984d
+   Lesser General Public License for more details.
8a984d
+
8a984d
+   You should have received a copy of the GNU Lesser General Public
8a984d
+   License along with the GNU C Library; if not, see
8a984d
+   <https://www.gnu.org/licenses/>.  */
8a984d
+
8a984d
+#include <sysdep.h>
8a984d
+
8a984d
+#define USE_AS_RAWMEMCHR 1
8a984d
+#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S>
8a984d
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
8a984d
index ca7e9eb3d84c9b00..dda5282f1b9a07cf 100644
8a984d
--- a/sysdeps/powerpc/powerpc64/le/power10/strlen.S
8a984d
+++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
8a984d
@@ -18,10 +18,50 @@
8a984d
 
8a984d
 #include <sysdep.h>
8a984d
 
8a984d
-#ifndef STRLEN
8a984d
-# define STRLEN __strlen
8a984d
-# define DEFINE_STRLEN_HIDDEN_DEF 1
8a984d
-#endif
8a984d
+/* To reuse the code for rawmemchr, we have some extra steps compared to the
8a984d
+   strlen implementation:
8a984d
+      - Sum the initial value of r3 with the position at which the char was
8a984d
+        found, to guarantee we return a pointer and not the length.
8a984d
+      - In the main loop, subtract the char we are looking for from each byte,
8a984d
+        so we can keep using vminub to quickly check 64B at once.  */
8a984d
+#ifdef USE_AS_RAWMEMCHR
8a984d
+# ifndef RAWMEMCHR
8a984d
+#  define FUNCNAME __rawmemchr
8a984d
+# else
8a984d
+#  define FUNCNAME RAWMEMCHR
8a984d
+# endif
8a984d
+# define MCOUNT_NARGS 2
8a984d
+# define VREG_ZERO v20
8a984d
+# define OFF_START_LOOP 256
8a984d
+# define RAWMEMCHR_SUBTRACT_VECTORS \
8a984d
+	vsububm   v4,v4,v18;	    \
8a984d
+	vsububm   v5,v5,v18;	    \
8a984d
+	vsububm   v6,v6,v18;	    \
8a984d
+	vsububm   v7,v7,v18;
8a984d
+# define TAIL(vreg,increment)	   \
8a984d
+	vctzlsbb  r4,vreg;	   \
8a984d
+	addi	  r4,r4,increment; \
8a984d
+	add	  r3,r5,r4;	   \
8a984d
+	blr
8a984d
+
8a984d
+#else /* strlen */
8a984d
+
8a984d
+# ifndef STRLEN
8a984d
+#  define FUNCNAME __strlen
8a984d
+#  define DEFINE_STRLEN_HIDDEN_DEF 1
8a984d
+# else
8a984d
+#  define FUNCNAME STRLEN
8a984d
+# endif
8a984d
+# define MCOUNT_NARGS 1
8a984d
+# define VREG_ZERO v18
8a984d
+# define OFF_START_LOOP 192
8a984d
+# define TAIL(vreg,increment)	   \
8a984d
+	vctzlsbb  r4,vreg;	   \
8a984d
+	subf	  r3,r3,r5;	   \
8a984d
+	addi	  r4,r4,increment; \
8a984d
+	add	  r3,r3,r4;	   \
8a984d
+	blr
8a984d
+#endif /* USE_AS_RAWMEMCHR */
8a984d
 
8a984d
 /* TODO: Replace macros by the actual instructions when minimum binutils becomes
8a984d
    >= 2.35.  This is used to keep compatibility with older versions.  */
8a984d
@@ -50,33 +90,41 @@
8a984d
 	li	  r6,offset;		    \
8a984d
 	LXVP(v4+32,offset,addr);	    \
8a984d
 	LXVP(v6+32,offset+32,addr);	    \
8a984d
+	RAWMEMCHR_SUBTRACT_VECTORS;	    \
8a984d
 	vminub	  v14,v4,v5;		    \
8a984d
 	vminub	  v15,v6,v7;		    \
8a984d
 	vminub	  v16,v14,v15;		    \
8a984d
-	vcmpequb. v0,v16,v18;		    \
8a984d
+	vcmpequb. v0,v16,VREG_ZERO;	    \
8a984d
 	bne	  cr6,L(label)
8a984d
 
8a984d
-#define TAIL(vreg,increment)	   \
8a984d
-	vctzlsbb  r4,vreg;	   \
8a984d
-	subf	  r3,r3,r5;	   \
8a984d
-	addi	  r4,r4,increment; \
8a984d
-	add	  r3,r3,r4;	   \
8a984d
-	blr
8a984d
-
8a984d
 /* Implements the function
8a984d
 
8a984d
    int [r3] strlen (const void *s [r3])
8a984d
 
8a984d
+   but when USE_AS_RAWMEMCHR is set, implements the function
8a984d
+
8a984d
+   void* [r3] rawmemchr (const void *s [r3], int c [r4])
8a984d
+
8a984d
    The implementation can load bytes past a matching byte, but only
8a984d
    up to the next 64B boundary, so it never crosses a page.  */
8a984d
 
8a984d
 .machine power9
8a984d
 
8a984d
-ENTRY_TOCLESS (STRLEN, 4)
8a984d
-	CALL_MCOUNT 1
8a984d
+ENTRY_TOCLESS (FUNCNAME, 4)
8a984d
+	CALL_MCOUNT MCOUNT_NARGS
8a984d
 
8a984d
-	vspltisb  v18,0
8a984d
+#ifdef USE_AS_RAWMEMCHR
8a984d
+	xori	r5,r4,0xff
8a984d
+
8a984d
+	mtvsrd	v18+32,r4	/* matching char in v18  */
8a984d
+	mtvsrd	v19+32,r5	/* non matching char in v19  */
8a984d
+
8a984d
+	vspltb	v18,v18,7	/* replicate  */
8a984d
+	vspltb	v19,v19,7	/* replicate  */
8a984d
+#else
8a984d
 	vspltisb  v19,-1
8a984d
+#endif
8a984d
+	vspltisb  VREG_ZERO,0
8a984d
 
8a984d
 	/* Next 16B-aligned address. Prepare address for L(aligned).  */
8a984d
 	addi	  r5,r3,16
8a984d
@@ -90,16 +138,25 @@ ENTRY_TOCLESS (STRLEN, 4)
8a984d
 	vcmpequb. v6,v0,v18
8a984d
 	beq	  cr6,L(aligned)
8a984d
 
8a984d
+#ifdef USE_AS_RAWMEMCHR
8a984d
+	vctzlsbb  r6,v6
8a984d
+	add	  r3,r3,r6
8a984d
+#else
8a984d
 	vctzlsbb  r3,v6
8a984d
+#endif
8a984d
 	blr
8a984d
 
8a984d
-	/* Test next 176B, 16B at a time.  The main loop is optimized for longer
8a984d
-	   strings, so checking the first bytes in 16B chunks benefits a lot
8a984d
-	   small strings.  */
8a984d
+	/* Test up to OFF_START_LOOP-16 bytes in 16B chunks.  The main loop is
8a984d
+	   optimized for longer strings, so checking the first bytes in 16B
8a984d
+	   chunks greatly benefits small strings.  */
8a984d
 	.p2align 5
8a984d
 L(aligned):
8a984d
+#ifdef USE_AS_RAWMEMCHR
8a984d
+	cmpdi	cr5,r4,0	/* Check if c == 0.  This will be useful to
8a984d
+				  choose how we will perform the main loop.  */
8a984d
+#endif
8a984d
 	/* Prepare address for the loop.  */
8a984d
-	addi	  r4,r3,192
8a984d
+	addi	  r4,r3,OFF_START_LOOP
8a984d
 	clrrdi	  r4,r4,6
8a984d
 
8a984d
 	CHECK16(v0,0,r5,tail1)
8a984d
@@ -113,15 +170,43 @@ L(aligned):
8a984d
 	CHECK16(v8,128,r5,tail9)
8a984d
 	CHECK16(v9,144,r5,tail10)
8a984d
 	CHECK16(v10,160,r5,tail11)
8a984d
+#ifdef USE_AS_RAWMEMCHR
8a984d
+	CHECK16(v0,176,r5,tail12)
8a984d
+	CHECK16(v1,192,r5,tail13)
8a984d
+	CHECK16(v2,208,r5,tail14)
8a984d
+	CHECK16(v3,224,r5,tail15)
8a984d
+#endif
8a984d
 
8a984d
 	addi	  r5,r4,128
8a984d
 
8a984d
+#ifdef USE_AS_RAWMEMCHR
8a984d
+	/* If c == 0, use the same loop as strlen, without the vsububm.  */
8a984d
+	beq	cr5,L(loop)
8a984d
+
8a984d
+	/* This is very similar to the block after L(loop), the difference is
8a984d
+	   that here RAWMEMCHR_SUBTRACT_VECTORS is not empty: we subtract the
8a984d
+	   char we are looking for from each byte loaded, so that we can keep
8a984d
+	   using vminub to merge the results and check for nulls.  */
8a984d
+	.p2align 5
8a984d
+L(rawmemchr_loop):
8a984d
+	CHECK64(0,r4,pre_tail_64b)
8a984d
+	CHECK64(64,r4,pre_tail_64b)
8a984d
+	addi	  r4,r4,256
8a984d
+
8a984d
+	CHECK64(0,r5,tail_64b)
8a984d
+	CHECK64(64,r5,tail_64b)
8a984d
+	addi	  r5,r5,256
8a984d
+
8a984d
+	b	  L(rawmemchr_loop)
8a984d
+#endif
8a984d
 	/* Switch to a more aggressive approach checking 64B each time.  Use 2
8a984d
 	   pointers 128B apart and unroll the loop once to make the pointer
8a984d
 	   updates and usages separated enough to avoid stalls waiting for
8a984d
 	   address calculation.  */
8a984d
 	.p2align 5
8a984d
 L(loop):
8a984d
+#undef RAWMEMCHR_SUBTRACT_VECTORS
8a984d
+#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
8a984d
 	CHECK64(0,r4,pre_tail_64b)
8a984d
 	CHECK64(64,r4,pre_tail_64b)
8a984d
 	addi	  r4,r4,256
8a984d
@@ -140,10 +225,10 @@ L(tail_64b):
8a984d
 	   block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
8a984d
 	   low 16B bytes into vx+1, and the high into vx, so the order here is
8a984d
 	   v5, v4, v7, v6.  */
8a984d
-	vcmpequb  v1,v5,v18
8a984d
-	vcmpequb  v2,v4,v18
8a984d
-	vcmpequb  v3,v7,v18
8a984d
-	vcmpequb  v4,v6,v18
8a984d
+	vcmpequb  v1,v5,VREG_ZERO
8a984d
+	vcmpequb  v2,v4,VREG_ZERO
8a984d
+	vcmpequb  v3,v7,VREG_ZERO
8a984d
+	vcmpequb  v4,v6,VREG_ZERO
8a984d
 
8a984d
 	/* Take into account the other 64B blocks we had already checked.  */
8a984d
 	add	r5,r5,r6
8a984d
@@ -165,7 +250,9 @@ L(tail_64b):
8a984d
 	or	  r10,r8,r7
8a984d
 
8a984d
 	cnttzd	  r0,r10	  /* Count trailing zeros before the match.  */
8a984d
+#ifndef USE_AS_RAWMEMCHR
8a984d
 	subf	  r5,r3,r5
8a984d
+#endif
8a984d
 	add	  r3,r5,r0	  /* Compute final length.  */
8a984d
 	blr
8a984d
 
8a984d
@@ -213,9 +300,32 @@ L(tail10):
8a984d
 L(tail11):
8a984d
 	TAIL(v10,160)
8a984d
 
8a984d
-END (STRLEN)
8a984d
+#ifdef USE_AS_RAWMEMCHR
8a984d
+	.p2align  5
8a984d
+L(tail12):
8a984d
+	TAIL(v0,176)
8a984d
+
8a984d
+	.p2align  5
8a984d
+L(tail13):
8a984d
+	TAIL(v1,192)
8a984d
+
8a984d
+	.p2align  5
8a984d
+L(tail14):
8a984d
+	TAIL(v2,208)
8a984d
+
8a984d
+	.p2align  5
8a984d
+L(tail15):
8a984d
+	TAIL(v3,224)
8a984d
+#endif
8a984d
+
8a984d
+END (FUNCNAME)
8a984d
 
8a984d
-#ifdef DEFINE_STRLEN_HIDDEN_DEF
8a984d
+#ifdef USE_AS_RAWMEMCHR
8a984d
+weak_alias (__rawmemchr,rawmemchr)
8a984d
+libc_hidden_builtin_def (__rawmemchr)
8a984d
+#else
8a984d
+# ifdef DEFINE_STRLEN_HIDDEN_DEF
8a984d
 weak_alias (__strlen, strlen)
8a984d
 libc_hidden_builtin_def (strlen)
8a984d
+# endif
8a984d
 #endif
8a984d
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
8a984d
index 1d517698429e1230..ac2446aca62cc4ab 100644
8a984d
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
8a984d
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
8a984d
@@ -33,9 +33,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
8a984d
 
8a984d
 ifneq (,$(filter %le,$(config-machine)))
8a984d
 sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
8a984d
+		   rawmemchr-power9 rawmemchr-power10 \
8a984d
 		   strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
8a984d
-		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
8a984d
-		   strlen-power10
8a984d
+		   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
8a984d
 endif
8a984d
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
8a984d
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
8a984d
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
8a984d
index 6e36659d1903448a..127af84b32a8196f 100644
8a984d
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
8a984d
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
8a984d
@@ -257,6 +257,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
8a984d
   /* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c.  */
8a984d
   IFUNC_IMPL (i, name, rawmemchr,
8a984d
 #ifdef __LITTLE_ENDIAN__
8a984d
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
8a984d
+			      (hwcap2 & PPC_FEATURE2_ARCH_3_1)
8a984d
+                              && (hwcap & PPC_FEATURE_HAS_VSX),
8a984d
+                              __rawmemchr_power10)
8a984d
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
8a984d
 			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
8a984d
 			      __rawmemchr_power9)
8a984d
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
8a984d
new file mode 100644
8a984d
index 0000000000000000..bf1ed7e1941f922d
8a984d
--- /dev/null
8a984d
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
8a984d
@@ -0,0 +1,21 @@
8a984d
+/* Optimized rawmemchr implementation for PowerPC64/POWER10.
8a984d
+   Copyright (C) 2021 Free Software Foundation, Inc.
8a984d
+   This file is part of the GNU C Library.
8a984d
+
8a984d
+   The GNU C Library is free software; you can redistribute it and/or
8a984d
+   modify it under the terms of the GNU Lesser General Public
8a984d
+   License as published by the Free Software Foundation; either
8a984d
+   version 2.1 of the License, or (at your option) any later version.
8a984d
+
8a984d
+   The GNU C Library is distributed in the hope that it will be useful,
8a984d
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
8a984d
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
8a984d
+   Lesser General Public License for more details.
8a984d
+
8a984d
+   You should have received a copy of the GNU Lesser General Public
8a984d
+   License along with the GNU C Library; if not, see
8a984d
+   <https://www.gnu.org/licenses/>.  */
8a984d
+
8a984d
+#define RAWMEMCHR __rawmemchr_power10
8a984d
+
8a984d
+#include <sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S>
8a984d
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
8a984d
index 2a7ae5a1ed02e556..369d6359e8987052 100644
8a984d
--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
8a984d
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
8a984d
@@ -26,6 +26,7 @@ extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden;
8a984d
 extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden;
8a984d
 # ifdef __LITTLE_ENDIAN__
8a984d
 extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
8a984d
+extern __typeof (__rawmemchr) __rawmemchr_power10 attribute_hidden;
8a984d
 # endif
8a984d
 
8a984d
 # undef __rawmemchr
8a984d
@@ -34,6 +35,9 @@ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
8a984d
    ifunc symbol properly.  */
8a984d
 libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
8a984d
 # ifdef __LITTLE_ENDIAN__
8a984d
+		     (hwcap2 & PPC_FEATURE2_ARCH_3_1)
8a984d
+		     && (hwcap & PPC_FEATURE_HAS_VSX)
8a984d
+		     ? __rawmemchr_power10 :
8a984d
 		       (hwcap2 & PPC_FEATURE2_ARCH_3_00)
8a984d
 		       ? __rawmemchr_power9 :
8a984d
 # endif