commit 1a594aa986ffe28657a03baa5c53c0a0e7dc2ecd
Author: Matheus Castanho <msc@linux.ibm.com>
Date:   Tue May 11 17:53:07 2021 -0300

    powerpc: Add optimized rawmemchr for POWER10
    
    Reuse code for optimized strlen to implement a faster version of rawmemchr.
    This takes advantage of the same benefits provided by the strlen implementation,
    but needs some extra steps. __strlen_power10 code should be unchanged after this
    change.
    
    rawmemchr returns a pointer to the char found, while strlen returns only the
    length, so we have to take that into account when preparing the return value.
    
    To quickly check 64B, the loop on __strlen_power10 merges the whole block into
    16B by using unsigned minimum vector operations (vminub) and checks if there are
    any \0 on the resulting vector. The same code is used by rawmemchr if the char c
    is 0. However, this approach does not work when c != 0.  We first need to
    subtract each byte by c, so that the value we are looking for is converted to a
    0, then taking the minimum and checking for nulls works again.
    
    The new code branches after it has compared ~256 bytes and chooses which of the
    two strategies above will be used in the main loop, based on the char c. This
    extra branch adds some overhead (~5%) for length ~256, but is quickly amortized
    by the faster loop for larger sizes.
    
    Compared to __rawmemchr_power9, this version is ~20% faster for length < 256.
    Because of the optimized main loop, the improvement becomes ~35% for c != 0
    and ~50% for c = 0 for strings longer than 256.
    
    Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
    Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>

diff --git a/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
new file mode 100644
index 0000000000000000..5351c2634f6086bf
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
@@ -0,0 +1,22 @@
+/* Optimized rawmemchr implementation for POWER10 LE.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define USE_AS_RAWMEMCHR 1
+#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
index ca7e9eb3d84c9b00..dda5282f1b9a07cf 100644
--- a/sysdeps/powerpc/powerpc64/le/power10/strlen.S
+++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
@@ -18,10 +18,50 @@
 
 #include <sysdep.h>
 
-#ifndef STRLEN
-# define STRLEN __strlen
-# define DEFINE_STRLEN_HIDDEN_DEF 1
-#endif
+/* To reuse the code for rawmemchr, we have some extra steps compared to the
+   strlen implementation:
+      - Sum the initial value of r3 with the position at which the char was
+        found, to guarantee we return a pointer and not the length.
+      - In the main loop, subtract each byte by the char we are looking for,
+        so we can keep using vminub to quickly check 64B at once.  */
+#ifdef USE_AS_RAWMEMCHR
+# ifndef RAWMEMCHR
+#  define FUNCNAME __rawmemchr
+# else
+#  define FUNCNAME RAWMEMCHR
+# endif
+# define MCOUNT_NARGS 2
+# define VREG_ZERO v20
+# define OFF_START_LOOP 256
+# define RAWMEMCHR_SUBTRACT_VECTORS \
+	vsububm   v4,v4,v18;	    \
+	vsububm   v5,v5,v18;	    \
+	vsububm   v6,v6,v18;	    \
+	vsububm   v7,v7,v18;
+# define TAIL(vreg,increment)	   \
+	vctzlsbb  r4,vreg;	   \
+	addi	  r4,r4,increment; \
+	add	  r3,r5,r4;	   \
+	blr
+
+#else /* strlen */
+
+# ifndef STRLEN
+#  define FUNCNAME __strlen
+#  define DEFINE_STRLEN_HIDDEN_DEF 1
+# else
+#  define FUNCNAME STRLEN
+# endif
+# define MCOUNT_NARGS 1
+# define VREG_ZERO v18
+# define OFF_START_LOOP 192
+# define TAIL(vreg,increment)	   \
+	vctzlsbb  r4,vreg;	   \
+	subf	  r3,r3,r5;	   \
+	addi	  r4,r4,increment; \
+	add	  r3,r3,r4;	   \
+	blr
+#endif /* USE_AS_RAWMEMCHR */
 
 /* TODO: Replace macros by the actual instructions when minimum binutils becomes
    >= 2.35.  This is used to keep compatibility with older versions.  */
@@ -50,33 +90,41 @@
 	li	  r6,offset;		    \
 	LXVP(v4+32,offset,addr);	    \
 	LXVP(v6+32,offset+32,addr);	    \
+	RAWMEMCHR_SUBTRACT_VECTORS;	    \
 	vminub	  v14,v4,v5;		    \
 	vminub	  v15,v6,v7;		    \
 	vminub	  v16,v14,v15;		    \
-	vcmpequb. v0,v16,v18;		    \
+	vcmpequb. v0,v16,VREG_ZERO;	    \
 	bne	  cr6,L(label)
 
-#define TAIL(vreg,increment)	   \
-	vctzlsbb  r4,vreg;	   \
-	subf	  r3,r3,r5;	   \
-	addi	  r4,r4,increment; \
-	add	  r3,r3,r4;	   \
-	blr
-
 /* Implements the function
 
    int [r3] strlen (const void *s [r3])
 
+   but when USE_AS_RAWMEMCHR is set, implements the function
+
+   void* [r3] rawmemchr (const void *s [r3], int c [r4])
+
    The implementation can load bytes past a matching byte, but only
    up to the next 64B boundary, so it never crosses a page.  */
 
 .machine power9
 
-ENTRY_TOCLESS (STRLEN, 4)
-	CALL_MCOUNT 1
+ENTRY_TOCLESS (FUNCNAME, 4)
+	CALL_MCOUNT MCOUNT_NARGS
 
-	vspltisb  v18,0
+#ifdef USE_AS_RAWMEMCHR
+	xori	r5,r4,0xff
+
+	mtvsrd	v18+32,r4	/* matching char in v18  */
+	mtvsrd	v19+32,r5	/* non matching char in v19  */
+
+	vspltb	v18,v18,7	/* replicate  */
+	vspltb	v19,v19,7	/* replicate  */
+#else
 	vspltisb  v19,-1
+#endif
+	vspltisb  VREG_ZERO,0
 
 	/* Next 16B-aligned address. Prepare address for L(aligned).  */
 	addi	  r5,r3,16
@@ -90,16 +138,25 @@ ENTRY_TOCLESS (STRLEN, 4)
 	vcmpequb. v6,v0,v18
 	beq	  cr6,L(aligned)
 
+#ifdef USE_AS_RAWMEMCHR
+	vctzlsbb  r6,v6
+	add	  r3,r3,r6
+#else
 	vctzlsbb  r3,v6
+#endif
 	blr
 
-	/* Test next 176B, 16B at a time.  The main loop is optimized for longer
-	   strings, so checking the first bytes in 16B chunks benefits a lot
-	   small strings.  */
+	/* Test up to OFF_START_LOOP-16 bytes in 16B chunks.  The main loop is
+	   optimized for longer strings, so checking the first bytes in 16B
+	   chunks benefits a lot small strings.  */
 	.p2align 5
 L(aligned):
+#ifdef USE_AS_RAWMEMCHR
+	cmpdi	cr5,r4,0	/* Check if c == 0.  This will be useful to
+				  choose how we will perform the main loop.  */
+#endif
 	/* Prepare address for the loop.  */
-	addi	  r4,r3,192
+	addi	  r4,r3,OFF_START_LOOP
 	clrrdi	  r4,r4,6
 
 	CHECK16(v0,0,r5,tail1)
@@ -113,15 +170,43 @@ L(aligned):
 	CHECK16(v8,128,r5,tail9)
 	CHECK16(v9,144,r5,tail10)
 	CHECK16(v10,160,r5,tail11)
+#ifdef USE_AS_RAWMEMCHR
+	CHECK16(v0,176,r5,tail12)
+	CHECK16(v1,192,r5,tail13)
+	CHECK16(v2,208,r5,tail14)
+	CHECK16(v3,224,r5,tail15)
+#endif
 
 	addi	  r5,r4,128
 
+#ifdef USE_AS_RAWMEMCHR
+	/* If c == 0, use the same loop as strlen, without the vsububm.  */
+	beq	cr5,L(loop)
+
+	/* This is very similar to the block after L(loop), the difference is
+	   that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
+	   each byte loaded by the char we are looking for, this way we can keep
+	   using vminub to merge the results and checking for nulls.  */
+	.p2align 5
+L(rawmemchr_loop):
+	CHECK64(0,r4,pre_tail_64b)
+	CHECK64(64,r4,pre_tail_64b)
+	addi	  r4,r4,256
+
+	CHECK64(0,r5,tail_64b)
+	CHECK64(64,r5,tail_64b)
+	addi	  r5,r5,256
+
+	b	  L(rawmemchr_loop)
+#endif
 	/* Switch to a more aggressive approach checking 64B each time.  Use 2
 	   pointers 128B apart and unroll the loop once to make the pointer
 	   updates and usages separated enough to avoid stalls waiting for
 	   address calculation.  */
 	.p2align 5
 L(loop):
+#undef RAWMEMCHR_SUBTRACT_VECTORS
+#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
 	CHECK64(0,r4,pre_tail_64b)
 	CHECK64(64,r4,pre_tail_64b)
 	addi	  r4,r4,256
@@ -140,10 +225,10 @@ L(tail_64b):
 	   block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
 	   low 16B bytes into vx+1, and the high into vx, so the order here is
 	   v5, v4, v7, v6.  */
-	vcmpequb  v1,v5,v18
-	vcmpequb  v2,v4,v18
-	vcmpequb  v3,v7,v18
-	vcmpequb  v4,v6,v18
+	vcmpequb  v1,v5,VREG_ZERO
+	vcmpequb  v2,v4,VREG_ZERO
+	vcmpequb  v3,v7,VREG_ZERO
+	vcmpequb  v4,v6,VREG_ZERO
 
 	/* Take into account the other 64B blocks we had already checked.  */
 	add	r5,r5,r6
@@ -165,7 +250,9 @@ L(tail_64b):
 	or	  r10,r8,r7
 
 	cnttzd	  r0,r10	  /* Count trailing zeros before the match.  */
+#ifndef USE_AS_RAWMEMCHR
 	subf	  r5,r3,r5
+#endif
 	add	  r3,r5,r0	  /* Compute final length.  */
 	blr
 
@@ -213,9 +300,32 @@ L(tail10):
 L(tail11):
 	TAIL(v10,160)
 
-END (STRLEN)
+#ifdef USE_AS_RAWMEMCHR
+	.p2align  5
+L(tail12):
+	TAIL(v0,176)
+
+	.p2align  5
+L(tail13):
+	TAIL(v1,192)
+
+	.p2align  5
+L(tail14):
+	TAIL(v2,208)
+
+	.p2align  5
+L(tail15):
+	TAIL(v3,224)
+#endif
+
+END (FUNCNAME)
 
-#ifdef DEFINE_STRLEN_HIDDEN_DEF
+#ifdef USE_AS_RAWMEMCHR
+weak_alias (__rawmemchr,rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
+#else
+# ifdef DEFINE_STRLEN_HIDDEN_DEF
 weak_alias (__strlen, strlen)
 libc_hidden_builtin_def (strlen)
+# endif
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 1d517698429e1230..ac2446aca62cc4ab 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,9 +33,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
+		   rawmemchr-power9 rawmemchr-power10 \
 		   strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
-		   strlen-power10
+		   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 6e36659d1903448a..127af84b32a8196f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -257,6 +257,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c.  */
   IFUNC_IMPL (i, name, rawmemchr,
 #ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+			      (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+                              && (hwcap & PPC_FEATURE_HAS_VSX),
+                              __rawmemchr_power10)
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
 			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
 			      __rawmemchr_power9)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
new file mode 100644
index 0000000000000000..bf1ed7e1941f922d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
@@ -0,0 +1,21 @@
+/* Optimized rawmemchr implementation for PowerPC64/POWER10.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define RAWMEMCHR __rawmemchr_power10
+
+#include <sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
index 2a7ae5a1ed02e556..369d6359e8987052 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
@@ -26,6 +26,7 @@ extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden;
 extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden;
 # ifdef __LITTLE_ENDIAN__
 extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
+extern __typeof (__rawmemchr) __rawmemchr_power10 attribute_hidden;
 # endif
 
 # undef __rawmemchr
@@ -34,6 +35,9 @@ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
 # ifdef __LITTLE_ENDIAN__
+		     (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+		     && (hwcap & PPC_FEATURE_HAS_VSX)
+		     ? __rawmemchr_power10 :
 		       (hwcap2 & PPC_FEATURE2_ARCH_3_00)
 		       ? __rawmemchr_power9 :
 # endif