588e70
commit 1a594aa986ffe28657a03baa5c53c0a0e7dc2ecd
588e70
Author: Matheus Castanho <msc@linux.ibm.com>
588e70
Date:   Tue May 11 17:53:07 2021 -0300
588e70
588e70
    powerpc: Add optimized rawmemchr for POWER10
588e70
    
588e70
    Reuse code for optimized strlen to implement a faster version of rawmemchr.
588e70
    This takes advantage of the same benefits provided by the strlen implementation,
588e70
    but needs some extra steps. __strlen_power10 code should be unchanged after this
588e70
    change.
588e70
    
588e70
    rawmemchr returns a pointer to the char found, while strlen returns only the
588e70
    length, so we have to take that into account when preparing the return value.
588e70
    
588e70
    To quickly check 64B, the loop on __strlen_power10 merges the whole block into
588e70
    16B by using unsigned minimum vector operations (vminub) and checks if there are
588e70
    any \0 bytes in the resulting vector. The same code is used by rawmemchr if the char c
588e70
    is 0. However, this approach does not work when c != 0.  We first need to
588e70
    subtract c from each byte, so that the value we are looking for is converted to a
588e70
    0, then taking the minimum and checking for nulls works again.
588e70
    
588e70
    The new code branches after it has compared ~256 bytes and chooses which of the
588e70
    two strategies above will be used in the main loop, based on the char c. This
588e70
    extra branch adds some overhead (~5%) for length ~256, but is quickly amortized
588e70
    by the faster loop for larger sizes.
588e70
    
588e70
    Compared to __rawmemchr_power9, this version is ~20% faster for length < 256.
588e70
    Because of the optimized main loop, the improvement becomes ~35% for c != 0
588e70
    and ~50% for c = 0 for strings longer than 256.
588e70
    
588e70
    Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
588e70
    Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
588e70
588e70
diff --git a/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
588e70
new file mode 100644
588e70
index 0000000000000000..5351c2634f6086bf
588e70
--- /dev/null
588e70
+++ b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
588e70
@@ -0,0 +1,22 @@
588e70
+/* Optimized rawmemchr implementation for POWER10 LE.
588e70
+   Copyright (C) 2021 Free Software Foundation, Inc.
588e70
+   This file is part of the GNU C Library.
588e70
+
588e70
+   The GNU C Library is free software; you can redistribute it and/or
588e70
+   modify it under the terms of the GNU Lesser General Public
588e70
+   License as published by the Free Software Foundation; either
588e70
+   version 2.1 of the License, or (at your option) any later version.
588e70
+
588e70
+   The GNU C Library is distributed in the hope that it will be useful,
588e70
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
588e70
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
588e70
+   Lesser General Public License for more details.
588e70
+
588e70
+   You should have received a copy of the GNU Lesser General Public
588e70
+   License along with the GNU C Library; if not, see
588e70
+   <https://www.gnu.org/licenses/>.  */
588e70
+
588e70
+#include <sysdep.h>
588e70
+
588e70
+#define USE_AS_RAWMEMCHR 1
588e70
+#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S>
588e70
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
588e70
index ca7e9eb3d84c9b00..dda5282f1b9a07cf 100644
588e70
--- a/sysdeps/powerpc/powerpc64/le/power10/strlen.S
588e70
+++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
588e70
@@ -18,10 +18,50 @@
588e70
 
588e70
 #include <sysdep.h>
588e70
 
588e70
-#ifndef STRLEN
588e70
-# define STRLEN __strlen
588e70
-# define DEFINE_STRLEN_HIDDEN_DEF 1
588e70
-#endif
588e70
+/* To reuse the code for rawmemchr, we have some extra steps compared to the
588e70
+   strlen implementation:
588e70
+      - Sum the initial value of r3 with the position at which the char was
588e70
+        found, to guarantee we return a pointer and not the length.
588e70
+      - In the main loop, subtract the char we are looking for from each byte,
588e70
+        so we can keep using vminub to quickly check 64B at once.  */
588e70
+#ifdef USE_AS_RAWMEMCHR
588e70
+# ifndef RAWMEMCHR
588e70
+#  define FUNCNAME __rawmemchr
588e70
+# else
588e70
+#  define FUNCNAME RAWMEMCHR
588e70
+# endif
588e70
+# define MCOUNT_NARGS 2
588e70
+# define VREG_ZERO v20
588e70
+# define OFF_START_LOOP 256
588e70
+# define RAWMEMCHR_SUBTRACT_VECTORS \
588e70
+	vsububm   v4,v4,v18;	    \
588e70
+	vsububm   v5,v5,v18;	    \
588e70
+	vsububm   v6,v6,v18;	    \
588e70
+	vsububm   v7,v7,v18;
588e70
+# define TAIL(vreg,increment)	   \
588e70
+	vctzlsbb  r4,vreg;	   \
588e70
+	addi	  r4,r4,increment; \
588e70
+	add	  r3,r5,r4;	   \
588e70
+	blr
588e70
+
588e70
+#else /* strlen */
588e70
+
588e70
+# ifndef STRLEN
588e70
+#  define FUNCNAME __strlen
588e70
+#  define DEFINE_STRLEN_HIDDEN_DEF 1
588e70
+# else
588e70
+#  define FUNCNAME STRLEN
588e70
+# endif
588e70
+# define MCOUNT_NARGS 1
588e70
+# define VREG_ZERO v18
588e70
+# define OFF_START_LOOP 192
588e70
+# define TAIL(vreg,increment)	   \
588e70
+	vctzlsbb  r4,vreg;	   \
588e70
+	subf	  r3,r3,r5;	   \
588e70
+	addi	  r4,r4,increment; \
588e70
+	add	  r3,r3,r4;	   \
588e70
+	blr
588e70
+#endif /* USE_AS_RAWMEMCHR */
588e70
 
588e70
 /* TODO: Replace macros by the actual instructions when minimum binutils becomes
588e70
    >= 2.35.  This is used to keep compatibility with older versions.  */
588e70
@@ -50,33 +90,41 @@
588e70
 	li	  r6,offset;		    \
588e70
 	LXVP(v4+32,offset,addr);	    \
588e70
 	LXVP(v6+32,offset+32,addr);	    \
588e70
+	RAWMEMCHR_SUBTRACT_VECTORS;	    \
588e70
 	vminub	  v14,v4,v5;		    \
588e70
 	vminub	  v15,v6,v7;		    \
588e70
 	vminub	  v16,v14,v15;		    \
588e70
-	vcmpequb. v0,v16,v18;		    \
588e70
+	vcmpequb. v0,v16,VREG_ZERO;	    \
588e70
 	bne	  cr6,L(label)
588e70
 
588e70
-#define TAIL(vreg,increment)	   \
588e70
-	vctzlsbb  r4,vreg;	   \
588e70
-	subf	  r3,r3,r5;	   \
588e70
-	addi	  r4,r4,increment; \
588e70
-	add	  r3,r3,r4;	   \
588e70
-	blr
588e70
-
588e70
 /* Implements the function
588e70
 
588e70
    int [r3] strlen (const void *s [r3])
588e70
 
588e70
+   but when USE_AS_RAWMEMCHR is set, implements the function
588e70
+
588e70
+   void* [r3] rawmemchr (const void *s [r3], int c [r4])
588e70
+
588e70
    The implementation can load bytes past a matching byte, but only
588e70
    up to the next 64B boundary, so it never crosses a page.  */
588e70
 
588e70
 .machine power9
588e70
 
588e70
-ENTRY_TOCLESS (STRLEN, 4)
588e70
-	CALL_MCOUNT 1
588e70
+ENTRY_TOCLESS (FUNCNAME, 4)
588e70
+	CALL_MCOUNT MCOUNT_NARGS
588e70
 
588e70
-	vspltisb  v18,0
588e70
+#ifdef USE_AS_RAWMEMCHR
588e70
+	xori	r5,r4,0xff
588e70
+
588e70
+	mtvsrd	v18+32,r4	/* matching char in v18  */
588e70
+	mtvsrd	v19+32,r5	/* non matching char in v19  */
588e70
+
588e70
+	vspltb	v18,v18,7	/* replicate  */
588e70
+	vspltb	v19,v19,7	/* replicate  */
588e70
+#else
588e70
 	vspltisb  v19,-1
588e70
+#endif
588e70
+	vspltisb  VREG_ZERO,0
588e70
 
588e70
 	/* Next 16B-aligned address. Prepare address for L(aligned).  */
588e70
 	addi	  r5,r3,16
588e70
@@ -90,16 +138,25 @@ ENTRY_TOCLESS (STRLEN, 4)
588e70
 	vcmpequb. v6,v0,v18
588e70
 	beq	  cr6,L(aligned)
588e70
 
588e70
+#ifdef USE_AS_RAWMEMCHR
588e70
+	vctzlsbb  r6,v6
588e70
+	add	  r3,r3,r6
588e70
+#else
588e70
 	vctzlsbb  r3,v6
588e70
+#endif
588e70
 	blr
588e70
 
588e70
-	/* Test next 176B, 16B at a time.  The main loop is optimized for longer
588e70
-	   strings, so checking the first bytes in 16B chunks benefits a lot
588e70
-	   small strings.  */
588e70
+	/* Test up to OFF_START_LOOP-16 bytes in 16B chunks.  The main loop is
588e70
+	   optimized for longer strings, so checking the first bytes in 16B
588e70
+	   chunks greatly benefits small strings.  */
588e70
 	.p2align 5
588e70
 L(aligned):
588e70
+#ifdef USE_AS_RAWMEMCHR
588e70
+	cmpdi	cr5,r4,0	/* Check if c == 0.  This will be useful to
588e70
+				  choose how we will perform the main loop.  */
588e70
+#endif
588e70
 	/* Prepare address for the loop.  */
588e70
-	addi	  r4,r3,192
588e70
+	addi	  r4,r3,OFF_START_LOOP
588e70
 	clrrdi	  r4,r4,6
588e70
 
588e70
 	CHECK16(v0,0,r5,tail1)
588e70
@@ -113,15 +170,43 @@ L(aligned):
588e70
 	CHECK16(v8,128,r5,tail9)
588e70
 	CHECK16(v9,144,r5,tail10)
588e70
 	CHECK16(v10,160,r5,tail11)
588e70
+#ifdef USE_AS_RAWMEMCHR
588e70
+	CHECK16(v0,176,r5,tail12)
588e70
+	CHECK16(v1,192,r5,tail13)
588e70
+	CHECK16(v2,208,r5,tail14)
588e70
+	CHECK16(v3,224,r5,tail15)
588e70
+#endif
588e70
 
588e70
 	addi	  r5,r4,128
588e70
 
588e70
+#ifdef USE_AS_RAWMEMCHR
588e70
+	/* If c == 0, use the same loop as strlen, without the vsububm.  */
588e70
+	beq	cr5,L(loop)
588e70
+
588e70
+	/* This is very similar to the block after L(loop), the difference is
588e70
+	   that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
588e70
+	   each byte loaded by the char we are looking for, this way we can keep
588e70
+	   using vminub to merge the results and checking for nulls.  */
588e70
+	.p2align 5
588e70
+L(rawmemchr_loop):
588e70
+	CHECK64(0,r4,pre_tail_64b)
588e70
+	CHECK64(64,r4,pre_tail_64b)
588e70
+	addi	  r4,r4,256
588e70
+
588e70
+	CHECK64(0,r5,tail_64b)
588e70
+	CHECK64(64,r5,tail_64b)
588e70
+	addi	  r5,r5,256
588e70
+
588e70
+	b	  L(rawmemchr_loop)
588e70
+#endif
588e70
 	/* Switch to a more aggressive approach checking 64B each time.  Use 2
588e70
 	   pointers 128B apart and unroll the loop once to make the pointer
588e70
 	   updates and usages separated enough to avoid stalls waiting for
588e70
 	   address calculation.  */
588e70
 	.p2align 5
588e70
 L(loop):
588e70
+#undef RAWMEMCHR_SUBTRACT_VECTORS
588e70
+#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
588e70
 	CHECK64(0,r4,pre_tail_64b)
588e70
 	CHECK64(64,r4,pre_tail_64b)
588e70
 	addi	  r4,r4,256
588e70
@@ -140,10 +225,10 @@ L(tail_64b):
588e70
 	   block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
588e70
 	   low 16B bytes into vx+1, and the high into vx, so the order here is
588e70
 	   v5, v4, v7, v6.  */
588e70
-	vcmpequb  v1,v5,v18
588e70
-	vcmpequb  v2,v4,v18
588e70
-	vcmpequb  v3,v7,v18
588e70
-	vcmpequb  v4,v6,v18
588e70
+	vcmpequb  v1,v5,VREG_ZERO
588e70
+	vcmpequb  v2,v4,VREG_ZERO
588e70
+	vcmpequb  v3,v7,VREG_ZERO
588e70
+	vcmpequb  v4,v6,VREG_ZERO
588e70
 
588e70
 	/* Take into account the other 64B blocks we had already checked.  */
588e70
 	add	r5,r5,r6
588e70
@@ -165,7 +250,9 @@ L(tail_64b):
588e70
 	or	  r10,r8,r7
588e70
 
588e70
 	cnttzd	  r0,r10	  /* Count trailing zeros before the match.  */
588e70
+#ifndef USE_AS_RAWMEMCHR
588e70
 	subf	  r5,r3,r5
588e70
+#endif
588e70
 	add	  r3,r5,r0	  /* Compute final length.  */
588e70
 	blr
588e70
 
588e70
@@ -213,9 +300,32 @@ L(tail10):
588e70
 L(tail11):
588e70
 	TAIL(v10,160)
588e70
 
588e70
-END (STRLEN)
588e70
+#ifdef USE_AS_RAWMEMCHR
588e70
+	.p2align  5
588e70
+L(tail12):
588e70
+	TAIL(v0,176)
588e70
+
588e70
+	.p2align  5
588e70
+L(tail13):
588e70
+	TAIL(v1,192)
588e70
+
588e70
+	.p2align  5
588e70
+L(tail14):
588e70
+	TAIL(v2,208)
588e70
+
588e70
+	.p2align  5
588e70
+L(tail15):
588e70
+	TAIL(v3,224)
588e70
+#endif
588e70
+
588e70
+END (FUNCNAME)
588e70
 
588e70
-#ifdef DEFINE_STRLEN_HIDDEN_DEF
588e70
+#ifdef USE_AS_RAWMEMCHR
588e70
+weak_alias (__rawmemchr,rawmemchr)
588e70
+libc_hidden_builtin_def (__rawmemchr)
588e70
+#else
588e70
+# ifdef DEFINE_STRLEN_HIDDEN_DEF
588e70
 weak_alias (__strlen, strlen)
588e70
 libc_hidden_builtin_def (strlen)
588e70
+# endif
588e70
 #endif
588e70
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
588e70
index 1d517698429e1230..ac2446aca62cc4ab 100644
588e70
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
588e70
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
588e70
@@ -33,9 +33,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
588e70
 
588e70
 ifneq (,$(filter %le,$(config-machine)))
588e70
 sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
588e70
+		   rawmemchr-power9 rawmemchr-power10 \
588e70
 		   strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
588e70
-		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
588e70
-		   strlen-power10
588e70
+		   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
588e70
 endif
588e70
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
588e70
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
588e70
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
588e70
index 6e36659d1903448a..127af84b32a8196f 100644
588e70
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
588e70
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
588e70
@@ -257,6 +257,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
588e70
   /* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c.  */
588e70
   IFUNC_IMPL (i, name, rawmemchr,
588e70
 #ifdef __LITTLE_ENDIAN__
588e70
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
588e70
+			      (hwcap2 & PPC_FEATURE2_ARCH_3_1)
588e70
+                              && (hwcap & PPC_FEATURE_HAS_VSX),
588e70
+                              __rawmemchr_power10)
588e70
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
588e70
 			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
588e70
 			      __rawmemchr_power9)
588e70
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
588e70
new file mode 100644
588e70
index 0000000000000000..bf1ed7e1941f922d
588e70
--- /dev/null
588e70
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
588e70
@@ -0,0 +1,21 @@
588e70
+/* Optimized rawmemchr implementation for PowerPC64/POWER10.
588e70
+   Copyright (C) 2021 Free Software Foundation, Inc.
588e70
+   This file is part of the GNU C Library.
588e70
+
588e70
+   The GNU C Library is free software; you can redistribute it and/or
588e70
+   modify it under the terms of the GNU Lesser General Public
588e70
+   License as published by the Free Software Foundation; either
588e70
+   version 2.1 of the License, or (at your option) any later version.
588e70
+
588e70
+   The GNU C Library is distributed in the hope that it will be useful,
588e70
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
588e70
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
588e70
+   Lesser General Public License for more details.
588e70
+
588e70
+   You should have received a copy of the GNU Lesser General Public
588e70
+   License along with the GNU C Library; if not, see
588e70
+   <https://www.gnu.org/licenses/>.  */
588e70
+
588e70
+#define RAWMEMCHR __rawmemchr_power10
588e70
+
588e70
+#include <sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S>
588e70
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
588e70
index 2a7ae5a1ed02e556..369d6359e8987052 100644
588e70
--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
588e70
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
588e70
@@ -26,6 +26,7 @@ extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden;
588e70
 extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden;
588e70
 # ifdef __LITTLE_ENDIAN__
588e70
 extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
588e70
+extern __typeof (__rawmemchr) __rawmemchr_power10 attribute_hidden;
588e70
 # endif
588e70
 
588e70
 # undef __rawmemchr
588e70
@@ -34,6 +35,9 @@ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
588e70
    ifunc symbol properly.  */
588e70
 libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
588e70
 # ifdef __LITTLE_ENDIAN__
588e70
+		     (hwcap2 & PPC_FEATURE2_ARCH_3_1)
588e70
+		     && (hwcap & PPC_FEATURE_HAS_VSX)
588e70
+		     ? __rawmemchr_power10 :
588e70
 		       (hwcap2 & PPC_FEATURE2_ARCH_3_00)
588e70
 		       ? __rawmemchr_power9 :
588e70
 # endif