Blob Blame History Raw
commit 39037048502d52ab6422c18f2d178d6228d2c7b9
Author: Anton Blanchard via Libc-alpha <libc-alpha@sourceware.org>
Date:   Thu May 14 09:00:26 2020 +1000

    powerpc: Optimized strcpy for POWER9
    
    This version uses VSX store vector with length instructions and is
    significantly faster on small strings and relatively unaligned large
    strings, compared to the POWER8 version. A few examples:
    
                                            __strcpy_power9  __strcpy_power8
    Length   16, alignments in bytes  0/ 0: 2.52454          4.62695
    Length  412, alignments in bytes  4/ 0: 11.6             22.9185

diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
new file mode 100644
index 0000000000000000..5749228054667b2d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
@@ -0,0 +1,144 @@
+/* Optimized strcpy implementation for PowerPC64/POWER9.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRCPY
+# define STRCPY strcpy
+#endif
+
+/* Implements the function
+
+   char * [r3] strcpy (char *dest [r3], const char *src [r4])
+
+   The implementation can load bytes past a null terminator, but only
+   up to the next 16B boundary, so it never crosses a page.  */
+
+.machine power9
+ENTRY_TOCLESS (STRCPY, 4)
+	CALL_MCOUNT 2
+
+	/* NULL string optimisation  */
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+	cmpwi	r0,0
+	beqlr
+
+	addi	r4,r4,1
+	addi	r11,r3,1
+
+	vspltisb v18,0		/* Zeroes in v18  */
+
+	neg	r5,r4
+	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
+
+	/* Get source 16B aligned  */
+	lvx	v0,0,r4
+	lvsr	v1,0,r4
+	vperm	v0,v18,v0,v1
+
+	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	addi	r8,r8,1		/* Add null terminator  */
+
+	/* r8 = bytes including null
+	   r9 = bytes to get source 16B aligned
+	   if r8 > r9
+	      no null, copy r9 bytes
+	   else
+	      there is a null, copy r8 bytes and return.  */
+	cmpd	r8,r9
+	bgt	L(no_null)
+
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	blr
+
+L(no_null):
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r4,r4,r9
+	add	r11,r11,r9
+
+L(loop):
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	bne	cr6,L(tail1)
+
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	bne	cr6,L(tail2)
+
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	bne	cr6,L(tail3)
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	bne	cr6,L(tail4)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+
+	addi	r4,r4,64
+	addi	r11,r11,64
+
+	b	L(loop)
+
+L(tail1):
+	vctzlsbb r8,v6
+	addi	r8,r8,1
+	sldi	r9,r8,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r9
+	blr
+
+L(tail2):
+	stxv	32+v0,0(r11)
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	addi	r8,r8,1		/* Add null terminator  */
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	blr
+
+L(tail3):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	addi	r8,r8,1		/* Add null terminator  */
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	blr
+
+L(tail4):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	addi	r8,r8,1		/* Add null terminator  */
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	blr
+END (STRCPY)
+libc_hidden_builtin_def (strcpy)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 963ea84dbfa98c74..17057bcbd694a710 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 		   strncase-power8
 
 ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += strcmp-power9 strncmp-power9
+sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 1d374f2ae48165bd..2857fa8f36599afd 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -85,6 +85,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
   IFUNC_IMPL (i, name, strcpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __strcpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strcpy_power8)
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
new file mode 100644
index 0000000000000000..d22aa0a8d690cad7
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
@@ -0,0 +1,26 @@
+/* Optimized strcpy implementation for POWER9/PPC64.
+   Copyright (C) 2016-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRCPY __strcpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
index b18a92a62a526d9c..88826392be4bdf48 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
@@ -25,9 +25,16 @@
 extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
 extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
 extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
+# endif
 #undef strcpy
 
 libc_ifunc_redirected (__redirect_strcpy, strcpy,
+# ifdef __LITTLE_ENDIAN__
+			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
+			? __strcpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strcpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)