e354a5
commit 39037048502d52ab6422c18f2d178d6228d2c7b9
e354a5
Author: Anton Blanchard via Libc-alpha <libc-alpha@sourceware.org>
e354a5
Date:   Thu May 14 09:00:26 2020 +1000
e354a5
e354a5
    powerpc: Optimized strcpy for POWER9
e354a5
    
e354a5
    This version uses VSX store vector with length instructions and is
e354a5
    significantly faster on small strings and relatively unaligned large
e354a5
    strings, compared to the POWER8 version. A few examples:
e354a5
    
e354a5
                                            __strcpy_power9  __strcpy_power8
e354a5
    Length   16, alignments in bytes  0/ 0: 2.52454          4.62695
e354a5
    Length  412, alignments in bytes  4/ 0: 11.6             22.9185
e354a5
e354a5
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
e354a5
new file mode 100644
e354a5
index 0000000000000000..5749228054667b2d
e354a5
--- /dev/null
e354a5
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
e354a5
@@ -0,0 +1,144 @@
e354a5
+/* Optimized strcpy implementation for PowerPC64/POWER9.
e354a5
+   Copyright (C) 2020 Free Software Foundation, Inc.
e354a5
+   This file is part of the GNU C Library.
e354a5
+
e354a5
+   The GNU C Library is free software; you can redistribute it and/or
e354a5
+   modify it under the terms of the GNU Lesser General Public
e354a5
+   License as published by the Free Software Foundation; either
e354a5
+   version 2.1 of the License, or (at your option) any later version.
e354a5
+
e354a5
+   The GNU C Library is distributed in the hope that it will be useful,
e354a5
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
e354a5
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
e354a5
+   Lesser General Public License for more details.
e354a5
+
e354a5
+   You should have received a copy of the GNU Lesser General Public
e354a5
+   License along with the GNU C Library; if not, see
e354a5
+   <https://www.gnu.org/licenses/>.  */
e354a5
+
e354a5
+#include <sysdep.h>
e354a5
+
e354a5
+#ifndef STRCPY
e354a5
+# define STRCPY strcpy
e354a5
+#endif
e354a5
+
e354a5
+/* Implements the function
e354a5
+
e354a5
+   char * [r3] strcpy (char *dest [r3], const char *src [r4])
e354a5
+
e354a5
+   The implementation can load bytes past a null terminator, but only
e354a5
+   up to the next 16B boundary, so it never crosses a page.  */
e354a5
+
e354a5
+.machine power9
e354a5
+ENTRY_TOCLESS (STRCPY, 4)
e354a5
+	CALL_MCOUNT 2
e354a5
+
e354a5
+	/* Empty-string fast path: copy the first byte, return if it is null  */
e354a5
+	lbz	r0,0(r4)
e354a5
+	stb	r0,0(r3)
e354a5
+	cmpwi	r0,0
e354a5
+	beqlr
e354a5
+
e354a5
+	addi	r4,r4,1		/* First byte is already copied  */
e354a5
+	addi	r11,r3,1	/* Dest cursor; r3 is left untouched as the return value  */
e354a5
+
e354a5
+	vspltisb v18,0		/* Zeroes in v18  */
e354a5
+
e354a5
+	neg	r5,r4
e354a5
+	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
e354a5
+
e354a5
+	/* Get source 16B aligned  */
e354a5
+	lvx	v0,0,r4
e354a5
+	lvsr	v1,0,r4
e354a5
+	vperm	v0,v18,v0,v1
e354a5
+
e354a5
+	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
e354a5
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
e354a5
+	addi	r8,r8,1		/* Add null terminator  */
e354a5
+
e354a5
+	/* r8 = bytes including null
e354a5
+	   r9 = bytes to get source 16B aligned
e354a5
+	   if r8 > r9
e354a5
+	      no null, copy r9 bytes
e354a5
+	   else
e354a5
+	      there is a null, copy r8 bytes and return.  */
e354a5
+	cmpd	r8,r9
e354a5
+	bgt	L(no_null)
e354a5
+
e354a5
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
e354a5
+	stxvl	32+v0,r11,r10	/* Partial store  */
e354a5
+
e354a5
+	blr
e354a5
+
e354a5
+L(no_null):
e354a5
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
e354a5
+	stxvl	32+v0,r11,r10	/* Partial store  */
e354a5
+
e354a5
+	add	r4,r4,r9	/* Source is now 16B aligned  */
e354a5
+	add	r11,r11,r9	/* Advance dest by the same count  */
e354a5
+
e354a5
+L(loop):
e354a5
+	lxv	32+v0,0(r4)
e354a5
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
e354a5
+	bne	cr6,L(tail1)
e354a5
+
e354a5
+	lxv	32+v1,16(r4)
e354a5
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
e354a5
+	bne	cr6,L(tail2)
e354a5
+
e354a5
+	lxv	32+v2,32(r4)
e354a5
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
e354a5
+	bne	cr6,L(tail3)
e354a5
+
e354a5
+	lxv	32+v3,48(r4)
e354a5
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
e354a5
+	bne	cr6,L(tail4)
e354a5
+
e354a5
+	stxv	32+v0,0(r11)	/* No null in these 64 bytes, store them all  */
e354a5
+	stxv	32+v1,16(r11)
e354a5
+	stxv	32+v2,32(r11)
e354a5
+	stxv	32+v3,48(r11)
e354a5
+
e354a5
+	addi	r4,r4,64
e354a5
+	addi	r11,r11,64
e354a5
+
e354a5
+	b	L(loop)
e354a5
+
e354a5
+L(tail1):
e354a5
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
e354a5
+	addi	r8,r8,1		/* Add null terminator  */
e354a5
+	sldi	r9,r8,56	/* stxvl wants size in top 8 bits  */
e354a5
+	stxvl	32+v0,r11,r9	/* Partial store  */
e354a5
+	blr
e354a5
+
e354a5
+L(tail2):
e354a5
+	stxv	32+v0,0(r11)	/* v0 held no null, store it whole  */
e354a5
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
e354a5
+	addi	r8,r8,1		/* Add null terminator  */
e354a5
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
e354a5
+	addi	r11,r11,16	/* Skip past the vector stored above  */
e354a5
+	stxvl	32+v1,r11,r10	/* Partial store  */
e354a5
+	blr
e354a5
+
e354a5
+L(tail3):
e354a5
+	stxv	32+v0,0(r11)	/* v0 and v1 held no null, store them whole  */
e354a5
+	stxv	32+v1,16(r11)
e354a5
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
e354a5
+	addi	r8,r8,1		/* Add null terminator  */
e354a5
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
e354a5
+	addi	r11,r11,32	/* Skip past the vectors stored above  */
e354a5
+	stxvl	32+v2,r11,r10	/* Partial store  */
e354a5
+	blr
e354a5
+
e354a5
+L(tail4):
e354a5
+	stxv	32+v0,0(r11)	/* v0, v1 and v2 held no null, store them whole  */
e354a5
+	stxv	32+v1,16(r11)
e354a5
+	stxv	32+v2,32(r11)
e354a5
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
e354a5
+	addi	r8,r8,1		/* Add null terminator  */
e354a5
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
e354a5
+	addi	r11,r11,48	/* Skip past the vectors stored above  */
e354a5
+	stxvl	32+v3,r11,r10	/* Partial store  */
e354a5
+	blr
e354a5
+END (STRCPY)
e354a5
+libc_hidden_builtin_def (strcpy)
e354a5
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
e354a5
index 963ea84dbfa98c74..17057bcbd694a710 100644
e354a5
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
e354a5
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
e354a5
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
e354a5
 		   strncase-power8
e354a5
 
e354a5
 ifneq (,$(filter %le,$(config-machine)))
e354a5
-sysdep_routines += strcmp-power9 strncmp-power9
e354a5
+sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9
e354a5
 endif
e354a5
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
e354a5
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
e354a5
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
e354a5
index 1d374f2ae48165bd..2857fa8f36599afd 100644
e354a5
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
e354a5
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
e354a5
@@ -85,6 +85,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
e354a5
 
e354a5
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
e354a5
   IFUNC_IMPL (i, name, strcpy,
e354a5
+#ifdef __LITTLE_ENDIAN__
e354a5
+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
e354a5
+			      __strcpy_power9)
e354a5
+#endif
e354a5
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
e354a5
 			      __strcpy_power8)
e354a5
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
e354a5
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
e354a5
new file mode 100644
e354a5
index 0000000000000000..d22aa0a8d690cad7
e354a5
--- /dev/null
e354a5
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
e354a5
@@ -0,0 +1,26 @@
e354a5
+/* Optimized strcpy implementation for POWER9/PPC64.
e354a5
+   Copyright (C) 2016-2020 Free Software Foundation, Inc.
e354a5
+   This file is part of the GNU C Library.
e354a5
+
e354a5
+   The GNU C Library is free software; you can redistribute it and/or
e354a5
+   modify it under the terms of the GNU Lesser General Public
e354a5
+   License as published by the Free Software Foundation; either
e354a5
+   version 2.1 of the License, or (at your option) any later version.
e354a5
+
e354a5
+   The GNU C Library is distributed in the hope that it will be useful,
e354a5
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
e354a5
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
e354a5
+   Lesser General Public License for more details.
e354a5
+
e354a5
+   You should have received a copy of the GNU Lesser General Public
e354a5
+   License along with the GNU C Library; if not, see
e354a5
+   <https://www.gnu.org/licenses/>.  */
e354a5
+
e354a5
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
e354a5
+#define STRCPY __strcpy_power9
e354a5
+
e354a5
+#undef libc_hidden_builtin_def
e354a5
+#define libc_hidden_builtin_def(name)
e354a5
+
e354a5
+#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S>
e354a5
+#endif
e354a5
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
e354a5
index b18a92a62a526d9c..88826392be4bdf48 100644
e354a5
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
e354a5
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
e354a5
@@ -25,9 +25,16 @@
e354a5
 extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
e354a5
 extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
e354a5
 extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
e354a5
+# ifdef __LITTLE_ENDIAN__
e354a5
+extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
e354a5
+# endif
e354a5
 #undef strcpy
e354a5
 
e354a5
 libc_ifunc_redirected (__redirect_strcpy, strcpy,
e354a5
+# ifdef __LITTLE_ENDIAN__
e354a5
+			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
e354a5
+			? __strcpy_power9 :
e354a5
+# endif
e354a5
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
e354a5
 		       ? __strcpy_power8
e354a5
 		       : (hwcap & PPC_FEATURE_HAS_VSX)