446cf2
commit 39037048502d52ab6422c18f2d178d6228d2c7b9
446cf2
Author: Anton Blanchard via Libc-alpha <libc-alpha@sourceware.org>
446cf2
Date:   Thu May 14 09:00:26 2020 +1000
446cf2
446cf2
    powerpc: Optimized strcpy for POWER9
446cf2
    
446cf2
    This version uses VSX store vector with length instructions and is
446cf2
    significantly faster on small strings and relatively unaligned large
446cf2
    strings, compared to the POWER8 version. A few examples:
446cf2
    
446cf2
                                            __strcpy_power9  __strcpy_power8
446cf2
    Length   16, alignments in bytes  0/ 0: 2.52454          4.62695
446cf2
    Length  412, alignments in bytes  4/ 0: 11.6             22.9185
446cf2
446cf2
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
446cf2
new file mode 100644
446cf2
index 0000000000000000..5749228054667b2d
446cf2
--- /dev/null
446cf2
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
446cf2
@@ -0,0 +1,144 @@
446cf2
+/* Optimized strcpy implementation for PowerPC64/POWER9.
446cf2
+   Copyright (C) 2020 Free Software Foundation, Inc.
446cf2
+   This file is part of the GNU C Library.
446cf2
+
446cf2
+   The GNU C Library is free software; you can redistribute it and/or
446cf2
+   modify it under the terms of the GNU Lesser General Public
446cf2
+   License as published by the Free Software Foundation; either
446cf2
+   version 2.1 of the License, or (at your option) any later version.
446cf2
+
446cf2
+   The GNU C Library is distributed in the hope that it will be useful,
446cf2
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
446cf2
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
446cf2
+   Lesser General Public License for more details.
446cf2
+
446cf2
+   You should have received a copy of the GNU Lesser General Public
446cf2
+   License along with the GNU C Library; if not, see
446cf2
+   <https://www.gnu.org/licenses/>.  */
446cf2
+
446cf2
+#include <sysdep.h>
446cf2
+
446cf2
+#ifndef STRCPY
446cf2
+# define STRCPY strcpy
446cf2
+#endif
446cf2
+
446cf2
+/* Implements the function
446cf2
+
446cf2
+   char * [r3] strcpy (char *dest [r3], const char *src [r4])
446cf2
+
446cf2
+   The implementation can load bytes past a null terminator, but only
446cf2
+   up to the next 16B boundary, so it never crosses a page.  */
446cf2
+
446cf2
+.machine power9
446cf2
+ENTRY_TOCLESS (STRCPY, 4)
446cf2
+	CALL_MCOUNT 2
446cf2
+
446cf2
+	/* Empty string optimisation  */
446cf2
+	lbz	r0,0(r4)
446cf2
+	stb	r0,0(r3)		/* Always copy the first byte  */
446cf2
+	cmpwi	r0,0
446cf2
+	beqlr			/* Only the terminator: return dest  */
446cf2
+
446cf2
+	addi	r4,r4,1		/* Skip the byte copied above  */
446cf2
+	addi	r11,r3,1		/* r11 = store pointer, r3 is preserved  */
446cf2
+
446cf2
+	vspltisb v18,0		/* Zeroes in v18  */
446cf2
+
446cf2
+	neg	r5,r4
446cf2
+	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
446cf2
+
446cf2
+	/* Get source 16B aligned  */
446cf2
+	lvx	v0,0,r4
446cf2
+	lvsr	v1,0,r4
446cf2
+	vperm	v0,v18,v0,v1
446cf2
+
446cf2
+	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
446cf2
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
446cf2
+	addi	r8,r8,1		/* Add null terminator  */
446cf2
+
446cf2
+	/* r8 = bytes including null
446cf2
+	   r9 = bytes to get source 16B aligned
446cf2
+	   if r8 > r9
446cf2
+	      no null, copy r9 bytes
446cf2
+	   else
446cf2
+	      there is a null, copy r8 bytes and return.  */
446cf2
+	cmpd	r8,r9
446cf2
+	bgt	L(no_null)
446cf2
+
446cf2
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
446cf2
+	stxvl	32+v0,r11,r10	/* Partial store  */
446cf2
+
446cf2
+	blr
446cf2
+
446cf2
+L(no_null):
446cf2
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
446cf2
+	stxvl	32+v0,r11,r10	/* Partial store  */
446cf2
+
446cf2
+	add	r4,r4,r9		/* Source is now 16B aligned  */
446cf2
+	add	r11,r11,r9	/* Advance dest by the same amount  */
446cf2
+
446cf2
+L(loop):
446cf2
+	lxv	32+v0,0(r4)	/* 64B per pass, in 16B pieces  */
446cf2
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
446cf2
+	bne	cr6,L(tail1)
446cf2
+
446cf2
+	lxv	32+v1,16(r4)
446cf2
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
446cf2
+	bne	cr6,L(tail2)
446cf2
+
446cf2
+	lxv	32+v2,32(r4)
446cf2
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
446cf2
+	bne	cr6,L(tail3)
446cf2
+
446cf2
+	lxv	32+v3,48(r4)
446cf2
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
446cf2
+	bne	cr6,L(tail4)
446cf2
+
446cf2
+	stxv	32+v0,0(r11)	/* No null in 64B: store it all  */
446cf2
+	stxv	32+v1,16(r11)
446cf2
+	stxv	32+v2,32(r11)
446cf2
+	stxv	32+v3,48(r11)
446cf2
+
446cf2
+	addi	r4,r4,64
446cf2
+	addi	r11,r11,64
446cf2
+
446cf2
+	b	L(loop)
446cf2
+
446cf2
+L(tail1):
446cf2
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
446cf2
+	addi	r8,r8,1		/* Add null terminator  */
446cf2
+	sldi	r9,r8,56	/* stxvl wants size in top 8 bits  */
446cf2
+	stxvl	32+v0,r11,r9	/* Partial store  */
446cf2
+	blr
446cf2
+
446cf2
+L(tail2):
446cf2
+	stxv	32+v0,0(r11)	/* First 16B hold no null  */
446cf2
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
446cf2
+	addi	r8,r8,1		/* Add null terminator  */
446cf2
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
446cf2
+	addi	r11,r11,16	/* Step past the full store  */
446cf2
+	stxvl	32+v1,r11,r10	/* Partial store  */
446cf2
+	blr
446cf2
+
446cf2
+L(tail3):
446cf2
+	stxv	32+v0,0(r11)	/* First 32B hold no null  */
446cf2
+	stxv	32+v1,16(r11)
446cf2
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
446cf2
+	addi	r8,r8,1		/* Add null terminator  */
446cf2
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
446cf2
+	addi	r11,r11,32	/* Step past the full stores  */
446cf2
+	stxvl	32+v2,r11,r10	/* Partial store  */
446cf2
+	blr
446cf2
+
446cf2
+L(tail4):
446cf2
+	stxv	32+v0,0(r11)	/* First 48B hold no null  */
446cf2
+	stxv	32+v1,16(r11)
446cf2
+	stxv	32+v2,32(r11)
446cf2
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
446cf2
+	addi	r8,r8,1		/* Add null terminator  */
446cf2
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
446cf2
+	addi	r11,r11,48	/* Step past the full stores  */
446cf2
+	stxvl	32+v3,r11,r10	/* Partial store  */
446cf2
+	blr
446cf2
+END (STRCPY)
446cf2
+libc_hidden_builtin_def (strcpy)
446cf2
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
446cf2
index 963ea84dbfa98c74..17057bcbd694a710 100644
446cf2
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
446cf2
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
446cf2
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
446cf2
 		   strncase-power8
446cf2
 
446cf2
 ifneq (,$(filter %le,$(config-machine)))
446cf2
-sysdep_routines += strcmp-power9 strncmp-power9
446cf2
+sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9
446cf2
 endif
446cf2
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
446cf2
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
446cf2
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
446cf2
index 1d374f2ae48165bd..2857fa8f36599afd 100644
446cf2
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
446cf2
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
446cf2
@@ -85,6 +85,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
446cf2
 
446cf2
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
446cf2
   IFUNC_IMPL (i, name, strcpy,
446cf2
+#ifdef __LITTLE_ENDIAN__
446cf2
+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
446cf2
+			      __strcpy_power9)
446cf2
+#endif
446cf2
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
446cf2
 			      __strcpy_power8)
446cf2
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
446cf2
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
446cf2
new file mode 100644
446cf2
index 0000000000000000..d22aa0a8d690cad7
446cf2
--- /dev/null
446cf2
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
446cf2
@@ -0,0 +1,26 @@
446cf2
+/* Optimized strcpy implementation for POWER9/PPC64.
446cf2
+   Copyright (C) 2016-2020 Free Software Foundation, Inc.
446cf2
+   This file is part of the GNU C Library.
446cf2
+
446cf2
+   The GNU C Library is free software; you can redistribute it and/or
446cf2
+   modify it under the terms of the GNU Lesser General Public
446cf2
+   License as published by the Free Software Foundation; either
446cf2
+   version 2.1 of the License, or (at your option) any later version.
446cf2
+
446cf2
+   The GNU C Library is distributed in the hope that it will be useful,
446cf2
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
446cf2
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
446cf2
+   Lesser General Public License for more details.
446cf2
+
446cf2
+   You should have received a copy of the GNU Lesser General Public
446cf2
+   License along with the GNU C Library; if not, see
446cf2
+   <https://www.gnu.org/licenses/>.  */
446cf2
+
446cf2
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
446cf2
+#define STRCPY __strcpy_power9
446cf2
+
446cf2
+#undef libc_hidden_builtin_def	/* Included file must not emit its hidden def  */
446cf2
+#define libc_hidden_builtin_def(name)
446cf2
+
446cf2
+#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S>
446cf2
+#endif /* __LITTLE_ENDIAN__ && IS_IN (libc)  */
446cf2
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
446cf2
index b18a92a62a526d9c..88826392be4bdf48 100644
446cf2
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
446cf2
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
446cf2
@@ -25,9 +25,16 @@
446cf2
 extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
446cf2
 extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
446cf2
 extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
446cf2
+# ifdef __LITTLE_ENDIAN__
446cf2
+extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
446cf2
+# endif
446cf2
 #undef strcpy
446cf2
 
446cf2
 libc_ifunc_redirected (__redirect_strcpy, strcpy,
446cf2
+# ifdef __LITTLE_ENDIAN__
446cf2
+			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
446cf2
+			? __strcpy_power9 :
446cf2
+# endif
446cf2
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
446cf2
 		       ? __strcpy_power8
446cf2
 		       : (hwcap & PPC_FEATURE_HAS_VSX)