3b386f
commit 39037048502d52ab6422c18f2d178d6228d2c7b9
3b386f
Author: Anton Blanchard via Libc-alpha <libc-alpha@sourceware.org>
3b386f
Date:   Thu May 14 09:00:26 2020 +1000
3b386f
3b386f
    powerpc: Optimized strcpy for POWER9
3b386f
    
3b386f
    This version uses VSX store vector with length instructions and is
3b386f
    significantly faster on small strings and relatively unaligned large
3b386f
    strings, compared to the POWER8 version. A few examples:
3b386f
    
3b386f
                                            __strcpy_power9  __strcpy_power8
3b386f
    Length   16, alignments in bytes  0/ 0: 2.52454          4.62695
3b386f
    Length  412, alignments in bytes  4/ 0: 11.6             22.9185
3b386f
3b386f
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
3b386f
new file mode 100644
3b386f
index 0000000000000000..5749228054667b2d
3b386f
--- /dev/null
3b386f
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
3b386f
@@ -0,0 +1,144 @@
3b386f
+/* Optimized strcpy implementation for PowerPC64/POWER9.
3b386f
+   Copyright (C) 2020 Free Software Foundation, Inc.
3b386f
+   This file is part of the GNU C Library.
3b386f
+
3b386f
+   The GNU C Library is free software; you can redistribute it and/or
3b386f
+   modify it under the terms of the GNU Lesser General Public
3b386f
+   License as published by the Free Software Foundation; either
3b386f
+   version 2.1 of the License, or (at your option) any later version.
3b386f
+
3b386f
+   The GNU C Library is distributed in the hope that it will be useful,
3b386f
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
3b386f
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
3b386f
+   Lesser General Public License for more details.
3b386f
+
3b386f
+   You should have received a copy of the GNU Lesser General Public
3b386f
+   License along with the GNU C Library; if not, see
3b386f
+   <https://www.gnu.org/licenses/>.  */
3b386f
+
3b386f
+#include <sysdep.h>
3b386f
+
3b386f
+#ifndef STRCPY
3b386f
+# define STRCPY strcpy
3b386f
+#endif
3b386f
+
3b386f
+/* Implements the function
3b386f
+
3b386f
+   char * [r3] strcpy (char *dest [r3], const char *src [r4])
3b386f
+
3b386f
+   The implementation can load bytes past a null terminator, but only
3b386f
+   up to the next 16B boundary, so it never crosses a page.  */
3b386f
+
3b386f
+.machine power9
3b386f
+ENTRY_TOCLESS (STRCPY, 4)
3b386f
+	CALL_MCOUNT 2
3b386f
+
3b386f
+	/* Empty string fast path: copy the first byte, return if it is null  */
3b386f
+	lbz	r0,0(r4)
3b386f
+	stb	r0,0(r3)
3b386f
+	cmpwi	r0,0
3b386f
+	beqlr
3b386f
+
3b386f
+	addi	r4,r4,1		/* Skip the byte already copied  */
3b386f
+	addi	r11,r3,1	/* r11 = working dest; r3 is kept as the return value  */
3b386f
+
3b386f
+	vspltisb v18,0		/* Zeroes in v18  */
3b386f
+
3b386f
+	neg	r5,r4
3b386f
+	rldicl	r9,r5,0,60	/* r9 = (-src) & 15: bytes to get source 16B aligned  */
3b386f
+
3b386f
+	/* Read the source bytes up to the next 16B boundary  */
3b386f
+	lvx	v0,0,r4		/* Aligned load: low 4 address bits are ignored  */
3b386f
+	lvsr	v1,0,r4		/* Permute control from the source misalignment  */
3b386f
+	vperm	v0,v18,v0,v1	/* Combine loaded bytes with zeroes from v18  */
3b386f
+
3b386f
+	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
3b386f
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
3b386f
+	addi	r8,r8,1		/* Add null terminator  */
3b386f
+
3b386f
+	/* r8 = bytes including null
3b386f
+	   r9 = bytes to get source 16B aligned
3b386f
+	   if r8 > r9
3b386f
+	      no null before the boundary, copy r9 bytes and enter the loop
3b386f
+	   else
3b386f
+	      there is a null, copy r8 bytes and return.  */
3b386f
+	cmpd	r8,r9
3b386f
+	bgt	L(no_null)
3b386f
+
3b386f
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
3b386f
+	stxvl	32+v0,r11,r10	/* Partial store, null included  */
3b386f
+
3b386f
+	blr
3b386f
+
3b386f
+L(no_null):
3b386f
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
3b386f
+	stxvl	32+v0,r11,r10	/* Partial store of the r9 leading bytes  */
3b386f
+
3b386f
+	add	r4,r4,r9	/* Source is now 16B aligned  */
3b386f
+	add	r11,r11,r9
3b386f
+
3b386f
+L(loop):
3b386f
+	lxv	32+v0,0(r4)
3b386f
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
3b386f
+	bne	cr6,L(tail1)
3b386f
+
3b386f
+	lxv	32+v1,16(r4)
3b386f
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
3b386f
+	bne	cr6,L(tail2)
3b386f
+
3b386f
+	lxv	32+v2,32(r4)
3b386f
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
3b386f
+	bne	cr6,L(tail3)
3b386f
+
3b386f
+	lxv	32+v3,48(r4)
3b386f
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
3b386f
+	bne	cr6,L(tail4)
3b386f
+
3b386f
+	stxv	32+v0,0(r11)	/* No null in these 64B: store them all  */
3b386f
+	stxv	32+v1,16(r11)
3b386f
+	stxv	32+v2,32(r11)
3b386f
+	stxv	32+v3,48(r11)
3b386f
+
3b386f
+	addi	r4,r4,64
3b386f
+	addi	r11,r11,64
3b386f
+
3b386f
+	b	L(loop)
3b386f
+
3b386f
+L(tail1):
3b386f
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
3b386f
+	addi	r8,r8,1		/* Add null terminator  */
3b386f
+	sldi	r9,r8,56	/* stxvl wants size in top 8 bits  */
3b386f
+	stxvl	32+v0,r11,r9	/* Partial store  */
3b386f
+	blr
3b386f
+
3b386f
+L(tail2):
3b386f
+	stxv	32+v0,0(r11)	/* First 16B had no null: store whole  */
3b386f
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
3b386f
+	addi	r8,r8,1		/* Add null terminator  */
3b386f
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
3b386f
+	addi	r11,r11,16
3b386f
+	stxvl	32+v1,r11,r10	/* Partial store  */
3b386f
+	blr
3b386f
+
3b386f
+L(tail3):
3b386f
+	stxv	32+v0,0(r11)	/* First 32B had no null: store whole  */
3b386f
+	stxv	32+v1,16(r11)
3b386f
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
3b386f
+	addi	r8,r8,1		/* Add null terminator  */
3b386f
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
3b386f
+	addi	r11,r11,32
3b386f
+	stxvl	32+v2,r11,r10	/* Partial store  */
3b386f
+	blr
3b386f
+
3b386f
+L(tail4):
3b386f
+	stxv	32+v0,0(r11)	/* First 48B had no null: store whole  */
3b386f
+	stxv	32+v1,16(r11)
3b386f
+	stxv	32+v2,32(r11)
3b386f
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
3b386f
+	addi	r8,r8,1		/* Add null terminator  */
3b386f
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
3b386f
+	addi	r11,r11,48
3b386f
+	stxvl	32+v3,r11,r10	/* Partial store  */
3b386f
+	blr
3b386f
+END (STRCPY)
3b386f
+libc_hidden_builtin_def (strcpy)
3b386f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
3b386f
index 963ea84dbfa98c74..17057bcbd694a710 100644
3b386f
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
3b386f
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
3b386f
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
3b386f
 		   strncase-power8
3b386f
 
3b386f
 ifneq (,$(filter %le,$(config-machine)))
3b386f
-sysdep_routines += strcmp-power9 strncmp-power9
3b386f
+sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9
3b386f
 endif
3b386f
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
3b386f
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
3b386f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
3b386f
index 1d374f2ae48165bd..2857fa8f36599afd 100644
3b386f
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
3b386f
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
3b386f
@@ -85,6 +85,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
3b386f
 
3b386f
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
3b386f
   IFUNC_IMPL (i, name, strcpy,
3b386f
+#ifdef __LITTLE_ENDIAN__
3b386f
+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
3b386f
+			      __strcpy_power9)
3b386f
+#endif
3b386f
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
3b386f
 			      __strcpy_power8)
3b386f
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
3b386f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
3b386f
new file mode 100644
3b386f
index 0000000000000000..d22aa0a8d690cad7
3b386f
--- /dev/null
3b386f
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
3b386f
@@ -0,0 +1,26 @@
3b386f
+/* Optimized strcpy implementation for POWER9/PPC64.
3b386f
+   Copyright (C) 2016-2020 Free Software Foundation, Inc.
3b386f
+   This file is part of the GNU C Library.
3b386f
+
3b386f
+   The GNU C Library is free software; you can redistribute it and/or
3b386f
+   modify it under the terms of the GNU Lesser General Public
3b386f
+   License as published by the Free Software Foundation; either
3b386f
+   version 2.1 of the License, or (at your option) any later version.
3b386f
+
3b386f
+   The GNU C Library is distributed in the hope that it will be useful,
3b386f
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
3b386f
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
3b386f
+   Lesser General Public License for more details.
3b386f
+
3b386f
+   You should have received a copy of the GNU Lesser General Public
3b386f
+   License along with the GNU C Library; if not, see
3b386f
+   <https://www.gnu.org/licenses/>.  */
3b386f
+
3b386f
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
3b386f
+#define STRCPY __strcpy_power9	/* Symbol name used by the included strcpy.S  */
3b386f
+
3b386f
+#undef libc_hidden_builtin_def
3b386f
+#define libc_hidden_builtin_def(name)	/* Expands to nothing for this variant  */
3b386f
+
3b386f
+#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S>
3b386f
+#endif /* __LITTLE_ENDIAN__ && IS_IN (libc)  */
3b386f
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
3b386f
index b18a92a62a526d9c..88826392be4bdf48 100644
3b386f
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
3b386f
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
3b386f
@@ -25,9 +25,16 @@
3b386f
 extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
3b386f
 extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
3b386f
 extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
3b386f
+# ifdef __LITTLE_ENDIAN__
3b386f
+extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
3b386f
+# endif
3b386f
 #undef strcpy
3b386f
 
3b386f
 libc_ifunc_redirected (__redirect_strcpy, strcpy,
3b386f
+# ifdef __LITTLE_ENDIAN__
3b386f
+			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
3b386f
+			? __strcpy_power9 :
3b386f
+# endif
3b386f
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
3b386f
 		       ? __strcpy_power8
3b386f
 		       : (hwcap & PPC_FEATURE_HAS_VSX)