Tree - rpms/glibc - CentOS Git server

rpms / glibc

Files

Commit: 82b174bed8a33f1f15b2ab99da6489e9ad15c853

Blob Blame History Raw

 From 0d3555b9b4d5cefe116c32bfa38ac70f1d6c25cb Mon Sep 17 00:00:00 2001
From: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Date: Wed, 11 Nov 2015 17:31:28 -0200
Subject: [PATCH] powerpc: Optimization for strlen for POWER8.
 
This implementation takes advantage of vectorization to improve performance of
the loop over the current strlen implementation for POWER7.
 
(cherry picked from commit 1b045ee53e0b8bed75745b931b33f27d21c9ed22)
---
 ChangeLog                                          |  13 +
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   2 +-
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   2 +
 .../powerpc/powerpc64/multiarch/strlen-power8.S    |  39 +++
 sysdeps/powerpc/powerpc64/multiarch/strlen.c       |   9 +-
 sysdeps/powerpc/powerpc64/power8/strlen.S          | 297 +++++++++++++++++++++
 6 files changed, 358 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strlen.S
 
diff --git a/ChangeLog b/ChangeLog
index f030b68..e7ea58a 100644
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 7ed56bf..57abe8f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -20,7 +20,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
                   strncpy-power8 strncpy-power7 strncpy-ppc64 \
                   strncat-power7 \
                   strstr-power7 strstr-ppc64 \
-                  strspn-power8 strspn-ppc64 \
+                  strspn-power8 strspn-ppc64 strlen-power8 \
                   rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
                   strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \
                   strncase-power7 strncase_l-power7 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index f6c70ba..583885c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -101,6 +101,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
+             IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+                             __strlen_power8)
              IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
                              __strlen_power7)
              IFUNC_IMPL_ADD (array, i, strlen, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
new file mode 100644
index 0000000..686dc3d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
@@ -0,0 +1,39 @@
+/* Optimized strlen implementation for POWER8.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strlen_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strlen_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strlen_power8)
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strlen_power8)					\
+  END_2(__strlen_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
index 79a53d9..4b400a5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -29,11 +29,14 @@ extern __typeof (__redirect_strlen) __libc_strlen;
 
 extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
 
 libc_ifunc (__libc_strlen,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strlen_power7
-            : __strlen_ppc);
+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+	    ? __strlen_power8 :
+	      (hwcap & PPC_FEATURE_HAS_VSX)
+	      ? __strlen_power7
+	      : __strlen_ppc);
 
 #undef strlen
 strong_alias (__libc_strlen, strlen)
diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S
new file mode 100644
index 0000000..0142747
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strlen.S
@@ -0,0 +1,297 @@
+/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
+   loop.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* TODO: change these to the actual instructions when the minimum required
+   binutils allows it.  */
+#define MFVRD(r,v)	.long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b)	.long (0x1000054c \
+			       | ((t)<<(32-11))	\
+			       | ((a)<<(32-16))	\
+			       | ((b)<<(32-21)) )
+
+/* int [r3] strlen (char *s [r3])  */
+
+/* TODO: change this to .machine power8 when the minimum required binutils
+   allows it.  */
+	.machine  power7
+EALIGN (strlen, 4, 0)
+	CALL_MCOUNT 1
+	dcbt	0,r3
+	clrrdi	r4,r3,3	      /* Align the address to doubleword boundary.  */
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+	li	r0,0	      /* Doubleword with null chars to use
+				 with cmpb.  */
+	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
+	ld	r12,0(r4)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r5,r5,r6
+#else
+	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
+	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
+	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
+	cmpdi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+	bne	cr7,L(done)
+
+	/* For shorter strings (< 64 bytes), we will not use vector registers,
+	   as the overhead isn't worth it.  So, let's use GPRs instead.  This
+	   will be done the same way as we do in the POWER7 implementation.
+	   Let's see if we are aligned to a quadword boundary.  If so, we can
+	   jump to the first (non-vectorized) loop.  Otherwise, we have to
+	   handle the next DWORD first.  */
+	mtcrf	0x01,r4
+	mr	r9,r4
+	addi	r9,r9,8
+	bt	28,L(align64)
+
+	/* Handle the next 8 bytes so we are aligned to a quadword
+	   boundary.  */
+	ldu	r5,8(r4)
+	cmpb	r10,r5,r0
+	cmpdi	cr7,r10,0
+	addi	r9,r9,8
+	bne	cr7,L(done)
+
+L(align64):
+	/* Proceed to the old (POWER7) implementation, checking two doublewords
+	   per iteration.  For the first 56 bytes, we will just check for null
+	   characters.  After that, we will also check if we are 64-byte aligned
+	   so we can jump to the vectorized implementation.  We will unroll
+	   these loops to avoid excessive branching.  */
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	/* Are we 64-byte aligned? If so, jump to the vectorized loop.
+	   Note: aligning to 64-byte will necessarily slow down performance for
+	   strings around 64 bytes in length due to the extra comparisons
+	   required to check alignment for the vectorized loop.  This is a
+	   necessary tradeoff we are willing to take in order to speed up the
+	   calculation for larger strings.  */
+	andi.	r10,r9,63
+	beq	cr0,L(preloop)
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	andi.	r10,r9,63
+	beq	cr0,L(preloop)
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	andi.	r10,r9,63
+	beq	cr0,L(preloop)
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+	bne	cr7,L(dword_zero)
+
+	andi.	r10,r9,63
+	beq	cr0,L(preloop)
+	ld	r6,8(r4)
+	ldu	r5,16(r4)
+	cmpb	r10,r6,r0
+	cmpb	r11,r5,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	addi	r9,r9,16
+
+	/* At this point, we are necessarily 64-byte aligned.  If no zeroes were
+	   found, jump to the vectorized loop.  */
+	beq	cr7,L(preloop)
+
+L(dword_zero):
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a null byte.  */
+
+	cmpdi	cr6,r10,0
+	addi	r4,r4,-8
+	bne	cr6,L(done)
+
+	/* The null byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   length.  */
+
+	mr	r10,r11
+	addi	r4,r4,8
+
+	/* If the null byte was found in the non-vectorized code, compute the
+	   final length.  r10 has the output of the cmpb instruction, that is,
+	   it contains 0xff in the same position as the null byte in the
+	   original doubleword from the string.  Use that to calculate the
+	   length.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
+	andc	r9, r9,r10
+	popcntd	r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	subf	r5,r3,r4
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r5,r0      /* Compute final length.  */
+	blr
+
+	/* Vectorized implementation starts here.  */
+	.p2align  4
+L(preloop):
+	/* Set up for the loop.  */
+	mr	r4,r9
+	li	r7, 16	      /* Load required offsets.  */
+	li	r8, 32
+	li	r9, 48
+	li	r12, 8
+	vxor	v0,v0,v0      /* VR with null chars to use with
+				 vcmpequb.  */
+
+	/* Main loop to look for the end of the string.  We will read in
+	   64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
+	   leverage the icache performance.  */
+	.p2align  5
+L(loop):
+	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
+	lvx	  v2,r4,r7
+	lvx	  v3,r4,r8
+	lvx	  v4,r4,r9
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
+	bne	  cr6,L(vmx_zero)
+
+	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
+	lvx	  v2,r4,r7
+	lvx	  v3,r4,r8
+	lvx	  v4,r4,r9
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
+	bne	  cr6,L(vmx_zero)
+
+	lvx	  v1,r4,r0  /* Load 4 quadwords.  */
+	lvx	  v2,r4,r7
+	lvx	  v3,r4,r8
+	lvx	  v4,r4,r9
+	vminub	  v5,v1,v2  /* Compare and merge into one VR for speed.  */
+	vminub	  v6,v3,v4
+	vminub	  v7,v5,v6
+	vcmpequb. v7,v7,v0  /* Check for NULLs.  */
+	addi	  r4,r4,64  /* Adjust address for the next iteration.  */
+	beq	  cr6,L(loop)
+
+L(vmx_zero):
+	/* OK, we found a null byte.  Let's look for it in the current 64-byte
+	   block and mark it in its corresponding VR.  */
+	vcmpequb  v1,v1,v0
+	vcmpequb  v2,v2,v0
+	vcmpequb  v3,v3,v0
+	vcmpequb  v4,v4,v0
+
+	/* We will now 'compress' the result into a single doubleword, so it
+	   can be moved to a GPR for the final calculation.  First, we
+	   generate an appropriate mask for vbpermq, so we can permute bits into
+	   the first halfword.  */
+	vspltisb  v10,3
+	lvsl	  v11,r0,r0
+	vslb	  v10,v11,v10
+
+	/* Permute the first bit of each byte into bits 48-63.  */
+	VBPERMQ(v1,v1,v10)
+	VBPERMQ(v2,v2,v10)
+	VBPERMQ(v3,v3,v10)
+	VBPERMQ(v4,v4,v10)
+
+	/* Shift each component into its correct position for merging.  */
+#ifdef __LITTLE_ENDIAN__
+	vsldoi  v2,v2,v2,2
+	vsldoi  v3,v3,v3,4
+	vsldoi  v4,v4,v4,6
+#else
+	vsldoi	v1,v1,v1,6
+	vsldoi	v2,v2,v2,4
+	vsldoi	v3,v3,v3,2
+#endif
+
+	/* Merge the results and move to a GPR.  */
+	vor	v1,v2,v1
+	vor	v2,v3,v4
+	vor	v4,v1,v2
+	MFVRD(r10,v4)
+
+	 /* Adjust address to the beginning of the current 64-byte block.  */
+	addi	r4,r4,-64
+
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10,-1    /* Form a mask from trailing zeros.  */
+	andc	r9, r9,r10
+	popcntd	r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	subf	r5,r3,r4
+	add	r3,r5,r0      /* Compute final length.  */
+	blr
+
+END (strlen)
+libc_hidden_builtin_def (strlen)
-- 
2.1.0

	From 0d3555b9b4d5cefe116c32bfa38ac70f1d6c25cb Mon Sep 17 00:00:00 2001
	From: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
	Date: Wed, 11 Nov 2015 17:31:28 -0200
	Subject: [PATCH] powerpc: Optimization for strlen for POWER8.

	This implementation takes advantage of vectorization to improve performance of
	the loop over the current strlen implementation for POWER7.

	(cherry picked from commit 1b045ee53e0b8bed75745b931b33f27d21c9ed22)
	---
	ChangeLog | 13 +
	sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
	.../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 2 +
	.../powerpc/powerpc64/multiarch/strlen-power8.S | 39 +++
	sysdeps/powerpc/powerpc64/multiarch/strlen.c | 9 +-
	sysdeps/powerpc/powerpc64/power8/strlen.S | 297 +++++++++++++++++++++
	6 files changed, 358 insertions(+), 4 deletions(-)
	create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
	create mode 100644 sysdeps/powerpc/powerpc64/power8/strlen.S

	diff --git a/ChangeLog b/ChangeLog
	index f030b68..e7ea58a 100644
	diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
	index 7ed56bf..57abe8f 100644
	--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
	+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
	@@ -20,7 +20,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
	strncpy-power8 strncpy-power7 strncpy-ppc64 \
	strncat-power7 \
	strstr-power7 strstr-ppc64 \
	- strspn-power8 strspn-ppc64 \
	+ strspn-power8 strspn-ppc64 strlen-power8 \
	rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
	strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \
	strncase-power7 strncase_l-power7 \
	diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
	index f6c70ba..583885c 100644
	--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
	+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
	@@ -101,6 +101,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,

	/* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */
	IFUNC_IMPL (i, name, strlen,
	+ IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
	+ __strlen_power8)
	IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
	__strlen_power7)
	IFUNC_IMPL_ADD (array, i, strlen, 1,
	diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
	new file mode 100644
	index 0000000..686dc3d
	--- /dev/null
	+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
	@@ -0,0 +1,39 @@
	+/* Optimized strlen implementation for POWER8.
	+ Copyright (C) 2016 Free Software Foundation, Inc.
	+ This file is part of the GNU C Library.
	+
	+ The GNU C Library is free software; you can redistribute it and/or
	+ modify it under the terms of the GNU Lesser General Public
	+ License as published by the Free Software Foundation; either
	+ version 2.1 of the License, or (at your option) any later version.
	+
	+ The GNU C Library is distributed in the hope that it will be useful,
	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	+ Lesser General Public License for more details.
	+
	+ You should have received a copy of the GNU Lesser General Public
	+ License along with the GNU C Library; if not, see
	+ <http://www.gnu.org/licenses/>. */
	+
	+#include <sysdep.h>
	+
	+#undef EALIGN
	+#define EALIGN(name, alignt, words) \
	+ .section ".text"; \
	+ ENTRY_2(__strlen_power8) \
	+ .align ALIGNARG(alignt); \
	+ EALIGN_W_##words; \
	+ BODY_LABEL(__strlen_power8): \
	+ cfi_startproc; \
	+ LOCALENTRY(__strlen_power8)
	+#undef END
	+#define END(name) \
	+ cfi_endproc; \
	+ TRACEBACK(__strlen_power8) \
	+ END_2(__strlen_power8)
	+
	+#undef libc_hidden_builtin_def
	+#define libc_hidden_builtin_def(name)
	+
	+#include <sysdeps/powerpc/powerpc64/power8/strlen.S>
	diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
	index 79a53d9..4b400a5 100644
	--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
	+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
	@@ -29,11 +29,14 @@ extern __typeof (__redirect_strlen) __libc_strlen;

	extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
	extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
	+extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;

	libc_ifunc (__libc_strlen,
	- (hwcap & PPC_FEATURE_HAS_VSX)
	- ? __strlen_power7
	- : __strlen_ppc);
	+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
	+ ? __strlen_power8 :
	+ (hwcap & PPC_FEATURE_HAS_VSX)
	+ ? __strlen_power7
	+ : __strlen_ppc);

	#undef strlen
	strong_alias (__libc_strlen, strlen)
	diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S
	new file mode 100644
	index 0000000..0142747
	--- /dev/null
	+++ b/sysdeps/powerpc/powerpc64/power8/strlen.S
	@@ -0,0 +1,297 @@
	+/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
	+ loop.
	+ Copyright (C) 2016 Free Software Foundation, Inc.
	+ This file is part of the GNU C Library.
	+
	+ The GNU C Library is free software; you can redistribute it and/or
	+ modify it under the terms of the GNU Lesser General Public
	+ License as published by the Free Software Foundation; either
	+ version 2.1 of the License, or (at your option) any later version.
	+
	+ The GNU C Library is distributed in the hope that it will be useful,
	+ but WITHOUT ANY WARRANTY; without even the implied warranty of
	+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	+ Lesser General Public License for more details.
	+
	+ You should have received a copy of the GNU Lesser General Public
	+ License along with the GNU C Library; if not, see
	+ <http://www.gnu.org/licenses/>. */
	+
	+#include <sysdep.h>
	+
	+/* TODO: change these to the actual instructions when the minimum required
	+ binutils allows it. */
	+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
	+#define VBPERMQ(t,a,b) .long (0x1000054c \
	+ | ((t)<<(32-11)) \
	+ | ((a)<<(32-16)) \
	+ | ((b)<<(32-21)) )
	+
	+/* int [r3] strlen (char *s [r3])  */
	+
	+/* TODO: change this to .machine power8 when the minimum required binutils
	+ allows it. */
	+ .machine power7
	+EALIGN (strlen, 4, 0)
	+ CALL_MCOUNT 1
	+ dcbt 0,r3
	+ clrrdi r4,r3,3 /* Align the address to doubleword boundary. */
	+ rlwinm r6,r3,3,26,28 /* Calculate padding. */
	+ li r0,0 /* Doubleword with null chars to use
	+ with cmpb. */
	+ li r5,-1 /* MASK = 0xffffffffffffffff. */
	+ ld r12,0(r4) /* Load doubleword from memory. */
	+#ifdef __LITTLE_ENDIAN__
	+ sld r5,r5,r6
	+#else
	+ srd r5,r5,r6 /* MASK = MASK >> padding. */
	+#endif
	+ orc r9,r12,r5 /* Mask bits that are not part of the string. */
	+ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
	+ cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
	+ bne cr7,L(done)
	+
	+ /* For shorter strings (< 64 bytes), we will not use vector registers,
	+ as the overhead isn't worth it. So, let's use GPRs instead. This
	+ will be done the same way as we do in the POWER7 implementation.
	+ Let's see if we are aligned to a quadword boundary. If so, we can
	+ jump to the first (non-vectorized) loop. Otherwise, we have to
	+ handle the next DWORD first. */
	+ mtcrf 0x01,r4
	+ mr r9,r4
	+ addi r9,r9,8
	+ bt 28,L(align64)
	+
	+ /* Handle the next 8 bytes so we are aligned to a quadword
	+ boundary. */
	+ ldu r5,8(r4)
	+ cmpb r10,r5,r0
	+ cmpdi cr7,r10,0
	+ addi r9,r9,8
	+ bne cr7,L(done)
	+
	+L(align64):
	+ /* Proceed to the old (POWER7) implementation, checking two doublewords
	+ per iteration. For the first 56 bytes, we will just check for null
	+ characters. After that, we will also check if we are 64-byte aligned
	+ so we can jump to the vectorized implementation. We will unroll
	+ these loops to avoid excessive branching. */
	+ ld r6,8(r4)
	+ ldu r5,16(r4)
	+ cmpb r10,r6,r0
	+ cmpb r11,r5,r0
	+ or r5,r10,r11
	+ cmpdi cr7,r5,0
	+ addi r9,r9,16
	+ bne cr7,L(dword_zero)
	+
	+ ld r6,8(r4)
	+ ldu r5,16(r4)
	+ cmpb r10,r6,r0
	+ cmpb r11,r5,r0
	+ or r5,r10,r11
	+ cmpdi cr7,r5,0
	+ addi r9,r9,16
	+ bne cr7,L(dword_zero)
	+
	+ ld r6,8(r4)
	+ ldu r5,16(r4)
	+ cmpb r10,r6,r0
	+ cmpb r11,r5,r0
	+ or r5,r10,r11
	+ cmpdi cr7,r5,0
	+ addi r9,r9,16
	+ bne cr7,L(dword_zero)
	+
	+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
	+ Note: aligning to 64-byte will necessarily slow down performance for
	+ strings around 64 bytes in length due to the extra comparisons
	+ required to check alignment for the vectorized loop. This is a
	+ necessary tradeoff we are willing to take in order to speed up the
	+ calculation for larger strings. */
	+ andi. r10,r9,63
	+ beq cr0,L(preloop)
	+ ld r6,8(r4)
	+ ldu r5,16(r4)
	+ cmpb r10,r6,r0
	+ cmpb r11,r5,r0
	+ or r5,r10,r11
	+ cmpdi cr7,r5,0
	+ addi r9,r9,16
	+ bne cr7,L(dword_zero)
	+
	+ andi. r10,r9,63
	+ beq cr0,L(preloop)
	+ ld r6,8(r4)
	+ ldu r5,16(r4)
	+ cmpb r10,r6,r0
	+ cmpb r11,r5,r0
	+ or r5,r10,r11
	+ cmpdi cr7,r5,0
	+ addi r9,r9,16
	+ bne cr7,L(dword_zero)
	+
	+ andi. r10,r9,63
	+ beq cr0,L(preloop)
	+ ld r6,8(r4)
	+ ldu r5,16(r4)
	+ cmpb r10,r6,r0
	+ cmpb r11,r5,r0
	+ or r5,r10,r11
	+ cmpdi cr7,r5,0
	+ addi r9,r9,16
	+ bne cr7,L(dword_zero)
	+
	+ andi. r10,r9,63
	+ beq cr0,L(preloop)
	+ ld r6,8(r4)
	+ ldu r5,16(r4)
	+ cmpb r10,r6,r0
	+ cmpb r11,r5,r0
	+ or r5,r10,r11
	+ cmpdi cr7,r5,0
	+ addi r9,r9,16
	+
	+ /* At this point, we are necessarily 64-byte aligned. If no zeroes were
	+ found, jump to the vectorized loop. */
	+ beq cr7,L(preloop)
	+
	+L(dword_zero):
	+ /* OK, one (or both) of the doublewords contains a null byte. Check
	+ the first doubleword and decrement the address in case the first
	+ doubleword really contains a null byte. */
	+
	+ cmpdi cr6,r10,0
	+ addi r4,r4,-8
	+ bne cr6,L(done)
	+
	+ /* The null byte must be in the second doubleword. Adjust the address
	+ again and move the result of cmpb to r10 so we can calculate the
	+ length. */
	+
	+ mr r10,r11
	+ addi r4,r4,8
	+
	+ /* If the null byte was found in the non-vectorized code, compute the
	+ final length. r10 has the output of the cmpb instruction, that is,
	+ it contains 0xff in the same position as the null byte in the
	+ original doubleword from the string. Use that to calculate the
	+ length. */
	+L(done):
	+#ifdef __LITTLE_ENDIAN__
	+ addi r9, r10,-1 /* Form a mask from trailing zeros. */
	+ andc r9, r9,r10
	+ popcntd r0, r9 /* Count the bits in the mask. */
	+#else
	+ cntlzd r0,r10 /* Count leading zeros before the match. */
	+#endif
	+ subf r5,r3,r4
	+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
	+ add r3,r5,r0 /* Compute final length. */
	+ blr
	+
	+ /* Vectorized implementation starts here. */
	+ .p2align 4
	+L(preloop):
	+ /* Set up for the loop. */
	+ mr r4,r9
	+ li r7, 16 /* Load required offsets. */
	+ li r8, 32
	+ li r9, 48
	+ li r12, 8
	+ vxor v0,v0,v0 /* VR with null chars to use with
	+ vcmpequb. */
	+
	+ /* Main loop to look for the end of the string. We will read in
	+ 64-byte chunks. Align it to 32 bytes and unroll it 3 times to
	+ leverage the icache performance. */
	+ .p2align 5
	+L(loop):
	+ lvx v1,r4,r0 /* Load 4 quadwords. */
	+ lvx v2,r4,r7
	+ lvx v3,r4,r8
	+ lvx v4,r4,r9
	+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
	+ vminub v6,v3,v4
	+ vminub v7,v5,v6
	+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
	+ addi r4,r4,64 /* Adjust address for the next iteration. */
	+ bne cr6,L(vmx_zero)
	+
	+ lvx v1,r4,r0 /* Load 4 quadwords. */
	+ lvx v2,r4,r7
	+ lvx v3,r4,r8
	+ lvx v4,r4,r9
	+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
	+ vminub v6,v3,v4
	+ vminub v7,v5,v6
	+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
	+ addi r4,r4,64 /* Adjust address for the next iteration. */
	+ bne cr6,L(vmx_zero)
	+
	+ lvx v1,r4,r0 /* Load 4 quadwords. */
	+ lvx v2,r4,r7
	+ lvx v3,r4,r8
	+ lvx v4,r4,r9
	+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
	+ vminub v6,v3,v4
	+ vminub v7,v5,v6
	+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
	+ addi r4,r4,64 /* Adjust address for the next iteration. */
	+ beq cr6,L(loop)
	+
	+L(vmx_zero):
	+ /* OK, we found a null byte. Let's look for it in the current 64-byte
	+ block and mark it in its corresponding VR. */
	+ vcmpequb v1,v1,v0
	+ vcmpequb v2,v2,v0
	+ vcmpequb v3,v3,v0
	+ vcmpequb v4,v4,v0
	+
	+ /* We will now 'compress' the result into a single doubleword, so it
	+ can be moved to a GPR for the final calculation. First, we
	+ generate an appropriate mask for vbpermq, so we can permute bits into
	+ the first halfword. */
	+ vspltisb v10,3
	+ lvsl v11,r0,r0
	+ vslb v10,v11,v10
	+
	+ /* Permute the first bit of each byte into bits 48-63. */
	+ VBPERMQ(v1,v1,v10)
	+ VBPERMQ(v2,v2,v10)
	+ VBPERMQ(v3,v3,v10)
	+ VBPERMQ(v4,v4,v10)
	+
	+ /* Shift each component into its correct position for merging. */
	+#ifdef __LITTLE_ENDIAN__
	+ vsldoi v2,v2,v2,2
	+ vsldoi v3,v3,v3,4
	+ vsldoi v4,v4,v4,6
	+#else
	+ vsldoi v1,v1,v1,6
	+ vsldoi v2,v2,v2,4
	+ vsldoi v3,v3,v3,2
	+#endif
	+
	+ /* Merge the results and move to a GPR. */
	+ vor v1,v2,v1
	+ vor v2,v3,v4
	+ vor v4,v1,v2
	+ MFVRD(r10,v4)
	+
	+ /* Adjust address to the beginning of the current 64-byte block. */
	+ addi r4,r4,-64
	+
	+#ifdef __LITTLE_ENDIAN__
	+ addi r9, r10,-1 /* Form a mask from trailing zeros. */
	+ andc r9, r9,r10
	+ popcntd r0, r9 /* Count the bits in the mask. */
	+#else
	+ cntlzd r0,r10 /* Count leading zeros before the match. */
	+#endif
	+ subf r5,r3,r4
	+ add r3,r5,r0 /* Compute final length. */
	+ blr
	+
	+END (strlen)
	+libc_hidden_builtin_def (strlen)
	--
	2.1.0

rpms / glibc

Source Code

Files