From 0d3555b9b4d5cefe116c32bfa38ac70f1d6c25cb Mon Sep 17 00:00:00 2001
From: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Date: Wed, 11 Nov 2015 17:31:28 -0200
Subject: [PATCH] powerpc: Optimization for strlen for POWER8.
This implementation takes advantage of vectorization in the main loop to
improve performance over the current POWER7 strlen implementation.
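Roughly, the new main loop is equivalent to the following C sketch using
<altivec.h> intrinsics (illustrative only: the names p, q0..q3, m and
zero are invented here, and the real code is hand-written assembly):

    /* Scan 64 bytes per iteration.  vec_min of unsigned bytes has a
       zero lane iff one of its inputs does, so one compare against an
       all-zeros vector covers all four quadwords.  */
    const unsigned char *p = start_of_aligned_block;
    const vector unsigned char zero = (vector unsigned char) {0};
    while (1)
      {
        vector unsigned char q0 = vec_ld (0, p);
        vector unsigned char q1 = vec_ld (16, p);
        vector unsigned char q2 = vec_ld (32, p);
        vector unsigned char q3 = vec_ld (48, p);
        vector unsigned char m = vec_min (vec_min (q0, q1),
                                          vec_min (q2, q3));
        if (vec_any_eq (m, zero))
          break;  /* The terminating null is in this 64-byte block.  */
        p += 64;
      }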
(cherry picked from commit 1b045ee53e0b8bed75745b931b33f27d21c9ed22)
---
ChangeLog | 13 +
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 2 +
.../powerpc/powerpc64/multiarch/strlen-power8.S | 39 +++
sysdeps/powerpc/powerpc64/multiarch/strlen.c | 9 +-
sysdeps/powerpc/powerpc64/power8/strlen.S | 297 +++++++++++++++++++++
6 files changed, 358 insertions(+), 4 deletions(-)
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
create mode 100644 sysdeps/powerpc/powerpc64/power8/strlen.S
diff --git a/ChangeLog b/ChangeLog
index f030b68..e7ea58a 100644
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 7ed56bf..57abe8f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -20,7 +20,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strncpy-power8 strncpy-power7 strncpy-ppc64 \
strncat-power7 \
strstr-power7 strstr-ppc64 \
- strspn-power8 strspn-ppc64 \
+ strspn-power8 strspn-ppc64 strlen-power8 \
rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \
strncase-power7 strncase_l-power7 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index f6c70ba..583885c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -101,6 +101,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */
IFUNC_IMPL (i, name, strlen,
+ IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strlen_power8)
IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
__strlen_power7)
IFUNC_IMPL_ADD (array, i, strlen, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
new file mode 100644
index 0000000..686dc3d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S
@@ -0,0 +1,39 @@
+/* Optimized strlen implementation for POWER8.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__strlen_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__strlen_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__strlen_power8)
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strlen_power8) \
+ END_2(__strlen_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
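+/* The empty redefinition above keeps the included implementation from
+   also defining the hidden builtin alias for strlen; the ifunc wrapper
+   in strlen.c provides the exported symbol instead.  */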
+
+#include <sysdeps/powerpc/powerpc64/power8/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
index 79a53d9..4b400a5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -29,11 +29,14 @@ extern __typeof (__redirect_strlen) __libc_strlen;
extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
libc_ifunc (__libc_strlen,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strlen_power7
- : __strlen_ppc);
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strlen_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strlen_power7
+ : __strlen_ppc);
#undef strlen
strong_alias (__libc_strlen, strlen)
diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S
new file mode 100644
index 0000000..0142747
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strlen.S
@@ -0,0 +1,297 @@
+/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
+ loop.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* TODO: change these to the actual instructions when the minimum required
+ binutils allows it. */
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+ | ((t)<<(32-11)) \
+ | ((a)<<(32-16)) \
+ | ((b)<<(32-21)) )
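+/* The shift counts above place the operands in the standard register
+   fields of the 32-bit instruction encodings.  */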
+
+/* int [r3] strlen (char *s [r3]) */
+
+/* TODO: change this to .machine power8 when the minimum required binutils
+ allows it. */
+ .machine power7
+EALIGN (strlen, 4, 0)
+ CALL_MCOUNT 1
+ dcbt 0,r3
+ clrrdi r4,r3,3 /* Align the address to doubleword boundary. */
+ rlwinm r6,r3,3,26,28 /* Calculate padding: (addr & 7) * 8 bits. */
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+ li r5,-1 /* MASK = 0xffffffffffffffff. */
+ ld r12,0(r4) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r5,r5,r6
+#else
+ srd r5,r5,r6 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r12,r5 /* Force the bytes before the string start to
+ 0xff so cmpb cannot match them. */
+ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r10,0 /* If r10 == 0, no nulls were found. */
+ bne cr7,L(done)
+
+ /* For shorter strings (< 64 bytes), the overhead of using vector
+ registers isn't worth it, so use GPRs instead, just as the POWER7
+ implementation does. Check whether we are aligned to a quadword
+ boundary: if so, jump to the first (non-vectorized) loop;
+ otherwise, handle the next doubleword first. */
+ mtcrf 0x01,r4 /* Copy the low nibble of the address to CR7. */
+ mr r9,r4
+ addi r9,r9,8
+ bt 28,L(align64) /* Address bit 60 set: the next doubleword is
+ already quadword aligned. */
+
+ /* Handle the next 8 bytes so we are aligned to a quadword
+ boundary. */
+ ldu r5,8(r4)
+ cmpb r10,r5,r0
+ cmpdi cr7,r10,0
+ addi r9,r9,8
+ bne cr7,L(done)
+
+L(align64):
+ /* Proceed to the old (POWER7) implementation, checking two doublewords
+ per iteration. For the first 56 bytes, we just check for null
+ characters. After that, we also check whether we are 64-byte aligned
+ so we can jump to the vectorized implementation. These loops are
+ unrolled to avoid excessive branching. */
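+ /* Each unrolled block below is, roughly, in C:
+ dw1 = p[1]; dw2 = p[2]; p += 2; candidate_end += 16;
+ if (cmpb (dw1, 0) | cmpb (dw2, 0)) goto dword_zero;
+ where p is a doubleword pointer (ld/ldu) and cmpb sets a 0xff
+ byte wherever the input has a null byte. */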
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+ Note: aligning to a 64-byte boundary necessarily slows down strings
+ of around 64 bytes in length because of the extra comparisons
+ required to check alignment for the vectorized loop. This is a
+ tradeoff we accept in order to speed up the calculation for
+ larger strings. */
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+ bne cr7,L(dword_zero)
+
+ andi. r10,r9,63
+ beq cr0,L(preloop)
+ ld r6,8(r4)
+ ldu r5,16(r4)
+ cmpb r10,r6,r0
+ cmpb r11,r5,r0
+ or r5,r10,r11
+ cmpdi cr7,r5,0
+ addi r9,r9,16
+
+ /* At this point, we are necessarily 64-byte aligned. If no zeroes were
+ found, jump to the vectorized loop. */
+ beq cr7,L(preloop)
+
+L(dword_zero):
+ /* One (or both) of the doublewords contains a null byte. Decrement
+ the address so it points to the first doubleword and check whether
+ that one contains the null byte. */
+
+ cmpdi cr6,r10,0
+ addi r4,r4,-8
+ bne cr6,L(done)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ length. */
+
+ mr r10,r11
+ addi r4,r4,8
+
+ /* If the null byte was found in the non-vectorized code, compute the
+ final length. r10 has the output of the cmpb instruction, that is,
+ it contains 0xff in the same position as the null byte in the
+ original doubleword from the string. Use that to calculate the
+ length. */
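+ /* Worked example (big endian): for the doubleword 0x4142004344454647,
+ cmpb yields r10 = 0x0000ff0000000000; cntlzd returns 16, and
+ 16 >> 3 = 2, the byte offset of the null within the doubleword. */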
+L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10,-1 /* Form a mask from trailing zeros. */
+ andc r9, r9,r10
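+ /* (r10 - 1) & ~r10 has ones exactly below the lowest
+ set bit, so the popcount below equals the number of
+ trailing zeros. */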
+ popcntd r0, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r4
+ srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r5,r0 /* Compute final length. */
+ blr
+
+ /* Vectorized implementation starts here. */
+ .p2align 4
+L(preloop):
+ /* Set up for the loop. */
+ mr r4,r9
+ li r7, 16 /* Load required offsets. */
+ li r8, 32
+ li r9, 48
+ li r12, 8
+ vxor v0,v0,v0 /* VR with null chars to use with
+ vcmpequb. */
+
+ /* Main loop to look for the end of the string. We read in
+ 64-byte chunks. The loop is aligned to 32 bytes and unrolled
+ 3 times to make good use of the icache. */
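+ /* A zero byte in any input of vminub forces a zero byte in the
+ result, so a single vcmpequb. on the minimum of all four
+ quadwords checks 64 bytes at once. */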
+ .p2align 5
+L(loop):
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ bne cr6,L(vmx_zero)
+
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ bne cr6,L(vmx_zero)
+
+ lvx v1,r4,r0 /* Load 4 quadwords. */
+ lvx v2,r4,r7
+ lvx v3,r4,r8
+ lvx v4,r4,r9
+ vminub v5,v1,v2 /* Compare and merge into one VR for speed. */
+ vminub v6,v3,v4
+ vminub v7,v5,v6
+ vcmpequb. v7,v7,v0 /* Check for NULLs. */
+ addi r4,r4,64 /* Adjust address for the next iteration. */
+ beq cr6,L(loop)
+
+L(vmx_zero):
+ /* OK, we found a null byte. Let's look for it in the current 64-byte
+ block and mark it in its corresponding VR. */
+ vcmpequb v1,v1,v0
+ vcmpequb v2,v2,v0
+ vcmpequb v3,v3,v0
+ vcmpequb v4,v4,v0
+
+ /* We will now 'compress' the result into a single doubleword, so it
+ can be moved to a GPR for the final calculation. First, we
+ generate an appropriate mask for vbpermq, so we can permute bits into
+ the first halfword. */
+ vspltisb v10,3
+ lvsl v11,r0,r0
+ vslb v10,v11,v10
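+ /* lvsl with zero arguments yields {0, 1, ..., 15}; shifting each
+ byte left by 3 gives {0, 8, ..., 120}, the bit index of the
+ first bit of each byte, which is the index layout vbpermq
+ expects. */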
+
+ /* Permute the first bit of each byte into bits 48-63. */
+ VBPERMQ(v1,v1,v10)
+ VBPERMQ(v2,v2,v10)
+ VBPERMQ(v3,v3,v10)
+ VBPERMQ(v4,v4,v10)
+
+ /* Shift each component into its correct position for merging. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v2,v2,v2,2
+ vsldoi v3,v3,v3,4
+ vsldoi v4,v4,v4,6
+#else
+ vsldoi v1,v1,v1,6
+ vsldoi v2,v2,v2,4
+ vsldoi v3,v3,v3,2
+#endif
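+ /* Each vbpermq result holds a 16-bit mask in bits 48-63 of its
+ VR; the shifts above spread the four masks over disjoint 16-bit
+ fields so a plain OR merges them into one 64-bit mask. */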
+
+ /* Merge the results and move to a GPR. */
+ vor v1,v2,v1
+ vor v2,v3,v4
+ vor v4,v1,v2
+ MFVRD(r10,v4)
+
+ /* Adjust address to the beginning of the current 64-byte block. */
+ addi r4,r4,-64
+
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10,-1 /* Form a mask from trailing zeros. */
+ andc r9, r9,r10
+ popcntd r0, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r0,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r3,r4
+ add r3,r5,r0 /* Compute final length. */
+ blr
+
+END (strlen)
+libc_hidden_builtin_def (strlen)
--
2.1.0