| From 0d3555b9b4d5cefe116c32bfa38ac70f1d6c25cb Mon Sep 17 00:00:00 2001 |
| From: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com> |
| Date: Wed, 11 Nov 2015 17:31:28 -0200 |
| Subject: [PATCH] powerpc: Optimization for strlen for POWER8. |
| |
| This implementation takes advantage of vectorization to improve performance of |
| the loop over the current strlen implementation for POWER7. |
| |
| (cherry picked from commit 1b045ee53e0b8bed75745b931b33f27d21c9ed22) |
| |
| ChangeLog | 13 + |
| sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +- |
| .../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 2 + |
| .../powerpc/powerpc64/multiarch/strlen-power8.S | 39 +++ |
| sysdeps/powerpc/powerpc64/multiarch/strlen.c | 9 +- |
| sysdeps/powerpc/powerpc64/power8/strlen.S | 297 +++++++++++++++++++++ |
| 6 files changed, 358 insertions(+), 4 deletions(-) |
| create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S |
| create mode 100644 sysdeps/powerpc/powerpc64/power8/strlen.S |
| |
| diff --git a/ChangeLog b/ChangeLog |
| index f030b68..e7ea58a 100644 |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile |
| index 7ed56bf..57abe8f 100644 |
| |
| |
| @@ -20,7 +20,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ |
| strncpy-power8 strncpy-power7 strncpy-ppc64 \ |
| strncat-power7 \ |
| strstr-power7 strstr-ppc64 \ |
| - strspn-power8 strspn-ppc64 \ |
| + strspn-power8 strspn-ppc64 strlen-power8 \ |
| rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \ |
| strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \ |
| strncase-power7 strncase_l-power7 \ |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c |
| index f6c70ba..583885c 100644 |
| |
| |
| @@ -101,6 +101,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| |
| /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */ |
| IFUNC_IMPL (i, name, strlen, |
| + IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07, |
| + __strlen_power8) |
| IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX, |
| __strlen_power7) |
| IFUNC_IMPL_ADD (array, i, strlen, 1, |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S |
| new file mode 100644 |
| index 0000000..686dc3d |
| |
| |
| @@ -0,0 +1,39 @@ |
| +/* Optimized strlen implementation for POWER8. |
| + Copyright (C) 2016 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <http://www.gnu.org/licenses/>. */ |
| + |
| +#include <sysdep.h> |
| + |
| +#undef EALIGN /* Replace the earlier EALIGN: NAME is ignored, the entry symbol is always __strlen_power8. */ |
| +#define EALIGN(name, alignt, words) \ |
| + .section ".text"; \ |
| + ENTRY_2(__strlen_power8) \ |
| + .align ALIGNARG(alignt); \ |
| + EALIGN_W_##words; \ |
| + BODY_LABEL(__strlen_power8): \ |
| + cfi_startproc; \ |
| + LOCALENTRY(__strlen_power8) |
| +#undef END /* Replace the earlier END as well: NAME is ignored, it always closes __strlen_power8. */ |
| +#define END(name) \ |
| + cfi_endproc; \ |
| + TRACEBACK(__strlen_power8) \ |
| + END_2(__strlen_power8) |
| + |
| +#undef libc_hidden_builtin_def /* Make libc_hidden_builtin_def (strlen) in the included file expand to nothing. */ |
| +#define libc_hidden_builtin_def(name) |
| + |
| +#include <sysdeps/powerpc/powerpc64/power8/strlen.S> |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c |
| index 79a53d9..4b400a5 100644 |
| |
| |
| @@ -29,11 +29,14 @@ extern __typeof (__redirect_strlen) __libc_strlen; |
| |
| extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden; |
| extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden; |
| +extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden; |
| |
| libc_ifunc (__libc_strlen, |
| - (hwcap & PPC_FEATURE_HAS_VSX) |
| - ? __strlen_power7 |
| - : __strlen_ppc); |
| + (hwcap2 & PPC_FEATURE2_ARCH_2_07) |
| + ? __strlen_power8 : |
| + (hwcap & PPC_FEATURE_HAS_VSX) |
| + ? __strlen_power7 |
| + : __strlen_ppc); |
| |
| #undef strlen |
| strong_alias (__libc_strlen, strlen) |
| diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S |
| new file mode 100644 |
| index 0000000..0142747 |
| |
| |
| @@ -0,0 +1,297 @@ |
| +/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized |
| + loop. |
| + Copyright (C) 2016 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <http://www.gnu.org/licenses/>. */ |
| + |
| +#include <sysdep.h> |
| + |
| +/* TODO: change these to the actual instructions when the minimum required |
| + binutils allows it. */ |
| +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) /* Hand-encoded mfvrd: move from vector register V to GPR R. */ |
| +#define VBPERMQ(t,a,b) .long (0x1000054c \ |
| + | ((t)<<(32-11)) \ |
| + | ((a)<<(32-16)) \ |
| + | ((b)<<(32-21)) ) /* Hand-encoded vbpermq T,A,B. */ |
| + |
| +/* size_t [r3] strlen (const char *s [r3]) */ |
| + |
| +/* TODO: change this to .machine power8 when the minimum required binutils |
| + allows it. */ |
| + .machine power7 |
| +EALIGN (strlen, 4, 0) |
| + CALL_MCOUNT 1 |
| + dcbt 0,r3 /* Touch (prefetch) the cache block holding the start of the string. */ |
| + clrrdi r4,r3,3 /* Align the address to doubleword boundary. */ |
| + rlwinm r6,r3,3,26,28 /* Calculate padding. */ |
| + li r0,0 /* Doubleword with null chars to use |
| + with cmpb. */ |
| + li r5,-1 /* MASK = 0xffffffffffffffff. */ |
| + ld r12,0(r4) /* Load doubleword from memory. */ |
| +#ifdef __LITTLE_ENDIAN__ |
| + sld r5,r5,r6 |
| +#else |
| + srd r5,r5,r6 /* MASK = MASK >> padding. */ |
| +#endif |
| + orc r9,r12,r5 /* Mask bits that are not part of the string. */ |
| + cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ |
| + cmpdi cr7,r10,0 /* If r10 == 0, no nulls have been found. */ |
| + bne cr7,L(done) |
| + |
| + /* For shorter strings (< 64 bytes), we will not use vector registers, |
| + as the overhead isn't worth it. So, let's use GPRs instead. This |
| + will be done the same way as we do in the POWER7 implementation. |
| + Let's see if we are aligned to a quadword boundary. If so, we can |
| + jump to the first (non-vectorized) loop. Otherwise, we have to |
| + handle the next DWORD first. */ |
| + mtcrf 0x01,r4 /* Copy the low 4 bits of the aligned address into cr7. */ |
| + mr r9,r4 |
| + addi r9,r9,8 /* r9 = address of the next doubleword to check. */ |
| + bt 28,L(align64) /* Address bit with value 8 is set, so r4 + 8 is already quadword aligned. */ |
| + |
| + /* Handle the next 8 bytes so we are aligned to a quadword |
| + boundary. */ |
| + ldu r5,8(r4) |
| + cmpb r10,r5,r0 |
| + cmpdi cr7,r10,0 |
| + addi r9,r9,8 |
| + bne cr7,L(done) |
| + |
| +L(align64): |
| + /* Proceed to the old (POWER7) implementation, checking two doublewords |
| + per iteration. For the first 56 bytes, we will just check for null |
| + characters. After that, we will also check if we are 64-byte aligned |
| + so we can jump to the vectorized implementation. We will unroll |
| + these loops to avoid excessive branching. */ |
| + ld r6,8(r4) |
| + ldu r5,16(r4) |
| + cmpb r10,r6,r0 |
| + cmpb r11,r5,r0 |
| + or r5,r10,r11 |
| + cmpdi cr7,r5,0 |
| + addi r9,r9,16 |
| + bne cr7,L(dword_zero) |
| + |
| + ld r6,8(r4) |
| + ldu r5,16(r4) |
| + cmpb r10,r6,r0 |
| + cmpb r11,r5,r0 |
| + or r5,r10,r11 |
| + cmpdi cr7,r5,0 |
| + addi r9,r9,16 |
| + bne cr7,L(dword_zero) |
| + |
| + ld r6,8(r4) |
| + ldu r5,16(r4) |
| + cmpb r10,r6,r0 |
| + cmpb r11,r5,r0 |
| + or r5,r10,r11 |
| + cmpdi cr7,r5,0 |
| + addi r9,r9,16 |
| + bne cr7,L(dword_zero) |
| + |
| + /* Are we 64-byte aligned? If so, jump to the vectorized loop. |
| + Note: aligning to 64-byte will necessarily slow down performance for |
| + strings around 64 bytes in length due to the extra comparisons |
| + required to check alignment for the vectorized loop. This is a |
| + necessary tradeoff we are willing to take in order to speed up the |
| + calculation for larger strings. */ |
| + andi. r10,r9,63 |
| + beq cr0,L(preloop) |
| + ld r6,8(r4) |
| + ldu r5,16(r4) |
| + cmpb r10,r6,r0 |
| + cmpb r11,r5,r0 |
| + or r5,r10,r11 |
| + cmpdi cr7,r5,0 |
| + addi r9,r9,16 |
| + bne cr7,L(dword_zero) |
| + |
| + andi. r10,r9,63 |
| + beq cr0,L(preloop) |
| + ld r6,8(r4) |
| + ldu r5,16(r4) |
| + cmpb r10,r6,r0 |
| + cmpb r11,r5,r0 |
| + or r5,r10,r11 |
| + cmpdi cr7,r5,0 |
| + addi r9,r9,16 |
| + bne cr7,L(dword_zero) |
| + |
| + andi. r10,r9,63 |
| + beq cr0,L(preloop) |
| + ld r6,8(r4) |
| + ldu r5,16(r4) |
| + cmpb r10,r6,r0 |
| + cmpb r11,r5,r0 |
| + or r5,r10,r11 |
| + cmpdi cr7,r5,0 |
| + addi r9,r9,16 |
| + bne cr7,L(dword_zero) |
| + |
| + andi. r10,r9,63 |
| + beq cr0,L(preloop) |
| + ld r6,8(r4) |
| + ldu r5,16(r4) |
| + cmpb r10,r6,r0 |
| + cmpb r11,r5,r0 |
| + or r5,r10,r11 |
| + cmpdi cr7,r5,0 |
| + addi r9,r9,16 |
| + |
| + /* At this point, we are necessarily 64-byte aligned. If no zeroes were |
| + found, jump to the vectorized loop. */ |
| + beq cr7,L(preloop) |
| + |
| +L(dword_zero): |
| + /* OK, one (or both) of the doublewords contains a null byte. Check |
| + the first doubleword and decrement the address in case the first |
| + doubleword really contains a null byte. */ |
| + |
| + cmpdi cr6,r10,0 |
| + addi r4,r4,-8 |
| + bne cr6,L(done) |
| + |
| + /* The null byte must be in the second doubleword. Adjust the address |
| + again and move the result of cmpb to r10 so we can calculate the |
| + length. */ |
| + |
| + mr r10,r11 |
| + addi r4,r4,8 |
| + |
| + /* If the null byte was found in the non-vectorized code, compute the |
| + final length. r10 has the output of the cmpb instruction, that is, |
| + it contains 0xff in the same position as the null byte in the |
| + original doubleword from the string. Use that to calculate the |
| + length. */ |
| +L(done): |
| +#ifdef __LITTLE_ENDIAN__ |
| + addi r9, r10,-1 /* Form a mask from trailing zeros. */ |
| + andc r9, r9,r10 |
| + popcntd r0, r9 /* Count the bits in the mask. */ |
| +#else |
| + cntlzd r0,r10 /* Count leading zeros before the match. */ |
| +#endif |
| + subf r5,r3,r4 /* r5 = r4 - r3: offset of this doubleword from s. */ |
| + srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ |
| + add r3,r5,r0 /* Compute final length. */ |
| + blr |
| + |
| + /* Vectorized implementation starts here. */ |
| + .p2align 4 |
| +L(preloop): |
| + /* Set up for the loop. */ |
| + mr r4,r9 /* r4 = r9, the next unchecked address. */ |
| + li r7, 16 /* Load required offsets. */ |
| + li r8, 32 |
| + li r9, 48 |
| + li r12, 8 |
| + vxor v0,v0,v0 /* VR with null chars to use with |
| + vcmpequb. */ |
| + |
| + /* Main loop to look for the end of the string. We will read in |
| + 64-byte chunks. Align it to 32 bytes and unroll it 3 times to |
| + leverage the icache performance. */ |
| + .p2align 5 |
| +L(loop): |
| + lvx v1,r4,r0 /* Load 4 quadwords. */ |
| + lvx v2,r4,r7 |
| + lvx v3,r4,r8 |
| + lvx v4,r4,r9 |
| + vminub v5,v1,v2 /* Compare and merge into one VR for speed. */ |
| + vminub v6,v3,v4 |
| + vminub v7,v5,v6 |
| + vcmpequb. v7,v7,v0 /* Check for NULLs. */ |
| + addi r4,r4,64 /* Adjust address for the next iteration. */ |
| + bne cr6,L(vmx_zero) |
| + |
| + lvx v1,r4,r0 /* Load 4 quadwords. */ |
| + lvx v2,r4,r7 |
| + lvx v3,r4,r8 |
| + lvx v4,r4,r9 |
| + vminub v5,v1,v2 /* Compare and merge into one VR for speed. */ |
| + vminub v6,v3,v4 |
| + vminub v7,v5,v6 |
| + vcmpequb. v7,v7,v0 /* Check for NULLs. */ |
| + addi r4,r4,64 /* Adjust address for the next iteration. */ |
| + bne cr6,L(vmx_zero) |
| + |
| + lvx v1,r4,r0 /* Load 4 quadwords. */ |
| + lvx v2,r4,r7 |
| + lvx v3,r4,r8 |
| + lvx v4,r4,r9 |
| + vminub v5,v1,v2 /* Compare and merge into one VR for speed. */ |
| + vminub v6,v3,v4 |
| + vminub v7,v5,v6 |
| + vcmpequb. v7,v7,v0 /* Check for NULLs. */ |
| + addi r4,r4,64 /* Adjust address for the next iteration. */ |
| + beq cr6,L(loop) |
| + |
| +L(vmx_zero): |
| + /* OK, we found a null byte. Let's look for it in the current 64-byte |
| + block and mark it in its corresponding VR. */ |
| + vcmpequb v1,v1,v0 |
| + vcmpequb v2,v2,v0 |
| + vcmpequb v3,v3,v0 |
| + vcmpequb v4,v4,v0 |
| + |
| + /* We will now 'compress' the result into a single doubleword, so it |
| + can be moved to a GPR for the final calculation. First, we |
| + generate an appropriate mask for vbpermq, so we can permute bits into |
| + the first halfword. */ |
| + vspltisb v10,3 |
| + lvsl v11,r0,r0 /* Address 0: v11 = byte indexes 0x00..0x0f. */ |
| + vslb v10,v11,v10 /* Byte index << 3 = index of the first bit of each byte. */ |
| + |
| + /* Permute the first bit of each byte into bits 48-63. */ |
| + VBPERMQ(v1,v1,v10) |
| + VBPERMQ(v2,v2,v10) |
| + VBPERMQ(v3,v3,v10) |
| + VBPERMQ(v4,v4,v10) |
| + |
| + /* Shift each component into its correct position for merging. */ |
| +#ifdef __LITTLE_ENDIAN__ |
| + vsldoi v2,v2,v2,2 |
| + vsldoi v3,v3,v3,4 |
| + vsldoi v4,v4,v4,6 |
| +#else |
| + vsldoi v1,v1,v1,6 |
| + vsldoi v2,v2,v2,4 |
| + vsldoi v3,v3,v3,2 |
| +#endif |
| + |
| + /* Merge the results and move to a GPR. */ |
| + vor v1,v2,v1 |
| + vor v2,v3,v4 |
| + vor v4,v1,v2 |
| + MFVRD(r10,v4) |
| + |
| + /* Adjust address to the beginning of the current 64-byte block. */ |
| + addi r4,r4,-64 |
| + |
| +#ifdef __LITTLE_ENDIAN__ |
| + addi r9, r10,-1 /* Form a mask from trailing zeros. */ |
| + andc r9, r9,r10 |
| + popcntd r0, r9 /* Count the bits in the mask. */ |
| +#else |
| + cntlzd r0,r10 /* Count leading zeros before the match. */ |
| +#endif |
| + subf r5,r3,r4 /* r5 = r4 - r3: offset of this 64-byte block from s. */ |
| + add r3,r5,r0 /* Compute final length. */ |
| + blr |
| + |
| +END (strlen) |
| +libc_hidden_builtin_def (strlen) |
| -- |
| 2.1.0 |
| |