| commit 1a594aa986ffe28657a03baa5c53c0a0e7dc2ecd |
| Author: Matheus Castanho <msc@linux.ibm.com> |
| Date: Tue May 11 17:53:07 2021 -0300 |
| |
| powerpc: Add optimized rawmemchr for POWER10 |
| |
| Reuse code for optimized strlen to implement a faster version of rawmemchr. |
| This takes advantage of the same benefits provided by the strlen implementation, |
| but needs some extra steps. __strlen_power10 code should be unchanged after this |
| change. |
| |
| rawmemchr returns a pointer to the char found, while strlen returns only the |
| length, so we have to take that into account when preparing the return value. |
| |
| To quickly check 64B, the loop on __strlen_power10 merges the whole block into |
| 16B by using unsigned minimum vector operations (vminub) and checks if there are |
| any \0 on the resulting vector. The same code is used by rawmemchr if the char c |
| is 0. However, this approach does not work when c != 0. We first need to |
| subtract each byte by c, so that the value we are looking for is converted to a |
| 0, then taking the minimum and checking for nulls works again. |
| |
| The new code branches after it has compared ~256 bytes and chooses which of the |
| two strategies above will be used in the main loop, based on the char c. This |
| extra branch adds some overhead (~5%) for length ~256, but is quickly amortized |
| by the faster loop for larger sizes. |
| |
| Compared to __rawmemchr_power9, this version is ~20% faster for length < 256. |
| Because of the optimized main loop, the improvement becomes ~35% for c != 0 |
| and ~50% for c = 0 for strings longer than 256. |
| |
| Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com> |
| Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com> |
| |
| diff --git a/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S |
| new file mode 100644 |
| index 0000000000000000..5351c2634f6086bf |
| |
| |
| @@ -0,0 +1,22 @@ |
| +/* Optimized rawmemchr implementation for POWER10 LE. |
| + Copyright (C) 2021 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <https://www.gnu.org/licenses/>. */ |
| + |
| +#include <sysdep.h> |
| + |
| +#define USE_AS_RAWMEMCHR 1 |
| +#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S> |
| diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S |
| index ca7e9eb3d84c9b00..dda5282f1b9a07cf 100644 |
| |
| |
| @@ -18,10 +18,50 @@ |
| |
| #include <sysdep.h> |
| |
| -#ifndef STRLEN |
| -# define STRLEN __strlen |
| -# define DEFINE_STRLEN_HIDDEN_DEF 1 |
| -#endif |
| +/* To reuse the code for rawmemchr, we have some extra steps compared to the |
| + strlen implementation: |
| + - Sum the initial value of r3 with the position at which the char was |
| + found, to guarantee we return a pointer and not the length. |
| + - In the main loop, subtract each byte by the char we are looking for, |
| + so we can keep using vminub to quickly check 64B at once. */ |
| +#ifdef USE_AS_RAWMEMCHR |
| +# ifndef RAWMEMCHR |
| +# define FUNCNAME __rawmemchr |
| +# else |
| +# define FUNCNAME RAWMEMCHR |
| +# endif |
| +# define MCOUNT_NARGS 2 |
| +# define VREG_ZERO v20 |
| +# define OFF_START_LOOP 256 |
| +# define RAWMEMCHR_SUBTRACT_VECTORS \ |
| + vsububm v4,v4,v18; \ |
| + vsububm v5,v5,v18; \ |
| + vsububm v6,v6,v18; \ |
| + vsububm v7,v7,v18; |
| +# define TAIL(vreg,increment) \ |
| + vctzlsbb r4,vreg; \ |
| + addi r4,r4,increment; \ |
| + add r3,r5,r4; \ |
| + blr |
| + |
| +#else /* strlen */ |
| + |
| +# ifndef STRLEN |
| +# define FUNCNAME __strlen |
| +# define DEFINE_STRLEN_HIDDEN_DEF 1 |
| +# else |
| +# define FUNCNAME STRLEN |
| +# endif |
| +# define MCOUNT_NARGS 1 |
| +# define VREG_ZERO v18 |
| +# define OFF_START_LOOP 192 |
| +# define TAIL(vreg,increment) \ |
| + vctzlsbb r4,vreg; \ |
| + subf r3,r3,r5; \ |
| + addi r4,r4,increment; \ |
| + add r3,r3,r4; \ |
| + blr |
| +#endif /* USE_AS_RAWMEMCHR */ |
| |
| /* TODO: Replace macros by the actual instructions when minimum binutils becomes |
| >= 2.35. This is used to keep compatibility with older versions. */ |
| @@ -50,33 +90,41 @@ |
| li r6,offset; \ |
| LXVP(v4+32,offset,addr); \ |
| LXVP(v6+32,offset+32,addr); \ |
| + RAWMEMCHR_SUBTRACT_VECTORS; \ |
| vminub v14,v4,v5; \ |
| vminub v15,v6,v7; \ |
| vminub v16,v14,v15; \ |
| - vcmpequb. v0,v16,v18; \ |
| + vcmpequb. v0,v16,VREG_ZERO; \ |
| bne cr6,L(label) |
| |
| -#define TAIL(vreg,increment) \ |
| - vctzlsbb r4,vreg; \ |
| - subf r3,r3,r5; \ |
| - addi r4,r4,increment; \ |
| - add r3,r3,r4; \ |
| - blr |
| - |
| /* Implements the function |
| |
| int [r3] strlen (const void *s [r3]) |
| |
| + but when USE_AS_RAWMEMCHR is set, implements the function |
| + |
| + void* [r3] rawmemchr (const void *s [r3], int c [r4]) |
| + |
| The implementation can load bytes past a matching byte, but only |
| up to the next 64B boundary, so it never crosses a page. */ |
| |
| .machine power9 |
| |
| -ENTRY_TOCLESS (STRLEN, 4) |
| - CALL_MCOUNT 1 |
| +ENTRY_TOCLESS (FUNCNAME, 4) |
| + CALL_MCOUNT MCOUNT_NARGS |
| |
| - vspltisb v18,0 |
| +#ifdef USE_AS_RAWMEMCHR |
| + xori r5,r4,0xff |
| + |
| + mtvsrd v18+32,r4 /* matching char in v18 */ |
| + mtvsrd v19+32,r5 /* non matching char in v19 */ |
| + |
| + vspltb v18,v18,7 /* replicate */ |
| + vspltb v19,v19,7 /* replicate */ |
| +#else |
| vspltisb v19,-1 |
| +#endif |
| + vspltisb VREG_ZERO,0 |
| |
| /* Next 16B-aligned address. Prepare address for L(aligned). */ |
| addi r5,r3,16 |
| @@ -90,16 +138,25 @@ ENTRY_TOCLESS (STRLEN, 4) |
| vcmpequb. v6,v0,v18 |
| beq cr6,L(aligned) |
| |
| +#ifdef USE_AS_RAWMEMCHR |
| + vctzlsbb r6,v6 |
| + add r3,r3,r6 |
| +#else |
| vctzlsbb r3,v6 |
| +#endif |
| blr |
| |
| - /* Test next 176B, 16B at a time. The main loop is optimized for longer |
| - strings, so checking the first bytes in 16B chunks benefits a lot |
| - small strings. */ |
| + /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is |
| + optimized for longer strings, so checking the first bytes in 16B |
| + chunks benefits a lot small strings. */ |
| .p2align 5 |
| L(aligned): |
| +#ifdef USE_AS_RAWMEMCHR |
| + cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to |
| + choose how we will perform the main loop. */ |
| +#endif |
| /* Prepare address for the loop. */ |
| - addi r4,r3,192 |
| + addi r4,r3,OFF_START_LOOP |
| clrrdi r4,r4,6 |
| |
| CHECK16(v0,0,r5,tail1) |
| @@ -113,15 +170,43 @@ L(aligned): |
| CHECK16(v8,128,r5,tail9) |
| CHECK16(v9,144,r5,tail10) |
| CHECK16(v10,160,r5,tail11) |
| +#ifdef USE_AS_RAWMEMCHR |
| + CHECK16(v0,176,r5,tail12) |
| + CHECK16(v1,192,r5,tail13) |
| + CHECK16(v2,208,r5,tail14) |
| + CHECK16(v3,224,r5,tail15) |
| +#endif |
| |
| addi r5,r4,128 |
| |
| +#ifdef USE_AS_RAWMEMCHR |
| + /* If c == 0, use the same loop as strlen, without the vsububm. */ |
| + beq cr5,L(loop) |
| + |
| + /* This is very similar to the block after L(loop), the difference is |
| + that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, and we subtract |
| + each byte loaded by the char we are looking for, this way we can keep |
| + using vminub to merge the results and checking for nulls. */ |
| + .p2align 5 |
| +L(rawmemchr_loop): |
| + CHECK64(0,r4,pre_tail_64b) |
| + CHECK64(64,r4,pre_tail_64b) |
| + addi r4,r4,256 |
| + |
| + CHECK64(0,r5,tail_64b) |
| + CHECK64(64,r5,tail_64b) |
| + addi r5,r5,256 |
| + |
| + b L(rawmemchr_loop) |
| +#endif |
| /* Switch to a more aggressive approach checking 64B each time. Use 2 |
| pointers 128B apart and unroll the loop once to make the pointer |
| updates and usages separated enough to avoid stalls waiting for |
| address calculation. */ |
| .p2align 5 |
| L(loop): |
| +#undef RAWMEMCHR_SUBTRACT_VECTORS |
| +#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */ |
| CHECK64(0,r4,pre_tail_64b) |
| CHECK64(64,r4,pre_tail_64b) |
| addi r4,r4,256 |
| @@ -140,10 +225,10 @@ L(tail_64b): |
| block and mark it in its corresponding VR. lxvp vx,0(ry) puts the |
| low 16B bytes into vx+1, and the high into vx, so the order here is |
| v5, v4, v7, v6. */ |
| - vcmpequb v1,v5,v18 |
| - vcmpequb v2,v4,v18 |
| - vcmpequb v3,v7,v18 |
| - vcmpequb v4,v6,v18 |
| + vcmpequb v1,v5,VREG_ZERO |
| + vcmpequb v2,v4,VREG_ZERO |
| + vcmpequb v3,v7,VREG_ZERO |
| + vcmpequb v4,v6,VREG_ZERO |
| |
| /* Take into account the other 64B blocks we had already checked. */ |
| add r5,r5,r6 |
| @@ -165,7 +250,9 @@ L(tail_64b): |
| or r10,r8,r7 |
| |
| cnttzd r0,r10 /* Count trailing zeros before the match. */ |
| +#ifndef USE_AS_RAWMEMCHR |
| subf r5,r3,r5 |
| +#endif |
| add r3,r5,r0 /* Compute final length. */ |
| blr |
| |
| @@ -213,9 +300,32 @@ L(tail10): |
| L(tail11): |
| TAIL(v10,160) |
| |
| -END (STRLEN) |
| +#ifdef USE_AS_RAWMEMCHR |
| + .p2align 5 |
| +L(tail12): |
| + TAIL(v0,176) |
| + |
| + .p2align 5 |
| +L(tail13): |
| + TAIL(v1,192) |
| + |
| + .p2align 5 |
| +L(tail14): |
| + TAIL(v2,208) |
| + |
| + .p2align 5 |
| +L(tail15): |
| + TAIL(v3,224) |
| +#endif |
| + |
| +END (FUNCNAME) |
| |
| -#ifdef DEFINE_STRLEN_HIDDEN_DEF |
| +#ifdef USE_AS_RAWMEMCHR |
| +weak_alias (__rawmemchr,rawmemchr) |
| +libc_hidden_builtin_def (__rawmemchr) |
| +#else |
| +# ifdef DEFINE_STRLEN_HIDDEN_DEF |
| weak_alias (__strlen, strlen) |
| libc_hidden_builtin_def (strlen) |
| +# endif |
| #endif |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile |
| index 1d517698429e1230..ac2446aca62cc4ab 100644 |
| |
| |
| @@ -33,9 +33,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ |
| |
| ifneq (,$(filter %le,$(config-machine))) |
| sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \ |
| + rawmemchr-power9 rawmemchr-power10 \ |
| strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ |
| - rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ |
| - strlen-power10 |
| + strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 |
| endif |
| CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops |
| CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c |
| index 6e36659d1903448a..127af84b32a8196f 100644 |
| |
| |
| @@ -257,6 +257,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| /* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c. */ |
| IFUNC_IMPL (i, name, rawmemchr, |
| #ifdef __LITTLE_ENDIAN__ |
| + IFUNC_IMPL_ADD (array, i, rawmemchr, |
| + (hwcap2 & PPC_FEATURE2_ARCH_3_1) |
| + && (hwcap & PPC_FEATURE_HAS_VSX), |
| + __rawmemchr_power10) |
| IFUNC_IMPL_ADD (array, i, rawmemchr, |
| hwcap2 & PPC_FEATURE2_ARCH_3_00, |
| __rawmemchr_power9) |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S |
| new file mode 100644 |
| index 0000000000000000..bf1ed7e1941f922d |
| |
| |
| @@ -0,0 +1,21 @@ |
| +/* Optimized rawmemchr implementation for PowerPC64/POWER10. |
| + Copyright (C) 2021 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <https://www.gnu.org/licenses/>. */ |
| + |
| +#define RAWMEMCHR __rawmemchr_power10 |
| + |
| +#include <sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S> |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c |
| index 2a7ae5a1ed02e556..369d6359e8987052 100644 |
| |
| |
| @@ -26,6 +26,7 @@ extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden; |
| extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden; |
| # ifdef __LITTLE_ENDIAN__ |
| extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden; |
| +extern __typeof (__rawmemchr) __rawmemchr_power10 attribute_hidden; |
| # endif |
| |
| # undef __rawmemchr |
| @@ -34,6 +35,9 @@ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden; |
| ifunc symbol properly. */ |
| libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr, |
| # ifdef __LITTLE_ENDIAN__ |
| + (hwcap2 & PPC_FEATURE2_ARCH_3_1) |
| + && (hwcap & PPC_FEATURE_HAS_VSX) |
| + ? __rawmemchr_power10 : |
| (hwcap2 & PPC_FEATURE2_ARCH_3_00) |
| ? __rawmemchr_power9 : |
| # endif |