| commit e941e0ae80626b7661c1db8953a673cafd3b8b19 |
| Author: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com> |
| Date: Fri Apr 30 18:12:08 2021 -0300 |
| |
| powerpc64le: Optimize memcpy for POWER10 |
| |
| This implementation is based on __memcpy_power8_cached and integrates |
| suggestions from Anton Blanchard. |
| It benefits from loads and stores with length for short lengths and for |
| tail code, simplifying the code. |
| |
| All unaligned memory accesses use instructions that do not generate |
| alignment interrupts on POWER10, making it safe to use on |
| caching-inhibited memory. |
| |
| The main loop has also been modified in order to increase instruction |
| throughput by reducing the dependency on updates from previous iterations. |
| |
| On average, this implementation provides around 30% improvement when |
| compared to __memcpy_power7 and 10% improvement in comparison to |
| __memcpy_power8_cached. |
| |
| diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcpy.S b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S |
| new file mode 100644 |
| index 0000000000000000..ad1414db4a3a8b9f |
| |
| |
| @@ -0,0 +1,198 @@ |
| +/* Optimized memcpy implementation for POWER10. |
| + Copyright (C) 2021 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <http://www.gnu.org/licenses/>. */ |
| + |
| +#include <sysdep.h> |
| + |
| + |
| +#ifndef MEMCPY |
| +# define MEMCPY memcpy |
| +#endif |
| + |
| +/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); |
| + Returns 'dst'. */ |
| + |
| + .machine power9 |
| +ENTRY_TOCLESS (MEMCPY, 5) |
| + CALL_MCOUNT 3 |
| + |
| + /* Copy up to 16 bytes. */ |
| + sldi r6,r5,56 /* Prepare [l|st]xvl counter. */ |
| + lxvl v10,r4,r6 |
| + stxvl v10,r3,r6 |
| + subic. r6,r5,16 /* Return if len <= 16. */ |
| + blelr |
| + |
| + /* If len >= 256, assume nothing got copied before and copy |
| + again. This might cause issues with overlapped memory, but memcpy |
| + is not expected to treat overlapped memory. */ |
| + cmpdi r5,256 |
| + bge L(copy_ge_256) |
| + /* 16 < len < 256 and the first 16 bytes have already been copied. */ |
| + addi r10,r3,16 /* Keep r3 intact as return value. */ |
| + addi r4,r4,16 |
| + subi r5,r5,16 |
| + b L(copy_lt_256) /* Avoid the main loop if len < 256. */ |
| + |
| + .p2align 5 |
| +L(copy_ge_256): |
| + mr r10,r3 /* Keep r3 intact as return value. */ |
| + /* Align dst to 16 bytes. */ |
| + andi. r9,r10,0xf |
| + beq L(dst_is_align_16) |
| + lxv v10,0(r4) |
| + subfic r12,r9,16 |
| + subf r5,r12,r5 |
| + add r4,r4,r12 |
| + stxv v10,0(r3) |
| + add r10,r3,r12 |
| + |
| +L(dst_is_align_16): |
| + srdi r9,r5,7 /* Divide by 128. */ |
| + mtctr r9 |
| + addi r6,r4,64 |
| + addi r7,r10,64 |
| + |
| + |
| + /* Main loop, copy 128 bytes per iteration. |
| + Use r6=src+64 and r7=dest+64 in order to reduce the dependency on |
| + r4 and r10. */ |
| + .p2align 5 |
| +L(copy_128): |
| + |
| + lxv v10, 0(r4) |
| + lxv v11, 16(r4) |
| + lxv v12, 32(r4) |
| + lxv v13, 48(r4) |
| + |
| + addi r4,r4,128 |
| + |
| + stxv v10, 0(r10) |
| + stxv v11, 16(r10) |
| + stxv v12, 32(r10) |
| + stxv v13, 48(r10) |
| + |
| + addi r10,r10,128 |
| + |
| + lxv v10, 0(r6) |
| + lxv v11, 16(r6) |
| + lxv v12, 32(r6) |
| + lxv v13, 48(r6) |
| + |
| + addi r6,r6,128 |
| + |
| + stxv v10, 0(r7) |
| + stxv v11, 16(r7) |
| + stxv v12, 32(r7) |
| + stxv v13, 48(r7) |
| + |
| + addi r7,r7,128 |
| + |
| + bdnz L(copy_128) |
| + |
| + clrldi. r5,r5,64-7 /* Have we copied everything? */ |
| + beqlr |
| + |
| + .p2align 5 |
| +L(copy_lt_256): |
| + cmpdi r5,16 |
| + ble L(copy_le_16) |
| + srdi. r9,r5,5 /* Divide by 32. */ |
| + beq L(copy_lt_32) |
| + mtctr r9 |
| + /* Use r6=src+32, r7=dest+32, r8=src+64, r9=dest+64 in order to reduce |
| + the dependency on r4 and r10. */ |
| + addi r6,r4,32 |
| + addi r7,r10,32 |
| + addi r8,r4,64 |
| + addi r9,r10,64 |
| + |
| + .p2align 5 |
| + /* Copy 32 bytes at a time, unaligned. |
| + The loop is unrolled 3 times in order to reduce the dependency on |
| + r4 and r10, copying up-to 96 bytes per iteration. */ |
| +L(copy_32): |
| + lxv v10, 0(r4) |
| + lxv v11, 16(r4) |
| + stxv v10, 0(r10) |
| + stxv v11, 16(r10) |
| + bdz L(end_copy_32a) |
| + addi r4,r4,96 |
| + addi r10,r10,96 |
| + |
| + lxv v10, 0(r6) |
| + lxv v11, 16(r6) |
| + addi r6,r6,96 |
| + stxv v10, 0(r7) |
| + stxv v11, 16(r7) |
| + bdz L(end_copy_32b) |
| + addi r7,r7,96 |
| + |
| + lxv v12, 0(r8) |
| + lxv v13, 16(r8) |
| + addi r8,r8,96 |
| + stxv v12, 0(r9) |
| + stxv v13, 16(r9) |
| + addi r9,r9,96 |
| + bdnz L(copy_32) |
| + |
| + clrldi. r5,r5,64-5 /* Have we copied everything? */ |
| + beqlr |
| + cmpdi r5,16 |
| + ble L(copy_le_16) |
| + b L(copy_lt_32) |
| + |
| + .p2align 5 |
| +L(end_copy_32a): |
| + clrldi. r5,r5,64-5 /* Have we copied everything? */ |
| + beqlr |
| + /* 32 bytes have been copied since the last update of r4 and r10. */ |
| + addi r4,r4,32 |
| + addi r10,r10,32 |
| + cmpdi r5,16 |
| + ble L(copy_le_16) |
| + b L(copy_lt_32) |
| + |
| + .p2align 5 |
| +L(end_copy_32b): |
| + clrldi. r5,r5,64-5 /* Have we copied everything? */ |
| + beqlr |
| + /* The last iteration of the loop copied 64 bytes. Update r4 and r10 |
| + accordingly. */ |
| + addi r4,r4,-32 |
| + addi r10,r10,-32 |
| + cmpdi r5,16 |
| + ble L(copy_le_16) |
| + |
| + .p2align 5 |
| +L(copy_lt_32): |
| + lxv v10, 0(r4) |
| + stxv v10, 0(r10) |
| + addi r4,r4,16 |
| + addi r10,r10,16 |
| + subi r5,r5,16 |
| + |
| + .p2align 5 |
| +L(copy_le_16): |
| + sldi r6,r5,56 |
| + lxvl v10,r4,r6 |
| + stxvl v10,r10,r6 |
| + blr |
| + |
| + |
| +END_GEN_TB (MEMCPY,TB_TOCLESS) |
| +libc_hidden_builtin_def (memcpy) |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile |
| index 66f8c6ace9824d4a..2e3c8f2e8a81cda4 100644 |
| |
| |
| @@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ |
| strncase-power8 |
| |
| ifneq (,$(filter %le,$(config-machine))) |
| -sysdep_routines += memmove-power10 \ |
| +sysdep_routines += memcpy-power10 memmove-power10 \ |
| strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ |
| rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ |
| strlen-power10 |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c |
| index 4ce04bc51574cca1..9d5a14e480c02171 100644 |
| |
| |
| @@ -51,6 +51,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, |
| #ifdef SHARED |
| /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c. */ |
| IFUNC_IMPL (i, name, memcpy, |
| +#ifdef __LITTLE_ENDIAN__ |
| + IFUNC_IMPL_ADD (array, i, memcpy, |
| + hwcap2 & PPC_FEATURE2_ARCH_3_1 |
| + && hwcap & PPC_FEATURE_HAS_VSX, |
| + __memcpy_power10) |
| +#endif |
| IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, |
| __memcpy_power8_cached) |
| IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX, |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S |
| new file mode 100644 |
| index 0000000000000000..70e0fc3ed610cdc3 |
| |
| |
| @@ -0,0 +1,26 @@ |
| +/* Optimized memcpy implementation for POWER10. |
| + Copyright (C) 2021 Free Software Foundation, Inc. |
| + This file is part of the GNU C Library. |
| + |
| + The GNU C Library is free software; you can redistribute it and/or |
| + modify it under the terms of the GNU Lesser General Public |
| + License as published by the Free Software Foundation; either |
| + version 2.1 of the License, or (at your option) any later version. |
| + |
| + The GNU C Library is distributed in the hope that it will be useful, |
| + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| + Lesser General Public License for more details. |
| + |
| + You should have received a copy of the GNU Lesser General Public |
| + License along with the GNU C Library; if not, see |
| + <https://www.gnu.org/licenses/>. */ |
| + |
| +#if defined __LITTLE_ENDIAN__ && IS_IN (libc) |
| +#define MEMCPY __memcpy_power10 |
| + |
| +#undef libc_hidden_builtin_def |
| +#define libc_hidden_builtin_def(name) |
| + |
| +#include <sysdeps/powerpc/powerpc64/le/power10/memcpy.S> |
| +#endif |
| diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c |
| index 44dea594f3770673..be0e47f32dde2ccf 100644 |
| |
| |
| @@ -36,8 +36,15 @@ extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden; |
| extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden; |
| extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden; |
| extern __typeof (__redirect_memcpy) __memcpy_power8_cached attribute_hidden; |
| +# if defined __LITTLE_ENDIAN__ |
| +extern __typeof (__redirect_memcpy) __memcpy_power10 attribute_hidden; |
| +# endif |
| |
| libc_ifunc (__libc_memcpy, |
| +# if defined __LITTLE_ENDIAN__ |
| + (hwcap2 & PPC_FEATURE2_ARCH_3_1 && hwcap & PPC_FEATURE_HAS_VSX) |
| + ? __memcpy_power10 : |
| +# endif |
| ((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt) |
| ? __memcpy_power8_cached : |
| (hwcap & PPC_FEATURE_HAS_VSX) |