|
|
179894 |
commit a55e2da2702e235fa0ae66a116d304d1bffc060a
|
|
|
179894 |
Author: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
|
|
|
179894 |
Date: Thu May 6 17:01:52 2021 -0300
|
|
|
179894 |
|
|
|
179894 |
powerpc: Optimized memcmp for power10
|
|
|
179894 |
|
|
|
179894 |
This patch was based on the __memcmp_power8 and the recent
|
|
|
179894 |
__strlen_power10.
|
|
|
179894 |
|
|
|
179894 |
Improvements from __memcmp_power8:
|
|
|
179894 |
|
|
|
179894 |
1. Don't need alignment code.
|
|
|
179894 |
|
|
|
179894 |
On POWER10 lxvp and lxvl do not generate alignment interrupts, so
|
|
|
179894 |
they are safe for use on caching-inhibited memory. Notice that the
|
|
|
179894 |
comparison on the main loop will wait for both VSRs to be ready.
|
|
|
179894 |
Therefore aligning one of the input addresses does not improve
|
|
|
179894 |
performance. In order to align both registers a vperm is necessary
|
|
|
179894 |
which adds too much overhead.
|
|
|
179894 |
|
|
|
179894 |
2. Uses new POWER10 instructions
|
|
|
179894 |
|
|
|
179894 |
This code uses lxvp to decrease contention on load by loading 32 bytes
|
|
|
179894 |
per instruction.
|
|
|
179894 |
The vextractbm is used to have a smaller tail code for calculating the
|
|
|
179894 |
return value.
|
|
|
179894 |
|
|
|
179894 |
3. Performance improvement
|
|
|
179894 |
|
|
|
179894 |
This version has around 35% better performance on average. I saw no
|
|
|
179894 |
performance regressions for any length or alignment.
|
|
|
179894 |
|
|
|
179894 |
Thanks Matheus for helping me out with some details.
|
|
|
179894 |
|
|
|
179894 |
Co-authored-by: Matheus Castanho <msc@linux.ibm.com>
|
|
|
179894 |
Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
|
|
|
179894 |
|
|
|
179894 |
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcmp.S b/sysdeps/powerpc/powerpc64/le/power10/memcmp.S
|
|
|
179894 |
new file mode 100644
|
|
|
179894 |
index 0000000000000000..52f244e7e77cbdf9
|
|
|
179894 |
--- /dev/null
|
|
|
179894 |
+++ b/sysdeps/powerpc/powerpc64/le/power10/memcmp.S
|
|
|
179894 |
@@ -0,0 +1,179 @@
|
|
|
179894 |
+/* Optimized memcmp implementation for POWER10.
|
|
|
179894 |
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
|
179894 |
+ This file is part of the GNU C Library.
|
|
|
179894 |
+
|
|
|
179894 |
+ The GNU C Library is free software; you can redistribute it and/or
|
|
|
179894 |
+ modify it under the terms of the GNU Lesser General Public
|
|
|
179894 |
+ License as published by the Free Software Foundation; either
|
|
|
179894 |
+ version 2.1 of the License, or (at your option) any later version.
|
|
|
179894 |
+
|
|
|
179894 |
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
|
179894 |
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
179894 |
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
179894 |
+ Lesser General Public License for more details.
|
|
|
179894 |
+
|
|
|
179894 |
+ You should have received a copy of the GNU Lesser General Public
|
|
|
179894 |
+ License along with the GNU C Library; if not, see
|
|
|
179894 |
+ <https://www.gnu.org/licenses/>. */
|
|
|
179894 |
+
|
|
|
179894 |
+#include <sysdep.h>
|
|
|
179894 |
+
|
|
|
179894 |
+/* TODO: Replace macros by the actual instructions when minimum binutils becomes
|
|
|
179894 |
+ >= 2.35. This is used to keep compatibility with older versions. */
|
|
|
179894 |
+#define VEXTRACTBM(rt,vrb) \
|
|
|
179894 |
+ .long(((4)<<(32-6)) \
|
|
|
179894 |
+ | ((rt)<<(32-11)) \
|
|
|
179894 |
+ | ((8)<<(32-16)) \
|
|
|
179894 |
+ | ((vrb)<<(32-21)) \
|
|
|
179894 |
+ | 1602)
|
|
|
179894 |
+
|
|
|
179894 |
+#define LXVP(xtp,dq,ra) \
|
|
|
179894 |
+ .long(((6)<<(32-6)) \
|
|
|
179894 |
+ | ((((xtp)-32)>>1)<<(32-10)) \
|
|
|
179894 |
+ | ((1)<<(32-11)) \
|
|
|
179894 |
+ | ((ra)<<(32-16)) \
|
|
|
179894 |
+ | dq)
|
|
|
179894 |
+
|
|
|
179894 |
+/* Compare 32 bytes. */
|
|
|
179894 |
+#define COMPARE_32(vr1,vr2,offset,tail_1,tail_2)\
|
|
|
179894 |
+ LXVP(32+vr1,offset,r3); \
|
|
|
179894 |
+ LXVP(32+vr2,offset,r4); \
|
|
|
179894 |
+ vcmpneb. v5,vr1+1,vr2+1; \
|
|
|
179894 |
+ bne cr6,L(tail_2); \
|
|
|
179894 |
+ vcmpneb. v4,vr1,vr2; \
|
|
|
179894 |
+ bne cr6,L(tail_1); \
|
|
|
179894 |
+
|
|
|
179894 |
+#define TAIL(v_res,s1,s2) \
|
|
|
179894 |
+ vctzlsbb r7,v_res; \
|
|
|
179894 |
+ vextubrx r8,r7,s1; \
|
|
|
179894 |
+ vextubrx r9,r7,s2; \
|
|
|
179894 |
+ subf r3,r9,r8; \
|
|
|
179894 |
+ blr; \
|
|
|
179894 |
+
|
|
|
179894 |
+/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4],
|
|
|
179894 |
+ size_t size [r5]) */
|
|
|
179894 |
+
|
|
|
179894 |
+#ifndef MEMCMP
|
|
|
179894 |
+# define MEMCMP memcmp
|
|
|
179894 |
+#endif
|
|
|
179894 |
+ .machine power9
|
|
|
179894 |
+ENTRY_TOCLESS (MEMCMP, 4)
|
|
|
179894 |
+ CALL_MCOUNT 3
|
|
|
179894 |
+
|
|
|
179894 |
+ cmpldi cr6,r5,64
|
|
|
179894 |
+ bgt cr6,L(loop_head)
|
|
|
179894 |
+
|
|
|
179894 |
+/* Compare up to 64 bytes. Used for lengths <= 64 and for the last
|
|
|
179894 |
+ bytes for larger lengths. */
|
|
|
179894 |
+L(last_compare):
|
|
|
179894 |
+ li r8,16
|
|
|
179894 |
+
|
|
|
179894 |
+ sldi r9,r5,56
|
|
|
179894 |
+ sldi r8,r8,56
|
|
|
179894 |
+ addi r6,r3,16
|
|
|
179894 |
+ addi r7,r4,16
|
|
|
179894 |
+
|
|
|
179894 |
+ /* Load up to 16 bytes from each buffer. */
|
|
|
179894 |
+ lxvl 32+v0,r3,r9
|
|
|
179894 |
+ lxvl 32+v2,r4,r9
|
|
|
179894 |
+
|
|
|
179894 |
+ /* The sub. and vcmpneb. results are concatenated by the crnand in order
|
|
|
179894 |
+ to do a single branch. It's doing a NOT(CR0.GT AND CR6.EQ) then
|
|
|
179894 |
+ loading to CR0.LT. That means r9 is not bigger than 0 or v4 is not
|
|
|
179894 |
+ all equal to 0. */
|
|
|
179894 |
+ sub. r9,r9,r8
|
|
|
179894 |
+ vcmpneb. v4,v0,v2
|
|
|
179894 |
+ crnand 4*cr0+lt,4*cr0+gt,4*cr6+eq
|
|
|
179894 |
+ bt 4*cr0+lt,L(tail1)
|
|
|
179894 |
+
|
|
|
179894 |
+ addi r3,r3,32
|
|
|
179894 |
+ addi r4,r4,32
|
|
|
179894 |
+
|
|
|
179894 |
+ lxvl 32+v1,r6,r9
|
|
|
179894 |
+ lxvl 32+v3,r7,r9
|
|
|
179894 |
+ sub. r9,r9,r8
|
|
|
179894 |
+ vcmpneb. v5,v1,v3
|
|
|
179894 |
+ crnand 4*cr0+lt,4*cr0+gt,4*cr6+eq
|
|
|
179894 |
+ bt 4*cr0+lt,L(tail2)
|
|
|
179894 |
+
|
|
|
179894 |
+ addi r6,r3,16
|
|
|
179894 |
+ addi r7,r4,16
|
|
|
179894 |
+
|
|
|
179894 |
+ lxvl 32+v6,r3,r9
|
|
|
179894 |
+ lxvl 32+v8,r4,r9
|
|
|
179894 |
+ sub. r9,r9,r8
|
|
|
179894 |
+ vcmpneb. v4,v6,v8
|
|
|
179894 |
+ crnand 4*cr0+lt,4*cr0+gt,4*cr6+eq
|
|
|
179894 |
+ bt 4*cr0+lt,L(tail3)
|
|
|
179894 |
+
|
|
|
179894 |
+ lxvl 32+v7,r6,r9
|
|
|
179894 |
+ lxvl 32+v9,r7,r9
|
|
|
179894 |
+ vcmpneb. v5,v7,v9
|
|
|
179894 |
+ bne cr6,L(tail4)
|
|
|
179894 |
+
|
|
|
179894 |
+L(finish):
|
|
|
179894 |
+ /* The contents are equal. */
|
|
|
179894 |
+ li r3,0
|
|
|
179894 |
+ blr
|
|
|
179894 |
+
|
|
|
179894 |
+L(loop_head):
|
|
|
179894 |
+ /* Calculate how many loops to run. */
|
|
|
179894 |
+ srdi. r8,r5,7
|
|
|
179894 |
+ beq L(loop_tail)
|
|
|
179894 |
+ mtctr r8
|
|
|
179894 |
+
|
|
|
179894 |
+/* Main loop. Compares 128 bytes each loop. */
|
|
|
179894 |
+ .p2align 5
|
|
|
179894 |
+L(loop_128):
|
|
|
179894 |
+ COMPARE_32(v0,v2,0,tail1,tail2)
|
|
|
179894 |
+ COMPARE_32(v6,v8,32,tail3,tail4)
|
|
|
179894 |
+ COMPARE_32(v10,v12,64,tail5,tail6)
|
|
|
179894 |
+ COMPARE_32(v14,v16,96,tail7,tail8)
|
|
|
179894 |
+
|
|
|
179894 |
+ addi r3,r3,128
|
|
|
179894 |
+ addi r4,r4,128
|
|
|
179894 |
+ bdnz L(loop_128)
|
|
|
179894 |
+
|
|
|
179894 |
+ /* Keep only the bytes left over after the 128-byte loop (size % 128). */
|
|
|
179894 |
+ clrldi. r5,r5,57
|
|
|
179894 |
+ beq L(finish)
|
|
|
179894 |
+
|
|
|
179894 |
+/* Compares 64 bytes if length is still bigger than 64 bytes. */
|
|
|
179894 |
+ .p2align 5
|
|
|
179894 |
+L(loop_tail):
|
|
|
179894 |
+ cmpldi r5,64
|
|
|
179894 |
+ ble L(last_compare)
|
|
|
179894 |
+ COMPARE_32(v0,v2,0,tail1,tail2)
|
|
|
179894 |
+ COMPARE_32(v6,v8,32,tail3,tail4)
|
|
|
179894 |
+ addi r3,r3,64
|
|
|
179894 |
+ addi r4,r4,64
|
|
|
179894 |
+ subi r5,r5,64
|
|
|
179894 |
+ b L(last_compare)
|
|
|
179894 |
+
|
|
|
179894 |
+L(tail1):
|
|
|
179894 |
+ TAIL(v4,v0,v2)
|
|
|
179894 |
+
|
|
|
179894 |
+L(tail2):
|
|
|
179894 |
+ TAIL(v5,v1,v3)
|
|
|
179894 |
+
|
|
|
179894 |
+L(tail3):
|
|
|
179894 |
+ TAIL(v4,v6,v8)
|
|
|
179894 |
+
|
|
|
179894 |
+L(tail4):
|
|
|
179894 |
+ TAIL(v5,v7,v9)
|
|
|
179894 |
+
|
|
|
179894 |
+L(tail5):
|
|
|
179894 |
+ TAIL(v4,v10,v12)
|
|
|
179894 |
+
|
|
|
179894 |
+L(tail6):
|
|
|
179894 |
+ TAIL(v5,v11,v13)
|
|
|
179894 |
+
|
|
|
179894 |
+L(tail7):
|
|
|
179894 |
+ TAIL(v4,v14,v16)
|
|
|
179894 |
+
|
|
|
179894 |
+L(tail8):
|
|
|
179894 |
+ TAIL(v5,v15,v17)
|
|
|
179894 |
+
|
|
|
179894 |
+END (MEMCMP)
|
|
|
179894 |
+libc_hidden_builtin_def (memcmp)
|
|
|
179894 |
+weak_alias (memcmp, bcmp)
|
|
|
179894 |
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
|
|
179894 |
index ac2446aca62cc4ab..ee98417f4a383356 100644
|
|
|
179894 |
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
|
|
179894 |
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
|
|
179894 |
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
|
|
|
179894 |
strncase-power8
|
|
|
179894 |
|
|
|
179894 |
ifneq (,$(filter %le,$(config-machine)))
|
|
|
179894 |
-sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
|
|
|
179894 |
+sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
|
|
|
179894 |
rawmemchr-power9 rawmemchr-power10 \
|
|
|
179894 |
strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
|
|
|
179894 |
strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
|
|
|
179894 |
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
|
|
179894 |
index 127af84b32a8196f..5213abdf87c79c88 100644
|
|
|
179894 |
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
|
|
179894 |
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
|
|
179894 |
@@ -184,6 +184,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
|
179894 |
|
|
|
179894 |
/* Support sysdeps/powerpc/powerpc64/multiarch/memcmp.c. */
|
|
|
179894 |
IFUNC_IMPL (i, name, memcmp,
|
|
|
179894 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
179894 |
+ IFUNC_IMPL_ADD (array, i, memcmp,
|
|
|
179894 |
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
|
|
|
179894 |
+ && hwcap & PPC_FEATURE_HAS_VSX,
|
|
|
179894 |
+ __memcmp_power10)
|
|
|
179894 |
+#endif
|
|
|
179894 |
IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
|
|
|
179894 |
__memcmp_power8)
|
|
|
179894 |
IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_HAS_VSX,
|
|
|
179894 |
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
|
|
|
179894 |
new file mode 100644
|
|
|
179894 |
index 0000000000000000..73a0debd4a811d8e
|
|
|
179894 |
--- /dev/null
|
|
|
179894 |
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
|
|
|
179894 |
@@ -0,0 +1,26 @@
|
|
|
179894 |
+/* Optimized memcmp implementation for POWER10.
|
|
|
179894 |
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
|
|
179894 |
+ This file is part of the GNU C Library.
|
|
|
179894 |
+
|
|
|
179894 |
+ The GNU C Library is free software; you can redistribute it and/or
|
|
|
179894 |
+ modify it under the terms of the GNU Lesser General Public
|
|
|
179894 |
+ License as published by the Free Software Foundation; either
|
|
|
179894 |
+ version 2.1 of the License, or (at your option) any later version.
|
|
|
179894 |
+
|
|
|
179894 |
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
|
179894 |
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
179894 |
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
179894 |
+ Lesser General Public License for more details.
|
|
|
179894 |
+
|
|
|
179894 |
+ You should have received a copy of the GNU Lesser General Public
|
|
|
179894 |
+ License along with the GNU C Library; if not, see
|
|
|
179894 |
+ <https://www.gnu.org/licenses/>. */
|
|
|
179894 |
+
|
|
|
179894 |
+#define MEMCMP __memcmp_power10
|
|
|
179894 |
+
|
|
|
179894 |
+#undef libc_hidden_builtin_def
|
|
|
179894 |
+#define libc_hidden_builtin_def(name)
|
|
|
179894 |
+#undef weak_alias
|
|
|
179894 |
+#define weak_alias(name,alias)
|
|
|
179894 |
+
|
|
|
179894 |
+#include <sysdeps/powerpc/powerpc64/le/power10/memcmp.S>
|
|
|
179894 |
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
|
|
|
179894 |
index 2c7a083a6560f920..0b8c0c1d8aa3f90a 100644
|
|
|
179894 |
--- a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
|
|
|
179894 |
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
|
|
|
179894 |
@@ -27,11 +27,17 @@ extern __typeof (memcmp) __memcmp_ppc attribute_hidden;
|
|
|
179894 |
extern __typeof (memcmp) __memcmp_power4 attribute_hidden;
|
|
|
179894 |
extern __typeof (memcmp) __memcmp_power7 attribute_hidden;
|
|
|
179894 |
extern __typeof (memcmp) __memcmp_power8 attribute_hidden;
|
|
|
179894 |
+extern __typeof (memcmp) __memcmp_power10 attribute_hidden;
|
|
|
179894 |
# undef memcmp
|
|
|
179894 |
|
|
|
179894 |
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
|
|
|
179894 |
ifunc symbol properly. */
|
|
|
179894 |
libc_ifunc_redirected (__redirect_memcmp, memcmp,
|
|
|
179894 |
+#ifdef __LITTLE_ENDIAN__
|
|
|
179894 |
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
|
|
|
179894 |
+ && hwcap & PPC_FEATURE_HAS_VSX)
|
|
|
179894 |
+ ? __memcmp_power10 :
|
|
|
179894 |
+#endif
|
|
|
179894 |
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
|
|
|
179894 |
? __memcmp_power8 :
|
|
|
179894 |
(hwcap & PPC_FEATURE_HAS_VSX)
|