commit fa527f345cbbe852ec085932fbea979956c195b5
Author: Naohiro Tamura <naohirot@jp.fujitsu.com>
Date:   Thu May 27 07:42:35 2021 +0000

    aarch64: Added optimized memcpy and memmove for A64FX

    This patch optimizes the performance of memcpy/memmove for A64FX [1],
    which implements ARMv8-A SVE and has a 64KB L1 cache per core and an
    8MB L2 cache per NUMA node.

    The optimization uses the Scalable Vector Registers together with
    several techniques such as loop unrolling, memory access alignment,
    cache zero fill, and software pipelining.

    The SVE assembler code for memcpy/memmove is written as Vector Length
    Agnostic code, so in principle it can run on any SoC that implements
    the ARMv8-A SVE standard.

    We confirmed that all test cases pass by running 'make check' and
    'make xcheck', not only on A64FX but also on ThunderX2.

    We also confirmed with 'make bench' that the SVE 512-bit vector
    register performance is roughly 4 times that of the Advanced SIMD
    128-bit registers and 8 times that of the scalar 64-bit registers.

    [1] https://github.com/fujitsu/A64FX

    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
    Reviewed-by: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
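
As background, the "Vector Length Agnostic" style referred to above can be
sketched in C with the SVE ACLE intrinsics. This sketch is not part of the
patch and the helper name vla_copy is made up; the patch implements the same
idea in hand-written assembly, adding unrolling, alignment handling, zero
fill and software pipelining on top of it:

    #include <arm_sve.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Copy n bytes, one SVE vector per iteration.  The predicate from
       svwhilelt_b8_u64 masks off the tail, so the same loop works for any
       vector length (128-bit on most cores, 512-bit on A64FX).  */
    void
    vla_copy (uint8_t *dst, const uint8_t *src, size_t n)
    {
      for (size_t i = 0; i < n; i += svcntb ())
        {
          svbool_t pg = svwhilelt_b8_u64 (i, n);
          svst1_u8 (pg, dst + i, svld1_u8 (pg, src + i));
        }
    }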

Conflicts:
	manual/tunables.texi
	sysdeps/aarch64/multiarch/Makefile
	sysdeps/aarch64/multiarch/ifunc-impl-list.c
	sysdeps/aarch64/multiarch/init-arch.h
	sysdeps/aarch64/multiarch/memcpy.c
	sysdeps/aarch64/multiarch/memmove.c
	sysdeps/unix/sysv/linux/aarch64/cpu-features.c
	sysdeps/unix/sysv/linux/aarch64/cpu-features.h
	  (all conflicts due to missing optimizations for other CPUs)
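
The "cache zero fill" mentioned in the commit message is the L(L2) path in
memcpy_a64fx.S below: for large copies with a 512-bit vector length, the
destination cache line ZF_DIST bytes ahead of the store pointer is cleared
with "dc zva", so the line can be allocated in the cache without first being
read back from memory.  A rough, hypothetical C equivalent of a single step
(the helper name zero_fill_ahead is illustrative, and the 256-byte ZVA block
size is the value the patch hard-codes for A64FX):

    #include <stdint.h>

    #define CACHE_LINE_SIZE 256               /* A64FX cache line / ZVA block */
    #define ZF_DIST (CACHE_LINE_SIZE * 21)    /* zero-fill distance from the patch */

    /* Zero the destination cache line ZF_DIST bytes ahead of the current
       store position so that later stores hit an already-allocated line.  */
    static inline void
    zero_fill_ahead (char *dest_ptr)
    {
      __asm__ volatile ("dc zva, %0" : : "r" (dest_ptr + ZF_DIST) : "memory");
    }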
diff --git a/manual/tunables.texi b/manual/tunables.texi
index bd737b5d57080462..07887981748bc44b 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -386,7 +386,7 @@ This tunable is specific to powerpc, powerpc64 and powerpc64le.
 The @code{glibc.cpu.name=xxx} tunable allows the user to tell @theglibc{} to
 assume that the CPU is @code{xxx} where xxx may have one of these values:
 @code{generic}, @code{falkor}, @code{thunderxt88}, @code{thunderx2t99},
-@code{thunderx2t99p1}.
+@code{thunderx2t99p1}, @code{a64fx}.
 
 This tunable is specific to aarch64.
 @end deftp
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 57ffdf72382c0a44..5a19ba0308e80983 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,5 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
-		   memcpy_falkor memmove_falkor memset_generic memset_falkor
+		   memcpy_falkor memcpy_a64fx \
+		   memmove_falkor memset_generic memset_falkor
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index e55be80103b948a2..f53db12acce37877 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@
 #include <stdio.h>
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	4
+#define MAX_IFUNC	7
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -42,10 +42,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+#if HAVE_AARCH64_SVE_ASM
+	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+#if HAVE_AARCH64_SVE_ASM
+	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
 	      /* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index d1e5703cb25fdcff..65dc8f82ff23c754 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -22,4 +22,6 @@
   uint64_t __attribute__((unused)) midr =				      \
     GLRO(dl_aarch64_cpu_features).midr_el1;				      \
   unsigned __attribute__((unused)) zva_size =				      \
-    GLRO(dl_aarch64_cpu_features).zva_size;
+    GLRO(dl_aarch64_cpu_features).zva_size;				      \
+  bool __attribute__((unused)) sve =					      \
+    GLRO(dl_aarch64_cpu_features).sve;
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 4a04a63b0fe0c84b..e0313c42e82a7b86 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -32,6 +32,9 @@ extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
+# endif
 
 libc_ifunc (__libc_memcpy,
             (IS_THUNDERX (midr)
@@ -40,8 +43,13 @@ libc_ifunc (__libc_memcpy,
 		? __memcpy_falkor
 		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
 		  ? __memcpy_thunderx2
+# if HAVE_AARCH64_SVE_ASM
+		  : (IS_A64FX (midr)
+		     ? __memcpy_a64fx
+		     : __memcpy_generic)))));
+# else
 		  : __memcpy_generic))));
-
+# endif
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
 #endif
diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
new file mode 100644
index 0000000000000000..65528405bb123737
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
@@ -0,0 +1,406 @@
+/* Optimized memcpy for Fujitsu A64FX processor.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8.2-a, AArch64, unaligned accesses, sve
+ *
+ */
+
+#define L2_SIZE		(8*1024*1024)/2	// L2 8MB/2
+#define CACHE_LINE_SIZE	256
+#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
+#define dest		x0
+#define src		x1
+#define n		x2	// size
+#define tmp1		x3
+#define tmp2		x4
+#define tmp3		x5
+#define rest		x6
+#define dest_ptr	x7
+#define src_ptr		x8
+#define vector_length	x9
+#define cl_remainder	x10	// CACHE_LINE_SIZE remainder
+
+#if HAVE_AARCH64_SVE_ASM
+# if IS_IN (libc)
+#  define MEMCPY __memcpy_a64fx
+#  define MEMMOVE __memmove_a64fx
+
+	.arch armv8.2-a+sve
+
+	.macro dc_zva times
+	dc	zva, tmp1
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	.if \times-1
+	dc_zva "(\times-1)"
+	.endif
+	.endm
+
+	.macro ld1b_unroll8
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p0/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p0/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p0/z, [src_ptr, #3, mul vl]
+	ld1b	z4.b, p0/z, [src_ptr, #4, mul vl]
+	ld1b	z5.b, p0/z, [src_ptr, #5, mul vl]
+	ld1b	z6.b, p0/z, [src_ptr, #6, mul vl]
+	ld1b	z7.b, p0/z, [src_ptr, #7, mul vl]
+	.endm
+
+	.macro stld1b_unroll4a
+	st1b	z0.b, p0,   [dest_ptr, #0, mul vl]
+	st1b	z1.b, p0,   [dest_ptr, #1, mul vl]
+	ld1b	z0.b, p0/z, [src_ptr,  #0, mul vl]
+	ld1b	z1.b, p0/z, [src_ptr,  #1, mul vl]
+	st1b	z2.b, p0,   [dest_ptr, #2, mul vl]
+	st1b	z3.b, p0,   [dest_ptr, #3, mul vl]
+	ld1b	z2.b, p0/z, [src_ptr,  #2, mul vl]
+	ld1b	z3.b, p0/z, [src_ptr,  #3, mul vl]
+	.endm
+
+	.macro stld1b_unroll4b
+	st1b	z4.b, p0,   [dest_ptr, #4, mul vl]
+	st1b	z5.b, p0,   [dest_ptr, #5, mul vl]
+	ld1b	z4.b, p0/z, [src_ptr,  #4, mul vl]
+	ld1b	z5.b, p0/z, [src_ptr,  #5, mul vl]
+	st1b	z6.b, p0,   [dest_ptr, #6, mul vl]
+	st1b	z7.b, p0,   [dest_ptr, #7, mul vl]
+	ld1b	z6.b, p0/z, [src_ptr,  #6, mul vl]
+	ld1b	z7.b, p0/z, [src_ptr,  #7, mul vl]
+	.endm
+
+	.macro stld1b_unroll8
+	stld1b_unroll4a
+	stld1b_unroll4b
+	.endm
+
+	.macro st1b_unroll8
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p0, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p0, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p0, [dest_ptr, #3, mul vl]
+	st1b	z4.b, p0, [dest_ptr, #4, mul vl]
+	st1b	z5.b, p0, [dest_ptr, #5, mul vl]
+	st1b	z6.b, p0, [dest_ptr, #6, mul vl]
+	st1b	z7.b, p0, [dest_ptr, #7, mul vl]
+	.endm
+
+	.macro shortcut_for_small_size exit
+	// if rest <= vector_length * 2
+	whilelo	p0.b, xzr, n
+	whilelo	p1.b, vector_length, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	ret
+1:	// if rest > vector_length * 8
+	cmp	n, vector_length, lsl 3 // vector_length * 8
+	b.hi	\exit
+	// if rest <= vector_length * 4
+	lsl	tmp1, vector_length, 1  // vector_length * 2
+	whilelo	p2.b, tmp1, n
+	incb	tmp1
+	whilelo	p3.b, tmp1, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	ret
+1:	// if rest <= vector_length * 8
+	lsl	tmp1, vector_length, 2  // vector_length * 4
+	whilelo	p4.b, tmp1, n
+	incb	tmp1
+	whilelo	p5.b, tmp1, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	ld1b	z4.b, p4/z, [src, #4, mul vl]
+	ld1b	z5.b, p5/z, [src, #5, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	st1b	z4.b, p4, [dest, #4, mul vl]
+	st1b	z5.b, p5, [dest, #5, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	incb	tmp1			// vector_length * 5
+	incb	tmp1			// vector_length * 6
+	whilelo	p6.b, tmp1, n
+	incb	tmp1
+	whilelo	p7.b, tmp1, n
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	ld1b	z4.b, p4/z, [src, #4, mul vl]
+	ld1b	z5.b, p5/z, [src, #5, mul vl]
+	ld1b	z6.b, p6/z, [src, #6, mul vl]
+	ld1b	z7.b, p7/z, [src, #7, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	st1b	z4.b, p4, [dest, #4, mul vl]
+	st1b	z5.b, p5, [dest, #5, mul vl]
+	st1b	z6.b, p6, [dest, #6, mul vl]
+	st1b	z7.b, p7, [dest, #7, mul vl]
+	ret
+	.endm
+
+ENTRY (MEMCPY)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+L(memcpy):
+	cntb	vector_length
+	// shortcut for less than vector_length * 8
+	// gives a free ptrue to p0.b for n >= vector_length
+	shortcut_for_small_size L(vl_agnostic)
+	// end of shortcut
+
+L(vl_agnostic): // VL Agnostic
+	mov	rest, n
+	mov	dest_ptr, dest
+	mov	src_ptr, src
+	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
+	mov	tmp1, 64
+	cmp	rest, L2_SIZE
+	ccmp	vector_length, tmp1, 0, cs
+	b.eq	L(L2)
+
+L(unroll8): // unrolling and software pipeline
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	.p2align 3
+	cmp	 rest, tmp1
+	b.cc	L(last)
+	ld1b_unroll8
+	add	src_ptr, src_ptr, tmp1
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.cc	2f
+	.p2align 3
+1:	stld1b_unroll8
+	add	dest_ptr, dest_ptr, tmp1
+	add	src_ptr, src_ptr, tmp1
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.ge	1b
+2:	st1b_unroll8
+	add	dest_ptr, dest_ptr, tmp1
+
+	.p2align 3
+L(last):
+	whilelo	p0.b, xzr, rest
+	whilelo	p1.b, vector_length, rest
+	b.last	1f
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p2.b, tmp1, rest
+	incb	tmp1
+	whilelo	p3.b, tmp1, rest
+	b.last	1f
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	whilelo	p4.b, tmp1, rest
+	incb	tmp1
+	whilelo	p5.b, tmp1, rest
+	incb	tmp1
+	whilelo	p6.b, tmp1, rest
+	incb	tmp1
+	whilelo	p7.b, tmp1, rest
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
+	ld1b	z4.b, p4/z, [src_ptr, #4, mul vl]
+	ld1b	z5.b, p5/z, [src_ptr, #5, mul vl]
+	ld1b	z6.b, p6/z, [src_ptr, #6, mul vl]
+	ld1b	z7.b, p7/z, [src_ptr, #7, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+	st1b	z4.b, p4, [dest_ptr, #4, mul vl]
+	st1b	z5.b, p5, [dest_ptr, #5, mul vl]
+	st1b	z6.b, p6, [dest_ptr, #6, mul vl]
+	st1b	z7.b, p7, [dest_ptr, #7, mul vl]
+	ret
+
+L(L2):
+	// align dest address at CACHE_LINE_SIZE byte boundary
+	mov	tmp1, CACHE_LINE_SIZE
+	ands	tmp2, dest_ptr, CACHE_LINE_SIZE - 1
+	// if cl_remainder == 0
+	b.eq	L(L2_dc_zva)
+	sub	cl_remainder, tmp1, tmp2
+	// process remainder until the first CACHE_LINE_SIZE boundary
+	whilelo	p1.b, xzr, cl_remainder	// keep p0.b all true
+	whilelo	p2.b, vector_length, cl_remainder
+	b.last	1f
+	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
+	b	2f
+1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p3.b, tmp1, cl_remainder
+	incb	tmp1
+	whilelo	p4.b, tmp1, cl_remainder
+	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #2, mul vl]
+	ld1b	z4.b, p4/z, [src_ptr, #3, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #2, mul vl]
+	st1b	z4.b, p4, [dest_ptr, #3, mul vl]
+2:	add	dest_ptr, dest_ptr, cl_remainder
+	add	src_ptr, src_ptr, cl_remainder
+	sub	rest, rest, cl_remainder
+
+L(L2_dc_zva):
+	// zero fill
+	and	tmp1, dest, 0xffffffffffffff
+	and	tmp2, src, 0xffffffffffffff
+	subs	tmp1, tmp1, tmp2	// diff
+	b.ge	1f
+	neg	tmp1, tmp1
+1:	mov	tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
+	cmp	tmp1, tmp3
+	b.lo	L(unroll8)
+	mov	tmp1, dest_ptr
+	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
+	// unroll
+	ld1b_unroll8	// this line has to be after "b.lo L(unroll8)"
+	add	 src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+	sub	 rest, rest, CACHE_LINE_SIZE * 2
+	mov	 tmp1, ZF_DIST
+	.p2align 3
+1:	stld1b_unroll4a
+	add	tmp2, dest_ptr, tmp1	// dest_ptr + ZF_DIST
+	dc	zva, tmp2
+	stld1b_unroll4b
+	add	tmp2, tmp2, CACHE_LINE_SIZE
+	dc	zva, tmp2
+	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	cmp	rest, tmp3	// ZF_DIST + CACHE_LINE_SIZE * 2
+	b.ge	1b
+	st1b_unroll8
+	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+	b	L(unroll8)
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+
+ENTRY (MEMMOVE)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	// remove tag address
+	// dest has to be immutable because it is the return value
+	// src has to be immutable because it is used in L(bwd_last)
+	and	tmp2, dest, 0xffffffffffffff	// save dest_notag into tmp2
+	and	tmp3, src, 0xffffffffffffff	// save src_notag into tmp3
+	cmp	n, 0
+	ccmp	tmp2, tmp3, 4, ne
+	b.ne	1f
+	ret
+1:	cntb	vector_length
+	// shortcut for less than vector_length * 8
+	// gives a free ptrue to p0.b for n >= vector_length
+	// tmp2 and tmp3 should not be used in this macro to keep
+	// notag addresses
+	shortcut_for_small_size L(dispatch)
+	// end of shortcut
+
+L(dispatch):
+	// tmp2 = dest_notag, tmp3 = src_notag
+	// diff = dest_notag - src_notag
+	sub	tmp1, tmp2, tmp3
+	// if diff <= 0 || diff >= n then memcpy
+	cmp	tmp1, 0
+	ccmp	tmp1, n, 2, gt
+	b.cs	L(vl_agnostic)
+
+L(bwd_start):
+	mov	rest, n
+	add	dest_ptr, dest, n	// dest_end
+	add	src_ptr, src, n		// src_end
+
+L(bwd_unroll8): // unrolling and software pipeline
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	.p2align 3
+	cmp	rest, tmp1
+	b.cc	L(bwd_last)
+	sub	src_ptr, src_ptr, tmp1
+	ld1b_unroll8
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.cc	2f
+	.p2align 3
+1:	sub	src_ptr, src_ptr, tmp1
+	sub	dest_ptr, dest_ptr, tmp1
+	stld1b_unroll8
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.ge	1b
+2:	sub	dest_ptr, dest_ptr, tmp1
+	st1b_unroll8
+
+L(bwd_last):
+	mov	dest_ptr, dest
+	mov	src_ptr, src
+	b	L(last)
+
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+# endif /* IS_IN (libc) */
+#endif /* HAVE_AARCH64_SVE_ASM */
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index e69d8162910b938e..d96612b9cf7c3a4e 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -31,14 +31,22 @@ extern __typeof (__redirect_memmove) __libc_memmove;
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
+# endif
 
 libc_ifunc (__libc_memmove,
             (IS_THUNDERX (midr)
 	     ? __memmove_thunderx
 	     : (IS_FALKOR (midr) || IS_PHECDA (midr)
 		? __memmove_falkor
+# if HAVE_AARCH64_SVE_ASM
+		: (IS_A64FX (midr)
+		   ? __memmove_a64fx
+		   : __memmove_generic))));
+# else
 		: __memmove_generic)));
-
+# endif
 # undef memmove
 strong_alias (__libc_memmove, memmove);
 #endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index b4f348509eb1c6b3..71e4355c972f1ffb 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -36,6 +36,7 @@ static struct cpu_list cpu_list[] = {
       {"thunderx2t99",   0x431F0AF0},
       {"thunderx2t99p1", 0x420F5160},
       {"phecda",	 0x680F0000},
+      {"a64fx",		 0x460F0010},
       {"generic", 	 0x0}
 };
 
@@ -80,4 +81,7 @@ init_cpu_features (struct cpu_features *cpu_features)
 
   if ((dczid & DCZID_DZP_MASK) == 0)
     cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
+
+  /* Check if SVE is supported.  */
+  cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
 }
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index eb35adfbe9d429d5..5691aea6de3cb7f4 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -20,6 +20,7 @@
 #define _CPU_FEATURES_AARCH64_H
 
 #include <stdint.h>
+#include <stdbool.h>
 
 #define MIDR_PARTNUM_SHIFT	4
 #define MIDR_PARTNUM_MASK	(0xfff << MIDR_PARTNUM_SHIFT)
@@ -52,10 +53,14 @@
 #define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h'			      \
                         && MIDR_PARTNUM(midr) == 0x000)
 
+#define IS_A64FX(midr) (MIDR_IMPLEMENTOR(midr) == 'F'			      \
+			&& MIDR_PARTNUM(midr) == 0x001)
+
 struct cpu_features
 {
   uint64_t midr_el1;
   unsigned zva_size;
+  bool sve;
 };
 
 #endif /* _CPU_FEATURES_AARCH64_H  */