commit 4f26956d5ba394eb3ade6c1c20b5c16864a00766
Author: Naohiro Tamura <naohirot@jp.fujitsu.com>
Date:   Thu May 27 07:44:12 2021 +0000

    aarch64: Added optimized memset for A64FX

    This patch optimizes the performance of memset for A64FX [1], which
    implements ARMv8-A SVE and has a 64 KB L1 cache per core and an 8 MB
    L2 cache per NUMA node.

    The optimization makes use of the Scalable Vector Registers together
    with several techniques such as loop unrolling, memory access
    alignment, cache zero fill, and prefetch.

    The SVE assembler code for memset is written as Vector Length
    Agnostic code, so in principle it can run on any SoC that implements
    the ARMv8-A SVE standard.

    We confirmed that all test cases pass by running 'make check' and
    'make xcheck', not only on A64FX but also on ThunderX2.

    We also confirmed with 'make bench' that the SVE 512-bit vector
    register implementation is roughly 4 times faster than the Advanced
    SIMD 128-bit register implementation and 8 times faster than the
    scalar 64-bit register implementation.

    [1] https://github.com/fujitsu/A64FX

    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
    Reviewed-by: Szabolcs Nagy <Szabolcs.Nagy@arm.com>

Conflicts:
	sysdeps/aarch64/multiarch/Makefile
	sysdeps/aarch64/multiarch/ifunc-impl-list.c
	sysdeps/aarch64/multiarch/memset.c
	  (all conflicts due to missing other CPU implementations downstream)

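As a rough illustration of the Vector Length Agnostic, predicated-store
approach described in the commit message, the core idea can be sketched
in C with the ACLE SVE intrinsics from <arm_sve.h>.  This sketch is not
part of the patch (sve_memset_sketch is a hypothetical name) and omits
the unrolling, alignment, prefetch and DC ZVA handling that the real
assembler below adds for large sizes:

    #include <arm_sve.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Minimal VLA memset sketch: one predicated store per vector-length
       chunk; svwhilelt_b8 builds a partial predicate for the final short
       chunk, so no scalar tail loop is needed (compare the whilelo/st1b
       pairs in memset_a64fx.S).  */
    void *
    sve_memset_sketch (void *dst, int c, size_t n)
    {
      uint8_t *p = dst;
      svuint8_t v = svdup_n_u8 ((uint8_t) c);   /* like "dup z0.b, valw" */
      uint64_t vl = svcntb ();                  /* like "cntb vector_length" */

      for (uint64_t i = 0; i < n; i += vl)
        {
          svbool_t pg = svwhilelt_b8 (i, (uint64_t) n);
          svst1_u8 (pg, p + i, v);
        }
      return dst;
    }

Because the loop is written only in terms of svcntb (), the same code
works for any SVE vector length, which is what "Vector Length Agnostic"
means in the commit message.
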
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 5a19ba0308e80983..5ff883a8ad8e3067 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,5 +1,6 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memcpy_a64fx \
-		   memmove_falkor memset_generic memset_falkor
+		   memmove_falkor memset_generic memset_falkor \
+		   memset_a64fx
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index f53db12acce37877..53e3e162a1025e40 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -37,7 +37,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   INIT_ARCH ();
 
-  /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
+  /* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c.  */
   IFUNC_IMPL (i, name, memcpy,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
@@ -57,6 +57,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      /* Enable this on non-falkor processors too so that other cores
 		 can do a comparative analysis with __memset_generic.  */
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+#if HAVE_AARCH64_SVE_ASM
+	      IFUNC_IMPL_ADD (array, i, memset, sve, __memset_a64fx)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
 
   return i;
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index d74ed3a549a54b10..2c8cc72bb0b18474 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -29,12 +29,21 @@
 extern __typeof (__redirect_memset) __libc_memset;
 
 extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
+# endif
 extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
 
 libc_ifunc (__libc_memset,
 	    ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
 	     ? __memset_falkor
+# if HAVE_AARCH64_SVE_ASM
+	     : (IS_A64FX (midr)
+		? __memset_a64fx
+		: __memset_generic)));
+# else
 	     : __memset_generic));
+# endif
 
 # undef memset
 strong_alias (__libc_memset, memset);
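For readability, the conditional expression handed to libc_ifunc above
is equivalent to the following hand-expanded C.  This is an illustration
only and not part of the patch: select_memset is a hypothetical name,
and midr/zva_size are the locals that INIT_ARCH () sets up inside the
real resolver in memset.c.

    /* Hand-expanded view of the libc_ifunc selection in memset.c.  */
    static __typeof (__redirect_memset) *
    select_memset (void)
    {
      INIT_ARCH ();

      if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64)
        return __memset_falkor;
    # if HAVE_AARCH64_SVE_ASM
      if (IS_A64FX (midr))		/* the case added by this patch */
        return __memset_a64fx;
    # endif
      return __memset_generic;
    }

The preprocessor guard mirrors the one in the diff: when the assembler
lacks SVE support (HAVE_AARCH64_SVE_ASM is 0), __memset_a64fx is neither
declared nor ever selected.
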
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
new file mode 100644
index 0000000000000000..ce54e5418b08c8bc
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -0,0 +1,268 @@
+/* Optimized memset for Fujitsu A64FX processor.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sysdeps/aarch64/memset-reg.h>
+
+/* Assumptions:
+ *
+ * ARMv8.2-a, AArch64, unaligned accesses, sve
+ *
+ */
+
+#define L1_SIZE		(64*1024)	// L1 64KB
+#define L2_SIZE         (8*1024*1024)	// L2 8MB - 1MB
+#define CACHE_LINE_SIZE	256
+#define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
+#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
+#define rest		x8
+#define vector_length	x9
+#define vl_remainder	x10	// vector_length remainder
+#define cl_remainder	x11	// CACHE_LINE_SIZE remainder
+
+#if HAVE_AARCH64_SVE_ASM
+# if IS_IN (libc)
+#  define MEMSET __memset_a64fx
+
+	.arch armv8.2-a+sve
+
+	.macro dc_zva times
+	dc	zva, tmp1
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	.if \times-1
+	dc_zva "(\times-1)"
+	.endif
+	.endm
+
+	.macro st1b_unroll first=0, last=7
+	st1b	z0.b, p0, [dst, #\first, mul vl]
+	.if \last-\first
+	st1b_unroll "(\first+1)", \last
+	.endif
+	.endm
+
+	.macro shortcut_for_small_size exit
+	// if rest <= vector_length * 2
+	whilelo	p0.b, xzr, count
+	whilelo	p1.b, vector_length, count
+	b.last	1f
+	st1b	z0.b, p0, [dstin, #0, mul vl]
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	ret
+1:	// if rest > vector_length * 8
+	cmp	count, vector_length, lsl 3	// vector_length * 8
+	b.hi	\exit
+	// if rest <= vector_length * 4
+	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p2.b, tmp1, count
+	incb	tmp1
+	whilelo	p3.b, tmp1, count
+	b.last	1f
+	st1b	z0.b, p0, [dstin, #0, mul vl]
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	st1b	z0.b, p2, [dstin, #2, mul vl]
+	st1b	z0.b, p3, [dstin, #3, mul vl]
+	ret
+1:	// if rest <= vector_length * 8
+	lsl	tmp1, vector_length, 2	// vector_length * 4
+	whilelo	p4.b, tmp1, count
+	incb	tmp1
+	whilelo	p5.b, tmp1, count
+	b.last	1f
+	st1b	z0.b, p0, [dstin, #0, mul vl]
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	st1b	z0.b, p2, [dstin, #2, mul vl]
+	st1b	z0.b, p3, [dstin, #3, mul vl]
+	st1b	z0.b, p4, [dstin, #4, mul vl]
+	st1b	z0.b, p5, [dstin, #5, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	incb	tmp1			// vector_length * 5
+	incb	tmp1			// vector_length * 6
+	whilelo	p6.b, tmp1, count
+	incb	tmp1
+	whilelo	p7.b, tmp1, count
+	st1b	z0.b, p0, [dstin, #0, mul vl]
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	st1b	z0.b, p2, [dstin, #2, mul vl]
+	st1b	z0.b, p3, [dstin, #3, mul vl]
+	st1b	z0.b, p4, [dstin, #4, mul vl]
+	st1b	z0.b, p5, [dstin, #5, mul vl]
+	st1b	z0.b, p6, [dstin, #6, mul vl]
+	st1b	z0.b, p7, [dstin, #7, mul vl]
+	ret
+	.endm
+
+ENTRY (MEMSET)
+
+	PTR_ARG (0)
+	SIZE_ARG (2)
+
+	cbnz	count, 1f
+	ret
+1:	dup	z0.b, valw
+	cntb	vector_length
+	// shortcut for less than vector_length * 8
+	// gives a free ptrue to p0.b for n >= vector_length
+	shortcut_for_small_size L(vl_agnostic)
+	// end of shortcut
+
+L(vl_agnostic): // VL Agnostic
+	mov	rest, count
+	mov	dst, dstin
+	add	dstend, dstin, count
+	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
+	mov	tmp1, 64
+	cmp	rest, L2_SIZE
+	ccmp	vector_length, tmp1, 0, cs
+	b.eq	L(L2)
+	// if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch)
+	cmp	rest, L1_SIZE
+	ccmp	vector_length, tmp1, 0, cs
+	b.eq	L(L1_prefetch)
+
+L(unroll32):
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	lsl	tmp2, vector_length, 5	// vector_length * 32
+	.p2align 3
+1:	cmp	rest, tmp2
+	b.cc	L(unroll8)
+	st1b_unroll
+	add	dst, dst, tmp1
+	st1b_unroll
+	add	dst, dst, tmp1
+	st1b_unroll
+	add	dst, dst, tmp1
+	st1b_unroll
+	add	dst, dst, tmp1
+	sub	rest, rest, tmp2
+	b	1b
+
+L(unroll8):
+	lsl	tmp1, vector_length, 3
+	.p2align 3
+1:	cmp	rest, tmp1
+	b.cc	L(last)
+	st1b_unroll
+	add	dst, dst, tmp1
+	sub	rest, rest, tmp1
+	b	1b
+
+L(last):
+	whilelo	p0.b, xzr, rest
+	whilelo	p1.b, vector_length, rest
+	b.last	1f
+	st1b	z0.b, p0, [dst, #0, mul vl]
+	st1b	z0.b, p1, [dst, #1, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p2.b, tmp1, rest
+	incb	tmp1
+	whilelo	p3.b, tmp1, rest
+	b.last	1f
+	st1b	z0.b, p0, [dst, #0, mul vl]
+	st1b	z0.b, p1, [dst, #1, mul vl]
+	st1b	z0.b, p2, [dst, #2, mul vl]
+	st1b	z0.b, p3, [dst, #3, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	whilelo	p4.b, tmp1, rest
+	incb	tmp1
+	whilelo	p5.b, tmp1, rest
+	incb	tmp1
+	whilelo	p6.b, tmp1, rest
+	incb	tmp1
+	whilelo	p7.b, tmp1, rest
+	st1b	z0.b, p0, [dst, #0, mul vl]
+	st1b	z0.b, p1, [dst, #1, mul vl]
+	st1b	z0.b, p2, [dst, #2, mul vl]
+	st1b	z0.b, p3, [dst, #3, mul vl]
+	st1b	z0.b, p4, [dst, #4, mul vl]
+	st1b	z0.b, p5, [dst, #5, mul vl]
+	st1b	z0.b, p6, [dst, #6, mul vl]
+	st1b	z0.b, p7, [dst, #7, mul vl]
+	ret
+
+L(L1_prefetch): // if rest >= L1_SIZE
+	.p2align 3
+1:	st1b_unroll 0, 3
+	prfm	pstl1keep, [dst, PF_DIST_L1]
+	st1b_unroll 4, 7
+	prfm	pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
+	add	dst, dst, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	cmp	rest, L1_SIZE
+	b.ge	1b
+	cbnz	rest, L(unroll32)
+	ret
+
+L(L2):
+	// align dst address at vector_length byte boundary
+	sub	tmp1, vector_length, 1
+	ands	tmp2, dst, tmp1
+	// if vl_remainder == 0
+	b.eq	1f
+	sub	vl_remainder, vector_length, tmp2
+	// process remainder until the first vector_length boundary
+	whilelt	p2.b, xzr, vl_remainder
+	st1b	z0.b, p2, [dst]
+	add	dst, dst, vl_remainder
+	sub	rest, rest, vl_remainder
+	// align dstin address at CACHE_LINE_SIZE byte boundary
+1:	mov	tmp1, CACHE_LINE_SIZE
+	ands	tmp2, dst, CACHE_LINE_SIZE - 1
+	// if cl_remainder == 0
+	b.eq	L(L2_dc_zva)
+	sub	cl_remainder, tmp1, tmp2
+	// process remainder until the first CACHE_LINE_SIZE boundary
+	mov	tmp1, xzr       // index
+2:	whilelt	p2.b, tmp1, cl_remainder
+	st1b	z0.b, p2, [dst, tmp1]
+	incb	tmp1
+	cmp	tmp1, cl_remainder
+	b.lo	2b
+	add	dst, dst, cl_remainder
+	sub	rest, rest, cl_remainder
+
+L(L2_dc_zva):
+	// zero fill
+	mov	tmp1, dst
+	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
+	mov	zva_len, ZF_DIST
+	add	tmp1, zva_len, CACHE_LINE_SIZE * 2
+	// unroll
+	.p2align 3
+1:	st1b_unroll 0, 3
+	add	tmp2, dst, zva_len
+	dc	 zva, tmp2
+	st1b_unroll 4, 7
+	add	tmp2, tmp2, CACHE_LINE_SIZE
+	dc	zva, tmp2
+	add	dst, dst, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	cmp	rest, tmp1	// ZF_DIST + CACHE_LINE_SIZE * 2
+	b.ge	1b
+	cbnz	rest, L(unroll8)
+	ret
+
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
+
+#endif /* IS_IN (libc) */
+#endif /* HAVE_AARCH64_SVE_ASM */
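
A note on the "cache zero fill" technique used in the L(L2) path above:
once dst is aligned to the 256-byte cache line, the loop issues
"dc zva" ZF_DIST bytes ahead of the store stream, so each line is
allocated and zeroed in cache without first being read from memory, and
the predicated stores then overwrite it with the real fill value as dst
catches up.  The following is a minimal, hypothetical C sketch of that
primitive; zva_block_size and zva_zero_block are names invented here,
not glibc APIs, and the code assumes DCZID_EL0 reports DC ZVA as
permitted with a block size matching CACHE_LINE_SIZE (256 bytes on
A64FX):

    #include <stddef.h>
    #include <stdint.h>

    /* Block size cleared by one DC ZVA: 4 << DCZID_EL0.BS bytes.
       Bit 4 (DZP) set means DC ZVA is prohibited.  */
    static inline size_t
    zva_block_size (void)
    {
      uint64_t dczid;
      __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));
      return (dczid & 0x10) ? 0 : (size_t) 4 << (dczid & 0xf);
    }

    /* Zero the ZVA block containing p; p must lie in writable memory,
       as in the aligned middle of a large memset buffer.  */
    static inline void
    zva_zero_block (void *p)
    {
      __asm__ volatile ("dc zva, %0" : : "r" (p) : "memory");
    }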