commit 4f26956d5ba394eb3ade6c1c20b5c16864a00766
Author: Naohiro Tamura <naohirot@jp.fujitsu.com>
Date:   Thu May 27 07:44:12 2021 +0000

    aarch64: Added optimized memset for A64FX

    This patch optimizes the performance of memset for A64FX [1], which
    implements ARMv8-A SVE and has a 64KB L1 cache per core and an 8MB L2
    cache per NUMA node.

    The performance optimization makes use of the Scalable Vector
    Registers with several techniques such as loop unrolling, memory
    access alignment, cache zero fill and prefetch.

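To make the "cache zero fill" and "prefetch" techniques concrete, here is a
minimal C sketch using GCC inline assembly and __builtin_prefetch. It is
illustrative only, not code added by this patch, and the 256-byte DC ZVA
block size is an assumption taken from the A64FX cache line: DC ZVA allocates
and zeroes a whole block in cache without first fetching it from memory, and
the later stores overwrite that block with the real fill value.

#include <stddef.h>

/* Assumed DC ZVA block size; on A64FX the cache line (and ZVA block)
   is taken to be 256 bytes, matching CACHE_LINE_SIZE in the patch.  */
#define BLOCK 256

static inline void
zero_fill_block (char *p)
{
  /* Allocate and zero one block in cache without reading memory.
     Requires DC ZVA to be enabled for EL0 (SCTLR_EL1.DZE).  */
  __asm__ volatile ("dc zva, %0" : : "r" (p) : "memory");
}

static void
fill_blocks (char *dst, int c, size_t nblocks)
{
  for (size_t i = 0; i < nblocks; i++)
    {
      /* Zero-fill well ahead of the store stream ...  */
      if (i + 16 < nblocks)
        zero_fill_block (dst + (i + 16) * BLOCK);
      /* ... prefetch a little ahead for writing ...  */
      if (i + 8 < nblocks)
        __builtin_prefetch (dst + (i + 8) * BLOCK, 1, 3);
      /* ... and finally overwrite the current block with the value.  */
      for (size_t j = 0; j < BLOCK; j++)
        dst[i * BLOCK + j] = (char) c;
    }
}

In the patch itself this corresponds to the dc_zva macro and the DC ZVA
instructions issued ZF_DIST bytes ahead of the st1b stores in the L(L2) path,
and to the PRFM PSTL1KEEP instructions in L(L1_prefetch).
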
    The SVE assembler code for memset is implemented as Vector Length
    Agnostic code, so in principle it can run on any SoC which supports
    the ARMv8-A SVE standard.

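The Vector Length Agnostic property can be illustrated with SVE ACLE
intrinsics. The following is a minimal C sketch (assuming a compiler with
-march=armv8.2-a+sve and arm_sve.h), not the hand-written assembly added by
this patch: the whilelt predicate covers only the lanes that still fall
inside the buffer, so the same code works for any hardware vector length and
needs no scalar tail loop.

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* Vector-length-agnostic memset sketch using SVE ACLE intrinsics.  */
static void *
sve_memset_sketch (void *dst, int c, size_t n)
{
  uint8_t *p = dst;
  svuint8_t v = svdup_n_u8 ((uint8_t) c);   /* splat the fill byte */
  uint64_t vl = svcntb ();                  /* vector length in bytes */

  for (uint64_t i = 0; i < n; i += vl)
    {
      /* Lanes at or beyond n are masked off, so the final partial
         vector is handled by the same predicated store.  */
      svbool_t pg = svwhilelt_b8_u64 (i, (uint64_t) n);
      svst1_u8 (pg, p + i, v);
    }
  return dst;
}

The patch's shortcut_for_small_size macro and the L(last) tail use the same
whilelo/whilelt plus predicated st1b idea, written directly in assembly.
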
    We confirmed that all test cases pass by running 'make check' and
    'make xcheck', not only on A64FX but also on ThunderX2.

    We also confirmed, by running 'make bench', that the SVE 512-bit
    vector register implementation is roughly 4 times faster than the
    Advanced SIMD 128-bit register implementation and 8 times faster
    than the scalar 64-bit register implementation.

    [1] https://github.com/fujitsu/A64FX

    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
    Reviewed-by: Szabolcs Nagy <Szabolcs.Nagy@arm.com>

Conflicts:
	sysdeps/aarch64/multiarch/Makefile
	sysdeps/aarch64/multiarch/ifunc-impl-list.c
	sysdeps/aarch64/multiarch/memset.c
	  (all conflicts due to missing other CPU implementations downstream)

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 5a19ba0308e80983..5ff883a8ad8e3067 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,5 +1,6 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memcpy_a64fx \
-		   memmove_falkor memset_generic memset_falkor
+		   memmove_falkor memset_generic memset_falkor \
+		   memset_a64fx
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index f53db12acce37877..53e3e162a1025e40 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -37,7 +37,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   INIT_ARCH ();
 
-  /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
+  /* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c.  */
   IFUNC_IMPL (i, name, memcpy,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
@@ -57,6 +57,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      /* Enable this on non-falkor processors too so that other cores
 		 can do a comparative analysis with __memset_generic.  */
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
+#if HAVE_AARCH64_SVE_ASM
+	      IFUNC_IMPL_ADD (array, i, memset, sve, __memset_a64fx)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
 
   return i;
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index d74ed3a549a54b10..2c8cc72bb0b18474 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -29,12 +29,21 @@
 extern __typeof (__redirect_memset) __libc_memset;
 
 extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
+# endif
 extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
 
 libc_ifunc (__libc_memset,
 	    ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
 	     ? __memset_falkor
+# if HAVE_AARCH64_SVE_ASM
+	     : (IS_A64FX (midr)
+		? __memset_a64fx
+		: __memset_generic)));
+# else
 	     : __memset_generic));
+# endif
 
 # undef memset
 strong_alias (__libc_memset, memset);
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
new file mode 100644
index 0000000000000000..ce54e5418b08c8bc
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -0,0 +1,268 @@
+/* Optimized memset for Fujitsu A64FX processor.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sysdeps/aarch64/memset-reg.h>
+
+/* Assumptions:
+ *
+ * ARMv8.2-a, AArch64, unaligned accesses, sve
+ *
+ */
+
+#define L1_SIZE		(64*1024)	// L1 64KB
+#define L2_SIZE         (8*1024*1024)	// L2 8MB - 1MB
+#define CACHE_LINE_SIZE	256
+#define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
+#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
+#define rest		x8
+#define vector_length	x9
+#define vl_remainder	x10	// vector_length remainder
+#define cl_remainder	x11	// CACHE_LINE_SIZE remainder
+
+#if HAVE_AARCH64_SVE_ASM
+# if IS_IN (libc)
+#  define MEMSET __memset_a64fx
+
+	.arch armv8.2-a+sve
+
+	.macro dc_zva times
+	dc	zva, tmp1
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	.if \times-1
+	dc_zva "(\times-1)"
+	.endif
+	.endm
+
+	.macro st1b_unroll first=0, last=7
+	st1b	z0.b, p0, [dst, #\first, mul vl]
+	.if \last-\first
+	st1b_unroll "(\first+1)", \last
+	.endif
+	.endm
+
+	.macro shortcut_for_small_size exit
+	// if rest <= vector_length * 2
+	whilelo	p0.b, xzr, count
+	whilelo	p1.b, vector_length, count
+	b.last	1f
+	st1b	z0.b, p0, [dstin, #0, mul vl]
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	ret
+1:	// if rest > vector_length * 8
+	cmp	count, vector_length, lsl 3	// vector_length * 8
+	b.hi	\exit
+	// if rest <= vector_length * 4
+	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p2.b, tmp1, count
+	incb	tmp1
+	whilelo	p3.b, tmp1, count
+	b.last	1f
+	st1b	z0.b, p0, [dstin, #0, mul vl]
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	st1b	z0.b, p2, [dstin, #2, mul vl]
+	st1b	z0.b, p3, [dstin, #3, mul vl]
+	ret
+1:	// if rest <= vector_length * 8
+	lsl	tmp1, vector_length, 2	// vector_length * 4
+	whilelo	p4.b, tmp1, count
+	incb	tmp1
+	whilelo	p5.b, tmp1, count
+	b.last	1f
+	st1b	z0.b, p0, [dstin, #0, mul vl]
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	st1b	z0.b, p2, [dstin, #2, mul vl]
+	st1b	z0.b, p3, [dstin, #3, mul vl]
+	st1b	z0.b, p4, [dstin, #4, mul vl]
+	st1b	z0.b, p5, [dstin, #5, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	incb	tmp1			// vector_length * 5
+	incb	tmp1			// vector_length * 6
+	whilelo	p6.b, tmp1, count
+	incb	tmp1
+	whilelo	p7.b, tmp1, count
+	st1b	z0.b, p0, [dstin, #0, mul vl]
+	st1b	z0.b, p1, [dstin, #1, mul vl]
+	st1b	z0.b, p2, [dstin, #2, mul vl]
+	st1b	z0.b, p3, [dstin, #3, mul vl]
+	st1b	z0.b, p4, [dstin, #4, mul vl]
+	st1b	z0.b, p5, [dstin, #5, mul vl]
+	st1b	z0.b, p6, [dstin, #6, mul vl]
+	st1b	z0.b, p7, [dstin, #7, mul vl]
+	ret
+	.endm
+
+ENTRY (MEMSET)
+
+	PTR_ARG (0)
+	SIZE_ARG (2)
+
+	cbnz	count, 1f
+	ret
+1:	dup	z0.b, valw
+	cntb	vector_length
+	// shortcut for less than vector_length * 8
+	// gives a free ptrue to p0.b for n >= vector_length
+	shortcut_for_small_size L(vl_agnostic)
+	// end of shortcut
+
+L(vl_agnostic): // VL Agnostic
+	mov	rest, count
+	mov	dst, dstin
+	add	dstend, dstin, count
+	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
+	mov	tmp1, 64
+	cmp	rest, L2_SIZE
+	ccmp	vector_length, tmp1, 0, cs
+	b.eq	L(L2)
+	// if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch)
+	cmp	rest, L1_SIZE
+	ccmp	vector_length, tmp1, 0, cs
+	b.eq	L(L1_prefetch)
+
+L(unroll32):
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	lsl	tmp2, vector_length, 5	// vector_length * 32
+	.p2align 3
+1:	cmp	rest, tmp2
+	b.cc	L(unroll8)
+	st1b_unroll
+	add	dst, dst, tmp1
+	st1b_unroll
+	add	dst, dst, tmp1
+	st1b_unroll
+	add	dst, dst, tmp1
+	st1b_unroll
+	add	dst, dst, tmp1
+	sub	rest, rest, tmp2
+	b	1b
+
+L(unroll8):
+	lsl	tmp1, vector_length, 3
+	.p2align 3
+1:	cmp	rest, tmp1
+	b.cc	L(last)
+	st1b_unroll
+	add	dst, dst, tmp1
+	sub	rest, rest, tmp1
+	b	1b
+
+L(last):
+	whilelo	p0.b, xzr, rest
+	whilelo	p1.b, vector_length, rest
+	b.last	1f
+	st1b	z0.b, p0, [dst, #0, mul vl]
+	st1b	z0.b, p1, [dst, #1, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p2.b, tmp1, rest
+	incb	tmp1
+	whilelo	p3.b, tmp1, rest
+	b.last	1f
+	st1b	z0.b, p0, [dst, #0, mul vl]
+	st1b	z0.b, p1, [dst, #1, mul vl]
+	st1b	z0.b, p2, [dst, #2, mul vl]
+	st1b	z0.b, p3, [dst, #3, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	whilelo	p4.b, tmp1, rest
+	incb	tmp1
+	whilelo	p5.b, tmp1, rest
+	incb	tmp1
+	whilelo	p6.b, tmp1, rest
+	incb	tmp1
+	whilelo	p7.b, tmp1, rest
+	st1b	z0.b, p0, [dst, #0, mul vl]
+	st1b	z0.b, p1, [dst, #1, mul vl]
+	st1b	z0.b, p2, [dst, #2, mul vl]
+	st1b	z0.b, p3, [dst, #3, mul vl]
+	st1b	z0.b, p4, [dst, #4, mul vl]
+	st1b	z0.b, p5, [dst, #5, mul vl]
+	st1b	z0.b, p6, [dst, #6, mul vl]
+	st1b	z0.b, p7, [dst, #7, mul vl]
+	ret
+
+L(L1_prefetch): // if rest >= L1_SIZE
+	.p2align 3
+1:	st1b_unroll 0, 3
+	prfm	pstl1keep, [dst, PF_DIST_L1]
+	st1b_unroll 4, 7
+	prfm	pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
+	add	dst, dst, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	cmp	rest, L1_SIZE
+	b.ge	1b
+	cbnz	rest, L(unroll32)
+	ret
+
+L(L2):
+	// align dst address at vector_length byte boundary
+	sub	tmp1, vector_length, 1
+	ands	tmp2, dst, tmp1
+	// if vl_remainder == 0
+	b.eq	1f
+	sub	vl_remainder, vector_length, tmp2
+	// process remainder until the first vector_length boundary
+	whilelt	p2.b, xzr, vl_remainder
+	st1b	z0.b, p2, [dst]
+	add	dst, dst, vl_remainder
+	sub	rest, rest, vl_remainder
+	// align dstin address at CACHE_LINE_SIZE byte boundary
+1:	mov	tmp1, CACHE_LINE_SIZE
+	ands	tmp2, dst, CACHE_LINE_SIZE - 1
+	// if cl_remainder == 0
+	b.eq	L(L2_dc_zva)
+	sub	cl_remainder, tmp1, tmp2
+	// process remainder until the first CACHE_LINE_SIZE boundary
+	mov	tmp1, xzr       // index
+2:	whilelt	p2.b, tmp1, cl_remainder
+	st1b	z0.b, p2, [dst, tmp1]
+	incb	tmp1
+	cmp	tmp1, cl_remainder
+	b.lo	2b
+	add	dst, dst, cl_remainder
+	sub	rest, rest, cl_remainder
+
+L(L2_dc_zva):
+	// zero fill
+	mov	tmp1, dst
+	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
+	mov	zva_len, ZF_DIST
+	add	tmp1, zva_len, CACHE_LINE_SIZE * 2
+	// unroll
+	.p2align 3
+1:	st1b_unroll 0, 3
+	add	tmp2, dst, zva_len
+	dc	 zva, tmp2
+	st1b_unroll 4, 7
+	add	tmp2, tmp2, CACHE_LINE_SIZE
+	dc	zva, tmp2
+	add	dst, dst, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	cmp	rest, tmp1	// ZF_DIST + CACHE_LINE_SIZE * 2
+	b.ge	1b
+	cbnz	rest, L(unroll8)
+	ret
+
+END (MEMSET)
+libc_hidden_builtin_def (MEMSET)
+
+#endif /* IS_IN (libc) */
+#endif /* HAVE_AARCH64_SVE_ASM */