commit fa527f345cbbe852ec085932fbea979956c195b5
Author: Naohiro Tamura <naohirot@jp.fujitsu.com>
Date:   Thu May 27 07:42:35 2021 +0000

    aarch64: Added optimized memcpy and memmove for A64FX

    This patch optimizes the performance of memcpy/memmove for A64FX [1],
    which implements ARMv8-A SVE and has a 64KB L1 cache per core and an
    8MB L2 cache per NUMA node.

    The optimization makes use of the Scalable Vector Registers with
    several techniques such as loop unrolling, memory access alignment,
    cache zero fill, and software pipelining.

    The SVE assembly code for memcpy/memmove is implemented as Vector
    Length Agnostic code, so in principle it can run on any SoC that
    supports the ARMv8-A SVE standard.

    We confirmed that all test cases pass when running 'make check' and
    'make xcheck', not only on A64FX but also on ThunderX2.

    We also confirmed with 'make bench' that the SVE 512-bit vector
    register implementation is roughly 4 times faster than the Advanced
    SIMD 128-bit implementation and 8 times faster than the scalar
    64-bit implementation.

    [1] https://github.com/fujitsu/A64FX

    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
    Reviewed-by: Szabolcs Nagy <Szabolcs.Nagy@arm.com>

Conflicts:
	manual/tunables.texi
	sysdeps/aarch64/multiarch/Makefile
	sysdeps/aarch64/multiarch/ifunc-impl-list.c
	sysdeps/aarch64/multiarch/init-arch.h
	sysdeps/aarch64/multiarch/memcpy.c
	sysdeps/aarch64/multiarch/memmove.c
	sysdeps/unix/sysv/linux/aarch64/cpu-features.c
	sysdeps/unix/sysv/linux/aarch64/cpu-features.h
	  (all conflicts due to missing optimizations for other CPUs)

diff --git a/manual/tunables.texi b/manual/tunables.texi
index bd737b5d57080462..07887981748bc44b 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -386,7 +386,7 @@ This tunable is specific to powerpc, powerpc64 and powerpc64le.
 The @code{glibc.cpu.name=xxx} tunable allows the user to tell @theglibc{} to
 assume that the CPU is @code{xxx} where xxx may have one of these values:
 @code{generic}, @code{falkor}, @code{thunderxt88}, @code{thunderx2t99},
-@code{thunderx2t99p1}.
+@code{thunderx2t99p1}, @code{a64fx}.
 
 This tunable is specific to aarch64.
 @end deftp
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 57ffdf72382c0a44..5a19ba0308e80983 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,5 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
-		   memcpy_falkor memmove_falkor memset_generic memset_falkor
+		   memcpy_falkor memcpy_a64fx \
+		   memmove_falkor memset_generic memset_falkor
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index e55be80103b948a2..f53db12acce37877 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@
 #include <stdio.h>
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	4
+#define MAX_IFUNC	7
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -42,10 +42,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+#if HAVE_AARCH64_SVE_ASM
+	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+#if HAVE_AARCH64_SVE_ASM
+	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
 	      /* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index d1e5703cb25fdcff..65dc8f82ff23c754 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -22,4 +22,6 @@
   uint64_t __attribute__((unused)) midr =				      \
     GLRO(dl_aarch64_cpu_features).midr_el1;				      \
   unsigned __attribute__((unused)) zva_size =				      \
-    GLRO(dl_aarch64_cpu_features).zva_size;
+    GLRO(dl_aarch64_cpu_features).zva_size;				      \
+  bool __attribute__((unused)) sve =					      \
+    GLRO(dl_aarch64_cpu_features).sve;
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 4a04a63b0fe0c84b..e0313c42e82a7b86 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -32,6 +32,9 @@ extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
+# endif
 
 libc_ifunc (__libc_memcpy,
             (IS_THUNDERX (midr)
@@ -40,8 +43,13 @@ libc_ifunc (__libc_memcpy,
 		? __memcpy_falkor
 		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
 		  ? __memcpy_thunderx2
+# if HAVE_AARCH64_SVE_ASM
+		  : (IS_A64FX (midr)
+		     ? __memcpy_a64fx
+		     : __memcpy_generic)))));
+# else
 		  : __memcpy_generic))));
-
+# endif
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
 #endif
diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
new file mode 100644
index 0000000000000000..65528405bb123737
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
@@ -0,0 +1,406 @@
+/* Optimized memcpy for Fujitsu A64FX processor.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8.2-a, AArch64, unaligned accesses, sve
+ *
+ */
+
+#define L2_SIZE		(8*1024*1024)/2	// L2 8MB/2
+#define CACHE_LINE_SIZE	256
+#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
+#define dest		x0
+#define src		x1
+#define n		x2	// size
+#define tmp1		x3
+#define tmp2		x4
+#define tmp3		x5
+#define rest		x6
+#define dest_ptr	x7
+#define src_ptr		x8
+#define vector_length	x9
+#define cl_remainder	x10	// CACHE_LINE_SIZE remainder
+
+#if HAVE_AARCH64_SVE_ASM
+# if IS_IN (libc)
+#  define MEMCPY __memcpy_a64fx
+#  define MEMMOVE __memmove_a64fx
+
+	.arch armv8.2-a+sve
+
+	.macro dc_zva times
+	dc	zva, tmp1
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	.if \times-1
+	dc_zva "(\times-1)"
+	.endif
+	.endm
+
+	.macro ld1b_unroll8
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p0/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p0/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p0/z, [src_ptr, #3, mul vl]
+	ld1b	z4.b, p0/z, [src_ptr, #4, mul vl]
+	ld1b	z5.b, p0/z, [src_ptr, #5, mul vl]
+	ld1b	z6.b, p0/z, [src_ptr, #6, mul vl]
+	ld1b	z7.b, p0/z, [src_ptr, #7, mul vl]
+	.endm
+
+	.macro stld1b_unroll4a
+	st1b	z0.b, p0,   [dest_ptr, #0, mul vl]
+	st1b	z1.b, p0,   [dest_ptr, #1, mul vl]
+	ld1b	z0.b, p0/z, [src_ptr,  #0, mul vl]
+	ld1b	z1.b, p0/z, [src_ptr,  #1, mul vl]
+	st1b	z2.b, p0,   [dest_ptr, #2, mul vl]
+	st1b	z3.b, p0,   [dest_ptr, #3, mul vl]
+	ld1b	z2.b, p0/z, [src_ptr,  #2, mul vl]
+	ld1b	z3.b, p0/z, [src_ptr,  #3, mul vl]
+	.endm
+
+	.macro stld1b_unroll4b
+	st1b	z4.b, p0,   [dest_ptr, #4, mul vl]
+	st1b	z5.b, p0,   [dest_ptr, #5, mul vl]
+	ld1b	z4.b, p0/z, [src_ptr,  #4, mul vl]
+	ld1b	z5.b, p0/z, [src_ptr,  #5, mul vl]
+	st1b	z6.b, p0,   [dest_ptr, #6, mul vl]
+	st1b	z7.b, p0,   [dest_ptr, #7, mul vl]
+	ld1b	z6.b, p0/z, [src_ptr,  #6, mul vl]
+	ld1b	z7.b, p0/z, [src_ptr,  #7, mul vl]
+	.endm
+
+	.macro stld1b_unroll8
+	stld1b_unroll4a
+	stld1b_unroll4b
+	.endm
+
+	.macro st1b_unroll8
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p0, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p0, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p0, [dest_ptr, #3, mul vl]
+	st1b	z4.b, p0, [dest_ptr, #4, mul vl]
+	st1b	z5.b, p0, [dest_ptr, #5, mul vl]
+	st1b	z6.b, p0, [dest_ptr, #6, mul vl]
+	st1b	z7.b, p0, [dest_ptr, #7, mul vl]
+	.endm
+
+	.macro shortcut_for_small_size exit
+	// if rest <= vector_length * 2
+	whilelo	p0.b, xzr, n
+	whilelo	p1.b, vector_length, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	ret
+1:	// if rest > vector_length * 8
+	cmp	n, vector_length, lsl 3 // vector_length * 8
+	b.hi	\exit
+	// if rest <= vector_length * 4
+	lsl	tmp1, vector_length, 1  // vector_length * 2
+	whilelo	p2.b, tmp1, n
+	incb	tmp1
+	whilelo	p3.b, tmp1, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	ret
+1:	// if rest <= vector_length * 8
+	lsl	tmp1, vector_length, 2  // vector_length * 4
+	whilelo	p4.b, tmp1, n
+	incb	tmp1
+	whilelo	p5.b, tmp1, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	ld1b	z4.b, p4/z, [src, #4, mul vl]
+	ld1b	z5.b, p5/z, [src, #5, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	st1b	z4.b, p4, [dest, #4, mul vl]
+	st1b	z5.b, p5, [dest, #5, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	incb	tmp1			// vector_length * 5
+	incb	tmp1			// vector_length * 6
+	whilelo	p6.b, tmp1, n
+	incb	tmp1
+	whilelo	p7.b, tmp1, n
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	ld1b	z4.b, p4/z, [src, #4, mul vl]
+	ld1b	z5.b, p5/z, [src, #5, mul vl]
+	ld1b	z6.b, p6/z, [src, #6, mul vl]
+	ld1b	z7.b, p7/z, [src, #7, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	st1b	z4.b, p4, [dest, #4, mul vl]
+	st1b	z5.b, p5, [dest, #5, mul vl]
+	st1b	z6.b, p6, [dest, #6, mul vl]
+	st1b	z7.b, p7, [dest, #7, mul vl]
+	ret
+	.endm
+
+ENTRY (MEMCPY)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+L(memcpy):
+	cntb	vector_length
+	// shortcut for less than vector_length * 8
+	// gives a free ptrue to p0.b for n >= vector_length
+	shortcut_for_small_size L(vl_agnostic)
+	// end of shortcut
+
+L(vl_agnostic): // VL Agnostic
+	mov	rest, n
+	mov	dest_ptr, dest
+	mov	src_ptr, src
+	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
+	mov	tmp1, 64
+	cmp	rest, L2_SIZE
+	ccmp	vector_length, tmp1, 0, cs
+	b.eq	L(L2)
+
+L(unroll8): // unrolling and software pipeline
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	.p2align 3
+	cmp	 rest, tmp1
+	b.cc	L(last)
+	ld1b_unroll8
+	add	src_ptr, src_ptr, tmp1
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.cc	2f
+	.p2align 3
+1:	stld1b_unroll8
+	add	dest_ptr, dest_ptr, tmp1
+	add	src_ptr, src_ptr, tmp1
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.ge	1b
+2:	st1b_unroll8
+	add	dest_ptr, dest_ptr, tmp1
+
+	.p2align 3
+L(last):
+	whilelo	p0.b, xzr, rest
+	whilelo	p1.b, vector_length, rest
+	b.last	1f
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p2.b, tmp1, rest
+	incb	tmp1
+	whilelo	p3.b, tmp1, rest
+	b.last	1f
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	whilelo	p4.b, tmp1, rest
+	incb	tmp1
+	whilelo	p5.b, tmp1, rest
+	incb	tmp1
+	whilelo	p6.b, tmp1, rest
+	incb	tmp1
+	whilelo	p7.b, tmp1, rest
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
+	ld1b	z4.b, p4/z, [src_ptr, #4, mul vl]
+	ld1b	z5.b, p5/z, [src_ptr, #5, mul vl]
+	ld1b	z6.b, p6/z, [src_ptr, #6, mul vl]
+	ld1b	z7.b, p7/z, [src_ptr, #7, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+	st1b	z4.b, p4, [dest_ptr, #4, mul vl]
+	st1b	z5.b, p5, [dest_ptr, #5, mul vl]
+	st1b	z6.b, p6, [dest_ptr, #6, mul vl]
+	st1b	z7.b, p7, [dest_ptr, #7, mul vl]
+	ret
+
+L(L2):
+	// align dest address at CACHE_LINE_SIZE byte boundary
+	mov	tmp1, CACHE_LINE_SIZE
+	ands	tmp2, dest_ptr, CACHE_LINE_SIZE - 1
+	// if cl_remainder == 0
+	b.eq	L(L2_dc_zva)
+	sub	cl_remainder, tmp1, tmp2
+	// process remainder until the first CACHE_LINE_SIZE boundary
+	whilelo	p1.b, xzr, cl_remainder	// keep p0.b all true
+	whilelo	p2.b, vector_length, cl_remainder
+	b.last	1f
+	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
+	b	2f
+1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p3.b, tmp1, cl_remainder
+	incb	tmp1
+	whilelo	p4.b, tmp1, cl_remainder
+	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #2, mul vl]
+	ld1b	z4.b, p4/z, [src_ptr, #3, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #2, mul vl]
+	st1b	z4.b, p4, [dest_ptr, #3, mul vl]
+2:	add	dest_ptr, dest_ptr, cl_remainder
+	add	src_ptr, src_ptr, cl_remainder
+	sub	rest, rest, cl_remainder
+
+L(L2_dc_zva):
+	// zero fill
+	and	tmp1, dest, 0xffffffffffffff
+	and	tmp2, src, 0xffffffffffffff
+	subs	tmp1, tmp1, tmp2	// diff
+	b.ge	1f
+	neg	tmp1, tmp1
+1:	mov	tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
+	cmp	tmp1, tmp3
+	b.lo	L(unroll8)
+	mov	tmp1, dest_ptr
+	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
+	// unroll
+	ld1b_unroll8	// this line has to be after "b.lo L(unroll8)"
+	add	 src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+	sub	 rest, rest, CACHE_LINE_SIZE * 2
+	mov	 tmp1, ZF_DIST
+	.p2align 3
+1:	stld1b_unroll4a
+	add	tmp2, dest_ptr, tmp1	// dest_ptr + ZF_DIST
+	dc	zva, tmp2
+	stld1b_unroll4b
+	add	tmp2, tmp2, CACHE_LINE_SIZE
+	dc	zva, tmp2
+	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	cmp	rest, tmp3	// ZF_DIST + CACHE_LINE_SIZE * 2
+	b.ge	1b
+	st1b_unroll8
+	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+	b	L(unroll8)
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+
+ENTRY (MEMMOVE)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	// remove tag address
+	// dest has to be immutable because it is the return value
+	// src has to be immutable because it is used in L(bwd_last)
+	and	tmp2, dest, 0xffffffffffffff	// save dest_notag into tmp2
+	and	tmp3, src, 0xffffffffffffff	// save src_notag into tmp3
+	cmp	n, 0
+	ccmp	tmp2, tmp3, 4, ne
+	b.ne	1f
+	ret
+1:	cntb	vector_length
+	// shortcut for less than vector_length * 8
+	// gives a free ptrue to p0.b for n >= vector_length
+	// tmp2 and tmp3 should not be used in this macro to keep
+	// notag addresses
+	shortcut_for_small_size L(dispatch)
+	// end of shortcut
+
+L(dispatch):
+	// tmp2 = dest_notag, tmp3 = src_notag
+	// diff = dest_notag - src_notag
+	sub	tmp1, tmp2, tmp3
+	// if diff <= 0 || diff >= n then memcpy
+	cmp	tmp1, 0
+	ccmp	tmp1, n, 2, gt
+	b.cs	L(vl_agnostic)
+
+L(bwd_start):
+	mov	rest, n
+	add	dest_ptr, dest, n	// dest_end
+	add	src_ptr, src, n		// src_end
+
+L(bwd_unroll8): // unrolling and software pipeline
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	.p2align 3
+	cmp	rest, tmp1
+	b.cc	L(bwd_last)
+	sub	src_ptr, src_ptr, tmp1
+	ld1b_unroll8
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.cc	2f
+	.p2align 3
+1:	sub	src_ptr, src_ptr, tmp1
+	sub	dest_ptr, dest_ptr, tmp1
+	stld1b_unroll8
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.ge	1b
+2:	sub	dest_ptr, dest_ptr, tmp1
+	st1b_unroll8
+
+L(bwd_last):
+	mov	dest_ptr, dest
+	mov	src_ptr, src
+	b	L(last)
+
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+# endif /* IS_IN (libc) */
+#endif /* HAVE_AARCH64_SVE_ASM */
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index e69d8162910b938e..d96612b9cf7c3a4e 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -31,14 +31,22 @@ extern __typeof (__redirect_memmove) __libc_memmove;
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
+# endif
 
 libc_ifunc (__libc_memmove,
             (IS_THUNDERX (midr)
 	     ? __memmove_thunderx
 	     : (IS_FALKOR (midr) || IS_PHECDA (midr)
		? __memmove_falkor
+# if HAVE_AARCH64_SVE_ASM
+		: (IS_A64FX (midr)
+		   ? __memmove_a64fx
+		   : __memmove_generic))));
+# else
		: __memmove_generic)));
-
+# endif
 # undef memmove
 strong_alias (__libc_memmove, memmove);
 #endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index b4f348509eb1c6b3..71e4355c972f1ffb 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -36,6 +36,7 @@ static struct cpu_list cpu_list[] = {
       {"thunderx2t99",   0x431F0AF0},
       {"thunderx2t99p1", 0x420F5160},
       {"phecda",	 0x680F0000},
+      {"a64fx",		 0x460F0010},
       {"generic", 	 0x0}
 };
 
@@ -80,4 +81,7 @@ init_cpu_features (struct cpu_features *cpu_features)
 
   if ((dczid & DCZID_DZP_MASK) == 0)
     cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
+
+  /* Check if SVE is supported.  */
+  cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
 }
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index eb35adfbe9d429d5..5691aea6de3cb7f4 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -20,6 +20,7 @@
 #define _CPU_FEATURES_AARCH64_H
 
 #include <stdint.h>
+#include <stdbool.h>
 
 #define MIDR_PARTNUM_SHIFT	4
 #define MIDR_PARTNUM_MASK	(0xfff << MIDR_PARTNUM_SHIFT)
@@ -52,10 +53,14 @@
 #define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h'			      \
                        && MIDR_PARTNUM(midr) == 0x000)
 
+#define IS_A64FX(midr) (MIDR_IMPLEMENTOR(midr) == 'F'			      \
+			&& MIDR_PARTNUM(midr) == 0x001)
+
 struct cpu_features
 {
   uint64_t midr_el1;
   unsigned zva_size;
+  bool sve;
 };
 
 #endif /* _CPU_FEATURES_AARCH64_H  */