commit fa527f345cbbe852ec085932fbea979956c195b5
Author: Naohiro Tamura <naohirot@jp.fujitsu.com>
Date:   Thu May 27 07:42:35 2021 +0000

    aarch64: Added optimized memcpy and memmove for A64FX

    This patch optimizes the performance of memcpy/memmove for A64FX [1],
    which implements the ARMv8-A SVE extension and has a 64 KB L1 cache
    per core and an 8 MB L2 cache per NUMA node.

    The optimization makes use of the Scalable Vector Registers together
    with several techniques such as loop unrolling, memory access
    alignment, cache zero fill, and software pipelining.

    The SVE assembly code for memcpy/memmove is implemented as Vector
    Length Agnostic (VLA) code, so in principle it can run on any SoC
    that supports the ARMv8-A SVE standard.

    We confirmed that all test cases pass by running 'make check' and
    'make xcheck', not only on A64FX but also on ThunderX2.

    We also confirmed by running 'make bench' that the SVE 512-bit vector
    register performance is roughly 4 times that of Advanced SIMD 128-bit
    registers and 8 times that of scalar 64-bit registers.

    [1] https://github.com/fujitsu/A64FX

    Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
    Reviewed-by: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
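
As a minimal sketch of the Vector Length Agnostic approach described above
(not part of this patch; assumes an SVE-enabled toolchain with <arm_sve.h>,
e.g. -march=armv8.2-a+sve, and the hypothetical helper name vla_copy):

    #include <arm_sve.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Predicated VLA copy: svcntb() reads the hardware vector length at
       run time and the svwhilelt predicate covers at most one vector of
       the remaining bytes, so the same binary runs unchanged on any SVE
       vector width.  Illustrative sketch, not the glibc routine.  */
    static void
    vla_copy (uint8_t *dst, const uint8_t *src, size_t n)
    {
      for (size_t i = 0; i < n; i += svcntb ())
        {
          svbool_t pg = svwhilelt_b8_u64 (i, n);
          svst1_u8 (pg, dst + i, svld1_u8 (pg, src + i));
        }
    }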

Conflicts:
	manual/tunables.texi
	sysdeps/aarch64/multiarch/Makefile
	sysdeps/aarch64/multiarch/ifunc-impl-list.c
	sysdeps/aarch64/multiarch/init-arch.h
	sysdeps/aarch64/multiarch/memcpy.c
	sysdeps/aarch64/multiarch/memmove.c
	sysdeps/unix/sysv/linux/aarch64/cpu-features.c
	sysdeps/unix/sysv/linux/aarch64/cpu-features.h
	  (all conflicts due to missing optimizations for other CPUs)

diff --git a/manual/tunables.texi b/manual/tunables.texi
index bd737b5d57080462..07887981748bc44b 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -386,7 +386,7 @@ This tunable is specific to powerpc, powerpc64 and powerpc64le.
 The @code{glibc.cpu.name=xxx} tunable allows the user to tell @theglibc{} to
 assume that the CPU is @code{xxx} where xxx may have one of these values:
 @code{generic}, @code{falkor}, @code{thunderxt88}, @code{thunderx2t99},
-@code{thunderx2t99p1}.
+@code{thunderx2t99p1}, @code{a64fx}.
 
 This tunable is specific to aarch64.
 @end deftp
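For reference, this tunable is read from the GLIBC_TUNABLES environment
variable at process startup, so on a matching system the new code paths can
be forced with, for example, GLIBC_TUNABLES=glibc.cpu.name=a64fx.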
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 57ffdf72382c0a44..5a19ba0308e80983 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,5 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
-		   memcpy_falkor memmove_falkor memset_generic memset_falkor
+		   memcpy_falkor memcpy_a64fx \
+		   memmove_falkor memset_generic memset_falkor
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index e55be80103b948a2..f53db12acce37877 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@
 #include <stdio.h>
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	4
+#define MAX_IFUNC	7
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -42,10 +42,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+#if HAVE_AARCH64_SVE_ASM
+	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+#if HAVE_AARCH64_SVE_ASM
+	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
+#endif
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
 	      /* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index d1e5703cb25fdcff..65dc8f82ff23c754 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -22,4 +22,6 @@
   uint64_t __attribute__((unused)) midr =				      \
     GLRO(dl_aarch64_cpu_features).midr_el1;				      \
   unsigned __attribute__((unused)) zva_size =				      \
-    GLRO(dl_aarch64_cpu_features).zva_size;
+    GLRO(dl_aarch64_cpu_features).zva_size;				      \
+  bool __attribute__((unused)) sve =					      \
+    GLRO(dl_aarch64_cpu_features).sve;
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 4a04a63b0fe0c84b..e0313c42e82a7b86 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -32,6 +32,9 @@ extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
+# endif
 
 libc_ifunc (__libc_memcpy,
             (IS_THUNDERX (midr)
@@ -40,8 +43,13 @@ libc_ifunc (__libc_memcpy,
 		? __memcpy_falkor
 		: (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
 		  ? __memcpy_thunderx2
+# if HAVE_AARCH64_SVE_ASM
+		  : (IS_A64FX (midr)
+		     ? __memcpy_a64fx
+		     : __memcpy_generic)))));
+# else
 		  : __memcpy_generic))));
-
+# endif
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
 #endif
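The nested ternary with interleaved preprocessor conditionals is compact but
hard to scan; read as plain C, the selection order is as follows (a
hypothetical restatement assuming HAVE_AARCH64_SVE_ASM is defined;
select_memcpy is an illustrative name, not code from the patch):

    /* Mirrors the libc_ifunc expression above; the IS_* macros and
       __memcpy_* symbols are the glibc-internal names it uses.  */
    static __typeof (memcpy) *
    select_memcpy (uint64_t midr)
    {
      if (IS_THUNDERX (midr))
        return __memcpy_thunderx;
      if (IS_FALKOR (midr) || IS_PHECDA (midr))
        return __memcpy_falkor;
      if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
        return __memcpy_thunderx2;
      if (IS_A64FX (midr))
        return __memcpy_a64fx;
      return __memcpy_generic;
    }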
diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
new file mode 100644
index 0000000000000000..65528405bb123737
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
@@ -0,0 +1,406 @@
+/* Optimized memcpy for Fujitsu A64FX processor.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8.2-a, AArch64, unaligned accesses, sve
+ *
+ */
+
+#define L2_SIZE		(8*1024*1024)/2	// L2 8MB/2
+#define CACHE_LINE_SIZE	256
+#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
+#define dest		x0
+#define src		x1
+#define n		x2	// size
+#define tmp1		x3
+#define tmp2		x4
+#define tmp3		x5
+#define rest		x6
+#define dest_ptr	x7
+#define src_ptr		x8
+#define vector_length	x9
+#define cl_remainder	x10	// CACHE_LINE_SIZE remainder
+
+#if HAVE_AARCH64_SVE_ASM
+# if IS_IN (libc)
+#  define MEMCPY __memcpy_a64fx
+#  define MEMMOVE __memmove_a64fx
+
+	.arch armv8.2-a+sve
+
+	.macro dc_zva times
+	dc	zva, tmp1
+	add	tmp1, tmp1, CACHE_LINE_SIZE
+	.if \times-1
+	dc_zva "(\times-1)"
+	.endif
+	.endm
+
+	.macro ld1b_unroll8
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p0/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p0/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p0/z, [src_ptr, #3, mul vl]
+	ld1b	z4.b, p0/z, [src_ptr, #4, mul vl]
+	ld1b	z5.b, p0/z, [src_ptr, #5, mul vl]
+	ld1b	z6.b, p0/z, [src_ptr, #6, mul vl]
+	ld1b	z7.b, p0/z, [src_ptr, #7, mul vl]
+	.endm
+
+	.macro stld1b_unroll4a
+	st1b	z0.b, p0,   [dest_ptr, #0, mul vl]
+	st1b	z1.b, p0,   [dest_ptr, #1, mul vl]
+	ld1b	z0.b, p0/z, [src_ptr,  #0, mul vl]
+	ld1b	z1.b, p0/z, [src_ptr,  #1, mul vl]
+	st1b	z2.b, p0,   [dest_ptr, #2, mul vl]
+	st1b	z3.b, p0,   [dest_ptr, #3, mul vl]
+	ld1b	z2.b, p0/z, [src_ptr,  #2, mul vl]
+	ld1b	z3.b, p0/z, [src_ptr,  #3, mul vl]
+	.endm
+
+	.macro stld1b_unroll4b
+	st1b	z4.b, p0,   [dest_ptr, #4, mul vl]
+	st1b	z5.b, p0,   [dest_ptr, #5, mul vl]
+	ld1b	z4.b, p0/z, [src_ptr,  #4, mul vl]
+	ld1b	z5.b, p0/z, [src_ptr,  #5, mul vl]
+	st1b	z6.b, p0,   [dest_ptr, #6, mul vl]
+	st1b	z7.b, p0,   [dest_ptr, #7, mul vl]
+	ld1b	z6.b, p0/z, [src_ptr,  #6, mul vl]
+	ld1b	z7.b, p0/z, [src_ptr,  #7, mul vl]
+	.endm
+
+	.macro stld1b_unroll8
+	stld1b_unroll4a
+	stld1b_unroll4b
+	.endm
+
+	.macro st1b_unroll8
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p0, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p0, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p0, [dest_ptr, #3, mul vl]
+	st1b	z4.b, p0, [dest_ptr, #4, mul vl]
+	st1b	z5.b, p0, [dest_ptr, #5, mul vl]
+	st1b	z6.b, p0, [dest_ptr, #6, mul vl]
+	st1b	z7.b, p0, [dest_ptr, #7, mul vl]
+	.endm
+
+	.macro shortcut_for_small_size exit
+	// if rest <= vector_length * 2
+	whilelo	p0.b, xzr, n
+	whilelo	p1.b, vector_length, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	ret
+1:	// if rest > vector_length * 8
+	cmp	n, vector_length, lsl 3 // vector_length * 8
+	b.hi	\exit
+	// if rest <= vector_length * 4
+	lsl	tmp1, vector_length, 1  // vector_length * 2
+	whilelo	p2.b, tmp1, n
+	incb	tmp1
+	whilelo	p3.b, tmp1, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	ret
+1:	// if rest <= vector_length * 8
+	lsl	tmp1, vector_length, 2  // vector_length * 4
+	whilelo	p4.b, tmp1, n
+	incb	tmp1
+	whilelo	p5.b, tmp1, n
+	b.last	1f
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	ld1b	z4.b, p4/z, [src, #4, mul vl]
+	ld1b	z5.b, p5/z, [src, #5, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	st1b	z4.b, p4, [dest, #4, mul vl]
+	st1b	z5.b, p5, [dest, #5, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	incb	tmp1			// vector_length * 5
+	incb	tmp1			// vector_length * 6
+	whilelo	p6.b, tmp1, n
+	incb	tmp1
+	whilelo	p7.b, tmp1, n
+	ld1b	z0.b, p0/z, [src, #0, mul vl]
+	ld1b	z1.b, p1/z, [src, #1, mul vl]
+	ld1b	z2.b, p2/z, [src, #2, mul vl]
+	ld1b	z3.b, p3/z, [src, #3, mul vl]
+	ld1b	z4.b, p4/z, [src, #4, mul vl]
+	ld1b	z5.b, p5/z, [src, #5, mul vl]
+	ld1b	z6.b, p6/z, [src, #6, mul vl]
+	ld1b	z7.b, p7/z, [src, #7, mul vl]
+	st1b	z0.b, p0, [dest, #0, mul vl]
+	st1b	z1.b, p1, [dest, #1, mul vl]
+	st1b	z2.b, p2, [dest, #2, mul vl]
+	st1b	z3.b, p3, [dest, #3, mul vl]
+	st1b	z4.b, p4, [dest, #4, mul vl]
+	st1b	z5.b, p5, [dest, #5, mul vl]
+	st1b	z6.b, p6, [dest, #6, mul vl]
+	st1b	z7.b, p7, [dest, #7, mul vl]
+	ret
+	.endm
+
+ENTRY (MEMCPY)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+L(memcpy):
+	cntb	vector_length
+	// shortcut for less than vector_length * 8
+	// gives a free ptrue to p0.b for n >= vector_length
+	shortcut_for_small_size L(vl_agnostic)
+	// end of shortcut
+
+L(vl_agnostic): // VL Agnostic
+	mov	rest, n
+	mov	dest_ptr, dest
+	mov	src_ptr, src
+	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
+	mov	tmp1, 64
+	cmp	rest, L2_SIZE
+	ccmp	vector_length, tmp1, 0, cs
+	b.eq	L(L2)
+
+L(unroll8): // unrolling and software pipeline
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	.p2align 3
+	cmp	rest, tmp1
+	b.cc	L(last)
+	ld1b_unroll8
+	add	src_ptr, src_ptr, tmp1
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.cc	2f
+	.p2align 3
+1:	stld1b_unroll8
+	add	dest_ptr, dest_ptr, tmp1
+	add	src_ptr, src_ptr, tmp1
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.ge	1b
+2:	st1b_unroll8
+	add	dest_ptr, dest_ptr, tmp1
+
+	.p2align 3
+L(last):
+	whilelo	p0.b, xzr, rest
+	whilelo	p1.b, vector_length, rest
+	b.last	1f
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p2.b, tmp1, rest
+	incb	tmp1
+	whilelo	p3.b, tmp1, rest
+	b.last	1f
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+	ret
+1:	lsl	tmp1, vector_length, 2	// vector_length * 4
+	whilelo	p4.b, tmp1, rest
+	incb	tmp1
+	whilelo	p5.b, tmp1, rest
+	incb	tmp1
+	whilelo	p6.b, tmp1, rest
+	incb	tmp1
+	whilelo	p7.b, tmp1, rest
+	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
+	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
+	ld1b	z4.b, p4/z, [src_ptr, #4, mul vl]
+	ld1b	z5.b, p5/z, [src_ptr, #5, mul vl]
+	ld1b	z6.b, p6/z, [src_ptr, #6, mul vl]
+	ld1b	z7.b, p7/z, [src_ptr, #7, mul vl]
+	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+	st1b	z4.b, p4, [dest_ptr, #4, mul vl]
+	st1b	z5.b, p5, [dest_ptr, #5, mul vl]
+	st1b	z6.b, p6, [dest_ptr, #6, mul vl]
+	st1b	z7.b, p7, [dest_ptr, #7, mul vl]
+	ret
+
+L(L2):
+	// align dest address at CACHE_LINE_SIZE byte boundary
+	mov	tmp1, CACHE_LINE_SIZE
+	ands	tmp2, dest_ptr, CACHE_LINE_SIZE - 1
+	// if cl_remainder == 0
+	b.eq	L(L2_dc_zva)
+	sub	cl_remainder, tmp1, tmp2
+	// process remainder until the first CACHE_LINE_SIZE boundary
+	whilelo	p1.b, xzr, cl_remainder	// keep p0.b all true
+	whilelo	p2.b, vector_length, cl_remainder
+	b.last	1f
+	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
+	b	2f
+1:	lsl	tmp1, vector_length, 1	// vector_length * 2
+	whilelo	p3.b, tmp1, cl_remainder
+	incb	tmp1
+	whilelo	p4.b, tmp1, cl_remainder
+	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
+	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
+	ld1b	z3.b, p3/z, [src_ptr, #2, mul vl]
+	ld1b	z4.b, p4/z, [src_ptr, #3, mul vl]
+	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
+	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
+	st1b	z3.b, p3, [dest_ptr, #2, mul vl]
+	st1b	z4.b, p4, [dest_ptr, #3, mul vl]
+2:	add	dest_ptr, dest_ptr, cl_remainder
+	add	src_ptr, src_ptr, cl_remainder
+	sub	rest, rest, cl_remainder
+
+L(L2_dc_zva):
+	// zero fill
+	and	tmp1, dest, 0xffffffffffffff
+	and	tmp2, src, 0xffffffffffffff
+	subs	tmp1, tmp1, tmp2	// diff
+	b.ge	1f
+	neg	tmp1, tmp1
+1:	mov	tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
+	cmp	tmp1, tmp3
+	b.lo	L(unroll8)
+	mov	tmp1, dest_ptr
+	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
+	// unroll
+	ld1b_unroll8	// this line has to be after "b.lo L(unroll8)"
+	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	mov	tmp1, ZF_DIST
+	.p2align 3
+1:	stld1b_unroll4a
+	add	tmp2, dest_ptr, tmp1	// dest_ptr + ZF_DIST
+	dc	zva, tmp2
+	stld1b_unroll4b
+	add	tmp2, tmp2, CACHE_LINE_SIZE
+	dc	zva, tmp2
+	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
+	sub	rest, rest, CACHE_LINE_SIZE * 2
+	cmp	rest, tmp3	// ZF_DIST + CACHE_LINE_SIZE * 2
+	b.ge	1b
+	st1b_unroll8
+	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
+	b	L(unroll8)
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
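A note on L(L2_dc_zva): for very large copies, "dc zva" pre-zeroes
destination cache lines ZF_DIST bytes ahead of the store stream, so lines
that will be fully overwritten are allocated in cache without being fetched
from memory. The guard at the top of the block skips this path when src is
within the zerofill window of dest, since the zeroing could otherwise
clobber not-yet-read source bytes. A hypothetical C restatement of that
guard (zerofill_is_safe is an illustrative name, not code from the patch):

    #include <stdint.h>

    #define CACHE_LINE_SIZE 256
    #define ZF_DIST (CACHE_LINE_SIZE * 21)

    /* Mirrors the subs/neg/cmp/b.lo sequence above: only use dc zva
       when |dest - src| >= ZF_DIST + 2 * CACHE_LINE_SIZE.  */
    static int
    zerofill_is_safe (uintptr_t dest, uintptr_t src)
    {
      uintptr_t diff = dest > src ? dest - src : src - dest;
      return diff >= ZF_DIST + CACHE_LINE_SIZE * 2;
    }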
+
+
+ENTRY (MEMMOVE)
+
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	// remove tag address
+	// dest has to be immutable because it is the return value
+	// src has to be immutable because it is used in L(bwd_last)
+	and	tmp2, dest, 0xffffffffffffff	// save dest_notag into tmp2
+	and	tmp3, src, 0xffffffffffffff	// save src_notag into tmp3
+	cmp	n, 0
+	ccmp	tmp2, tmp3, 4, ne
+	b.ne	1f
+	ret
+1:	cntb	vector_length
+	// shortcut for less than vector_length * 8
+	// gives a free ptrue to p0.b for n >= vector_length
+	// tmp2 and tmp3 should not be used in this macro to keep
+	// notag addresses
+	shortcut_for_small_size L(dispatch)
+	// end of shortcut
+
+L(dispatch):
+	// tmp2 = dest_notag, tmp3 = src_notag
+	// diff = dest_notag - src_notag
+	sub	tmp1, tmp2, tmp3
+	// if diff <= 0 || diff >= n then memcpy
+	cmp	tmp1, 0
+	ccmp	tmp1, n, 2, gt
+	b.cs	L(vl_agnostic)
+
+L(bwd_start):
+	mov	rest, n
+	add	dest_ptr, dest, n	// dest_end
+	add	src_ptr, src, n		// src_end
+
+L(bwd_unroll8): // unrolling and software pipeline
+	lsl	tmp1, vector_length, 3	// vector_length * 8
+	.p2align 3
+	cmp	rest, tmp1
+	b.cc	L(bwd_last)
+	sub	src_ptr, src_ptr, tmp1
+	ld1b_unroll8
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.cc	2f
+	.p2align 3
+1:	sub	src_ptr, src_ptr, tmp1
+	sub	dest_ptr, dest_ptr, tmp1
+	stld1b_unroll8
+	sub	rest, rest, tmp1
+	cmp	rest, tmp1
+	b.ge	1b
+2:	sub	dest_ptr, dest_ptr, tmp1
+	st1b_unroll8
+
+L(bwd_last):
+	mov	dest_ptr, dest
+	mov	src_ptr, src
+	b	L(last)
+
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+# endif /* IS_IN (libc) */
+#endif /* HAVE_AARCH64_SVE_ASM */
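The L(dispatch) logic above falls through to the forward (memcpy) path
unless the destination starts strictly inside the source region. A
hypothetical C restatement (needs_backward_copy is an illustrative name,
not code from the patch):

    #include <stddef.h>
    #include <stdint.h>

    /* Backward copy is needed only when dest lies in (src, src + n).
       When dest <= src the unsigned difference is zero or wraps to a
       huge value, so it fails the "diff < n" test and the forward path
       is taken, matching the cmp/ccmp/b.cs sequence in the assembly.  */
    static int
    needs_backward_copy (uintptr_t dest, uintptr_t src, size_t n)
    {
      uintptr_t diff = dest - src;
      return diff != 0 && diff < n;
    }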
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index e69d8162910b938e..d96612b9cf7c3a4e 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -31,14 +31,22 @@ extern __typeof (__redirect_memmove) __libc_memmove;
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+# if HAVE_AARCH64_SVE_ASM
+extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
+# endif
 
 libc_ifunc (__libc_memmove,
             (IS_THUNDERX (midr)
 	     ? __memmove_thunderx
 	     : (IS_FALKOR (midr) || IS_PHECDA (midr)
 		? __memmove_falkor
+# if HAVE_AARCH64_SVE_ASM
+		: (IS_A64FX (midr)
+		   ? __memmove_a64fx
+		   : __memmove_generic))));
+# else
 		: __memmove_generic)));
-
+# endif
 # undef memmove
 strong_alias (__libc_memmove, memmove);
 #endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index b4f348509eb1c6b3..71e4355c972f1ffb 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -36,6 +36,7 @@ static struct cpu_list cpu_list[] = {
       {"thunderx2t99",   0x431F0AF0},
       {"thunderx2t99p1", 0x420F5160},
       {"phecda",	 0x680F0000},
+      {"a64fx",		 0x460F0010},
       {"generic", 	 0x0}
 };
 
@@ -80,4 +81,7 @@ init_cpu_features (struct cpu_features *cpu_features)
 
   if ((dczid & DCZID_DZP_MASK) == 0)
     cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
+
+  /* Check if SVE is supported.  */
+  cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
 }
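HWCAP_SVE is reported by the kernel through the auxiliary vector; outside
the dynamic loader the same probe can be written as the following sketch
(aarch64-only; have_sve is an illustrative name, not code from the patch):

    #include <stdbool.h>
    #include <sys/auxv.h>
    #ifndef HWCAP_SVE
    # define HWCAP_SVE (1 << 22)	/* aarch64 bit from <asm/hwcap.h>.  */
    #endif

    /* True when the kernel reports SVE support for this process.  */
    static bool
    have_sve (void)
    {
      return (getauxval (AT_HWCAP) & HWCAP_SVE) != 0;
    }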
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index eb35adfbe9d429d5..5691aea6de3cb7f4 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -20,6 +20,7 @@
 #define _CPU_FEATURES_AARCH64_H
 
 #include <stdint.h>
+#include <stdbool.h>
 
 #define MIDR_PARTNUM_SHIFT	4
 #define MIDR_PARTNUM_MASK	(0xfff << MIDR_PARTNUM_SHIFT)
@@ -52,10 +53,14 @@
 #define IS_PHECDA(midr) (MIDR_IMPLEMENTOR(midr) == 'h'			      \
                         && MIDR_PARTNUM(midr) == 0x000)
 
+#define IS_A64FX(midr) (MIDR_IMPLEMENTOR(midr) == 'F'			      \
+			&& MIDR_PARTNUM(midr) == 0x001)
+
 struct cpu_features
 {
   uint64_t midr_el1;
   unsigned zva_size;
+  bool sve;
 };
 
 #endif /* _CPU_FEATURES_AARCH64_H  */
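
As a sanity check, the 0x460F0010 value added to cpu_list in cpu-features.c
matches IS_A64FX: implementer 'F' (0x46, Fujitsu) and part number 0x001. A
small standalone sketch (the MIDR_IMPLEMENTOR macros below are local copies
assumed to mirror the definitions that sit alongside MIDR_PARTNUM in
cpu-features.h):

    #include <assert.h>
    #include <stdint.h>

    #define MIDR_PARTNUM_SHIFT	4
    #define MIDR_PARTNUM_MASK	(0xfff << MIDR_PARTNUM_SHIFT)
    #define MIDR_PARTNUM(midr) \
      (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT)
    #define MIDR_IMPLEMENTOR_SHIFT	24
    #define MIDR_IMPLEMENTOR_MASK	(0xffUL << MIDR_IMPLEMENTOR_SHIFT)
    #define MIDR_IMPLEMENTOR(midr) \
      (((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT)

    int
    main (void)
    {
      uint64_t midr = 0x460F0010;	/* "a64fx" entry in cpu_list.  */
      assert (MIDR_IMPLEMENTOR (midr) == 'F');	/* 0x46, Fujitsu.  */
      assert (MIDR_PARTNUM (midr) == 0x001);
      return 0;
    }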