00c0d4
From 1d9f99ce1b3788d1897cb53a76d57e973111b8fe Mon Sep 17 00:00:00 2001
00c0d4
From: Naohiro Tamura <naohirot@fujitsu.com>
00c0d4
Date: Fri, 27 Aug 2021 05:03:04 +0000
00c0d4
Subject: [PATCH] AArch64: Update A64FX memset not to degrade at 16KB
00c0d4
00c0d4
This patch updates the unroll8 code so that performance does not degrade
00c0d4
at the 16KB peak on either FX1000 or FX700.
00c0d4
00c0d4
The 2 instructions inserted at the beginning of the unroll8 loop,
00c0d4
cmp and branch, are a workaround that was found heuristically.
00c0d4
00c0d4
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
00c0d4
---
00c0d4
 sysdeps/aarch64/multiarch/memset_a64fx.S | 9 ++++++++-
00c0d4
 1 file changed, 8 insertions(+), 1 deletion(-)
00c0d4
00c0d4
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
00c0d4
index 7bf759b6a7..f7dfdaace7 100644
00c0d4
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
00c0d4
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
00c0d4
@@ -96,7 +96,14 @@ L(vl_agnostic): // VL Agnostic
00c0d4
 L(unroll8):
00c0d4
 	sub	count, count, tmp1
00c0d4
 	.p2align 4
00c0d4
-1:	st1b_unroll 0, 7
00c0d4
+	// The 2 instructions at the beginning of the following loop,
00c0d4
+	// cmp and branch, are a workaround that avoids a performance
00c0d4
+	// degradation at the 16KB peak.
00c0d4
+	// It was found heuristically; the branch condition, b.ne, is
00c0d4
+	// chosen intentionally so that the branch is never taken.
00c0d4
+1:	cmp	xzr, xzr
00c0d4
+	b.ne	1b
00c0d4
+	st1b_unroll 0, 7
00c0d4
 	add	dst, dst, tmp1
00c0d4
 	subs	count, count, tmp1
00c0d4
 	b.hi	1b
00c0d4
-- 
00c0d4
2.31.1
00c0d4