a41fe4
From 1d9f99ce1b3788d1897cb53a76d57e973111b8fe Mon Sep 17 00:00:00 2001
a41fe4
From: Naohiro Tamura <naohirot@fujitsu.com>
a41fe4
Date: Fri, 27 Aug 2021 05:03:04 +0000
a41fe4
Subject: [PATCH] AArch64: Update A64FX memset not to degrade at 16KB
a41fe4
a41fe4
This patch updates the unroll8 code so that performance does not
a41fe4
degrade at the 16KB peak on either FX1000 or FX700.
a41fe4
a41fe4
The 2 instructions inserted at the beginning of the unroll8 loop,
a41fe4
cmp and branch, are a workaround that was found heuristically.
a41fe4
a41fe4
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
a41fe4
---
a41fe4
 sysdeps/aarch64/multiarch/memset_a64fx.S | 9 ++++++++-
a41fe4
 1 file changed, 8 insertions(+), 1 deletion(-)
a41fe4
a41fe4
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
a41fe4
index 7bf759b6a7..f7dfdaace7 100644
a41fe4
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
a41fe4
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
a41fe4
@@ -96,7 +96,14 @@ L(vl_agnostic): // VL Agnostic
a41fe4
 L(unroll8):
a41fe4
 	sub	count, count, tmp1
a41fe4
 	.p2align 4
a41fe4
-1:	st1b_unroll 0, 7
a41fe4
+	// The 2 instructions at the beginning of the following loop,
a41fe4
+	// cmp and branch, are a workaround that keeps performance from
a41fe4
+	// degrading at the 16KB peak.
a41fe4
+	// It was found heuristically; the branch condition, b.ne, is
a41fe4
+	// chosen intentionally so that the branch is never taken.
a41fe4
+1:	cmp	xzr, xzr
a41fe4
+	b.ne	1b
a41fe4
+	st1b_unroll 0, 7
a41fe4
 	add	dst, dst, tmp1
a41fe4
 	subs	count, count, tmp1
a41fe4
 	b.hi	1b
a41fe4
-- 
a41fe4
2.31.1
a41fe4