c5d972
From 1d9f99ce1b3788d1897cb53a76d57e973111b8fe Mon Sep 17 00:00:00 2001
c5d972
From: Naohiro Tamura <naohirot@fujitsu.com>
c5d972
Date: Fri, 27 Aug 2021 05:03:04 +0000
c5d972
Subject: [PATCH] AArch64: Update A64FX memset not to degrade at 16KB
c5d972
c5d972
This patch updates the unroll8 code so that performance does not degrade
c5d972
at 16KB, the peak performance point, for both FX1000 and FX700.
c5d972
c5d972
The 2 instructions inserted at the beginning of the unroll8 loop,
c5d972
cmp and branch, are a workaround that was found heuristically.
c5d972
c5d972
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
c5d972
---
c5d972
 sysdeps/aarch64/multiarch/memset_a64fx.S | 9 ++++++++-
c5d972
 1 file changed, 8 insertions(+), 1 deletion(-)
c5d972
c5d972
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
c5d972
index 7bf759b6a7..f7dfdaace7 100644
c5d972
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
c5d972
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
c5d972
@@ -96,7 +96,14 @@ L(vl_agnostic): // VL Agnostic
c5d972
 L(unroll8):
c5d972
 	sub	count, count, tmp1
c5d972
 	.p2align 4
c5d972
-1:	st1b_unroll 0, 7
c5d972
+	// The 2 instructions at the beginning of the following loop,
c5d972
+	// cmp and branch, are a workaround so as not to degrade at
c5d972
+	// the peak performance 16KB.
c5d972
+	// It is found heuristically and the branch condition, b.ne,
c5d972
+	// is chosen intentionally never to jump.
c5d972
+1:	cmp	xzr, xzr
c5d972
+	b.ne	1b
c5d972
+	st1b_unroll 0, 7
c5d972
 	add	dst, dst, tmp1
c5d972
 	subs	count, count, tmp1
c5d972
 	b.hi	1b
c5d972
-- 
c5d972
2.31.1
c5d972