From 186092c6ba8825598ffdbf15dbf0823c771f560d Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wdijkstr@arm.com>
Date: Tue, 10 Aug 2021 13:42:07 +0100
Subject: [PATCH] [3/5] AArch64: Improve A64FX memset for remaining bytes

Simplify the handling of remaining bytes.  Avoid the many taken branches
and complex whilelo computations; instead, unconditionally write vectors
from the end of the buffer.

Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
---
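For reference, a minimal C sketch of the idea.  All names below are
illustrative, with a fixed-size CHUNK standing in for the SVE vector
length; it is a sketch under the assumption that this path is reached
with at least one full vector to write, not the actual implementation.
The tail is covered by a full-width store anchored at the end of the
buffer, overlapping bytes the main loop already wrote; rewriting the
same value is harmless for memset and avoids building a whilelo
predicate for the exact remainder.

    #include <stddef.h>
    #include <string.h>

    #define CHUNK 64	/* stand-in for the SVE vector length */

    /* Models one full-width vector store (st1b with an all-true
       predicate).  */
    static void store_chunk (unsigned char *p, int c)
    {
      memset (p, c, CHUNK);
    }

    /* End-anchored tail handling; assumes count >= CHUNK.  */
    static void set_bytes (unsigned char *dst, int c, size_t count)
    {
      unsigned char *dstend = dst + count;

      /* Main loop: whole chunks while more than one chunk remains.  */
      while (count > CHUNK)
        {
          store_chunk (dst, c);
          dst += CHUNK;
          count -= CHUNK;
        }

      /* One unconditional full-width store of the last CHUNK bytes.
         It may overlap the loop's final store when count was not a
         multiple of CHUNK, but it rewrites the same value, so no
         predicate computation or extra branch is needed.  */
      store_chunk (dstend - CHUNK, c);
    }

The patch applies the same idea unrolled: a ladder of up to eight st1b
stores at [dstend, -8, mul vl] through [dstend, -1, mul vl], entered at
label 2 or 5 depending on how count compares against vector_length * 2
and vector_length * 5.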
 sysdeps/aarch64/multiarch/memset_a64fx.S | 46 +++++++-----------------
 1 file changed, 13 insertions(+), 33 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index 75cf43ae79..337c86be6f 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -130,38 +130,19 @@ L(unroll8):
 	b	1b
 
 L(last):
-	whilelo	p0.b, xzr, rest
-	whilelo	p1.b, vector_length, rest
-	b.last	1f
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, rest
-	incb	tmp1
-	whilelo	p3.b, tmp1, rest
-	b.last	1f
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	st1b	z0.b, p2, [dst, #2, mul vl]
-	st1b	z0.b, p3, [dst, #3, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, rest
-	incb	tmp1
-	whilelo	p5.b, tmp1, rest
-	incb	tmp1
-	whilelo	p6.b, tmp1, rest
-	incb	tmp1
-	whilelo	p7.b, tmp1, rest
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	st1b	z0.b, p2, [dst, #2, mul vl]
-	st1b	z0.b, p3, [dst, #3, mul vl]
-	st1b	z0.b, p4, [dst, #4, mul vl]
-	st1b	z0.b, p5, [dst, #5, mul vl]
-	st1b	z0.b, p6, [dst, #6, mul vl]
-	st1b	z0.b, p7, [dst, #7, mul vl]
+	cmp	count, vector_length, lsl 1
+	b.ls	2f
+	add	tmp2, vector_length, vector_length, lsl 2
+	cmp	count, tmp2
+	b.ls	5f
+	st1b	z0.b, p0, [dstend, -8, mul vl]
+	st1b	z0.b, p0, [dstend, -7, mul vl]
+	st1b	z0.b, p0, [dstend, -6, mul vl]
+5:	st1b	z0.b, p0, [dstend, -5, mul vl]
+	st1b	z0.b, p0, [dstend, -4, mul vl]
+	st1b	z0.b, p0, [dstend, -3, mul vl]
+2:	st1b	z0.b, p0, [dstend, -2, mul vl]
+	st1b	z0.b, p0, [dstend, -1, mul vl]
 	ret
 
 L(L1_prefetch): // if rest >= L1_SIZE
@@ -199,7 +180,6 @@ L(L2):
 	subs	count, count, CACHE_LINE_SIZE
 	b.hi	1b
 	add	count, count, CACHE_LINE_SIZE
-	add	dst, dst, CACHE_LINE_SIZE
 	b	L(last)
 
 END (MEMSET)
-- 
2.31.1