From 186092c6ba8825598ffdbf15dbf0823c771f560d Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wdijkstr@arm.com>
Date: Tue, 10 Aug 2021 13:42:07 +0100
Subject: [PATCH] [3/5] AArch64: Improve A64FX memset for remaining bytes

Simplify handling of remaining bytes. Avoid lots of taken branches and complex
whilelo computations; instead, unconditionally write vectors from the end.

Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
---
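Not part of the patch, just a rough C model of the new tail handling for
reference: the remaining bytes are covered by whole vectors addressed
backwards from the end of the buffer, so at most two size checks are taken
and no whilelo predicates are needed. CHUNK, set_tail and the memset calls
below are hypothetical stand-ins for the SVE vector length and the
full-width st1b stores; they are not names from the actual code.

#include <string.h>

/* CHUNK models the SVE vector length (64 bytes on A64FX); each memset call
   models one unpredicated full-vector st1b.  Assumes at most 8 * CHUNK bytes
   remain and that every store below stays inside the buffer because the main
   loop has already written the earlier bytes; rewriting some of those bytes
   is harmless for memset.  */
#define CHUNK 64

static void
set_tail (unsigned char *dstend, size_t count, int c)
{
  /* count = bytes still to be written, 0 < count <= 8 * CHUNK.  */
  if (count > 2 * CHUNK)
    {
      if (count > 5 * CHUNK)
        {
          memset (dstend - 8 * CHUNK, c, CHUNK);
          memset (dstend - 7 * CHUNK, c, CHUNK);
          memset (dstend - 6 * CHUNK, c, CHUNK);
        }
      memset (dstend - 5 * CHUNK, c, CHUNK);
      memset (dstend - 4 * CHUNK, c, CHUNK);
      memset (dstend - 3 * CHUNK, c, CHUNK);
    }
  memset (dstend - 2 * CHUNK, c, CHUNK);
  memset (dstend - CHUNK, c, CHUNK);
}

Because every store is anchored at dstend rather than dst, the same fixed set
of stores covers any remaining length within a given range, which is what
removes the whilelo computations and most of the taken branches.
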
 sysdeps/aarch64/multiarch/memset_a64fx.S | 46 +++++++-----------------
 1 file changed, 13 insertions(+), 33 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index 75cf43ae79..337c86be6f 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -130,38 +130,19 @@ L(unroll8):
 	b	1b
 
 L(last):
-	whilelo	p0.b, xzr, rest
-	whilelo	p1.b, vector_length, rest
-	b.last	1f
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, rest
-	incb	tmp1
-	whilelo	p3.b, tmp1, rest
-	b.last	1f
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	st1b	z0.b, p2, [dst, #2, mul vl]
-	st1b	z0.b, p3, [dst, #3, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, rest
-	incb	tmp1
-	whilelo	p5.b, tmp1, rest
-	incb	tmp1
-	whilelo	p6.b, tmp1, rest
-	incb	tmp1
-	whilelo	p7.b, tmp1, rest
-	st1b	z0.b, p0, [dst, #0, mul vl]
-	st1b	z0.b, p1, [dst, #1, mul vl]
-	st1b	z0.b, p2, [dst, #2, mul vl]
-	st1b	z0.b, p3, [dst, #3, mul vl]
-	st1b	z0.b, p4, [dst, #4, mul vl]
-	st1b	z0.b, p5, [dst, #5, mul vl]
-	st1b	z0.b, p6, [dst, #6, mul vl]
-	st1b	z0.b, p7, [dst, #7, mul vl]
+	cmp	count, vector_length, lsl 1
+	b.ls	2f
+	add	tmp2, vector_length, vector_length, lsl 2
+	cmp	count, tmp2
+	b.ls	5f
+	st1b	z0.b, p0, [dstend, -8, mul vl]
+	st1b	z0.b, p0, [dstend, -7, mul vl]
+	st1b	z0.b, p0, [dstend, -6, mul vl]
+5:	st1b	z0.b, p0, [dstend, -5, mul vl]
+	st1b	z0.b, p0, [dstend, -4, mul vl]
+	st1b	z0.b, p0, [dstend, -3, mul vl]
+2:	st1b	z0.b, p0, [dstend, -2, mul vl]
+	st1b	z0.b, p0, [dstend, -1, mul vl]
 	ret
 
 L(L1_prefetch): // if rest >= L1_SIZE
@@ -199,7 +180,6 @@ L(L2):
 	subs	count, count, CACHE_LINE_SIZE
 	b.hi	1b
 	add	count, count, CACHE_LINE_SIZE
-	add	dst, dst, CACHE_LINE_SIZE
 	b	L(last)
 
 END (MEMSET)
-- 
2.31.1