c5d972
From 186092c6ba8825598ffdbf15dbf0823c771f560d Mon Sep 17 00:00:00 2001
c5d972
From: Wilco Dijkstra <wdijkstr@arm.com>
c5d972
Date: Tue, 10 Aug 2021 13:42:07 +0100
c5d972
Subject: [PATCH] [3/5] AArch64: Improve A64FX memset for remaining bytes
c5d972
c5d972
Simplify handling of remaining bytes. Avoid lots of taken branches and complex
c5d972
whilelo computations; instead, unconditionally write vectors from the end.
c5d972
c5d972
Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
c5d972
---
c5d972
 sysdeps/aarch64/multiarch/memset_a64fx.S | 46 +++++++-----------------
c5d972
 1 file changed, 13 insertions(+), 33 deletions(-)
c5d972
c5d972
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
c5d972
index 75cf43ae79..337c86be6f 100644
c5d972
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
c5d972
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
c5d972
@@ -130,38 +130,19 @@ L(unroll8):
c5d972
 	b	1b
c5d972
 
c5d972
 L(last):
c5d972
-	whilelo	p0.b, xzr, rest
c5d972
-	whilelo	p1.b, vector_length, rest
c5d972
-	b.last	1f
c5d972
-	st1b	z0.b, p0, [dst, #0, mul vl]
c5d972
-	st1b	z0.b, p1, [dst, #1, mul vl]
c5d972
-	ret
c5d972
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
c5d972
-	whilelo	p2.b, tmp1, rest
c5d972
-	incb	tmp1
c5d972
-	whilelo	p3.b, tmp1, rest
c5d972
-	b.last	1f
c5d972
-	st1b	z0.b, p0, [dst, #0, mul vl]
c5d972
-	st1b	z0.b, p1, [dst, #1, mul vl]
c5d972
-	st1b	z0.b, p2, [dst, #2, mul vl]
c5d972
-	st1b	z0.b, p3, [dst, #3, mul vl]
c5d972
-	ret
c5d972
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
c5d972
-	whilelo	p4.b, tmp1, rest
c5d972
-	incb	tmp1
c5d972
-	whilelo	p5.b, tmp1, rest
c5d972
-	incb	tmp1
c5d972
-	whilelo	p6.b, tmp1, rest
c5d972
-	incb	tmp1
c5d972
-	whilelo	p7.b, tmp1, rest
c5d972
-	st1b	z0.b, p0, [dst, #0, mul vl]
c5d972
-	st1b	z0.b, p1, [dst, #1, mul vl]
c5d972
-	st1b	z0.b, p2, [dst, #2, mul vl]
c5d972
-	st1b	z0.b, p3, [dst, #3, mul vl]
c5d972
-	st1b	z0.b, p4, [dst, #4, mul vl]
c5d972
-	st1b	z0.b, p5, [dst, #5, mul vl]
c5d972
-	st1b	z0.b, p6, [dst, #6, mul vl]
c5d972
-	st1b	z0.b, p7, [dst, #7, mul vl]
c5d972
+	cmp	count, vector_length, lsl 1
c5d972
+	b.ls	2f
c5d972
+	add	tmp2, vector_length, vector_length, lsl 2
c5d972
+	cmp	count, tmp2
c5d972
+	b.ls	5f
c5d972
+	st1b	z0.b, p0, [dstend, -8, mul vl]
c5d972
+	st1b	z0.b, p0, [dstend, -7, mul vl]
c5d972
+	st1b	z0.b, p0, [dstend, -6, mul vl]
c5d972
+5:	st1b	z0.b, p0, [dstend, -5, mul vl]
c5d972
+	st1b	z0.b, p0, [dstend, -4, mul vl]
c5d972
+	st1b	z0.b, p0, [dstend, -3, mul vl]
c5d972
+2:	st1b	z0.b, p0, [dstend, -2, mul vl]
c5d972
+	st1b	z0.b, p0, [dstend, -1, mul vl]
c5d972
 	ret
c5d972
 
c5d972
 L(L1_prefetch): // if rest >= L1_SIZE
c5d972
@@ -199,7 +180,6 @@ L(L2):
c5d972
 	subs	count, count, CACHE_LINE_SIZE
c5d972
 	b.hi	1b
c5d972
 	add	count, count, CACHE_LINE_SIZE
c5d972
-	add	dst, dst, CACHE_LINE_SIZE
c5d972
 	b	L(last)
c5d972
 
c5d972
 END (MEMSET)
c5d972
-- 
c5d972
2.31.1
c5d972