From b31bd11454fade731e5158b1aea40b133ae19926 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wdijkstr@arm.com>
Date: Thu, 2 Dec 2021 18:33:26 +0000
Subject: [PATCH] AArch64: Improve A64FX memcpy

v2 is a complete rewrite of the A64FX memcpy. Performance is improved
by streamlining the code, aligning all large copies and using a single
unrolled loop for all sizes. The code size for memcpy and memmove goes
down from 1796 bytes to 868 bytes. Performance is better in all cases:
bench-memcpy-random is 2.3% faster overall, bench-memcpy-large is ~33%
faster for large sizes, bench-memcpy-walk is 25% faster for small sizes
and 20% for the largest sizes. The geomean of all tests in bench-memcpy
is 5.1% faster, and total time is reduced by 4%.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
---
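Illustrative note: the copy strategy described above (align the destination
to the vector length, run a single 8x unrolled vector loop, then handle the
last 0-8 vectors) corresponds roughly to the plain-C sketch below. VLEN, its
value of 64 bytes, and the name sketch_memcpy are assumptions for
illustration only; the real code uses SVE predicated loads and stores rather
than byte-wise memcpy calls.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define VLEN 64  /* SVE vector length in bytes on A64FX (assumed for the sketch).  */

void *
sketch_memcpy (void *dstin, const void *srcin, size_t n)
{
  unsigned char *dst = dstin;
  const unsigned char *src = srcin;

  if (n > 2 * VLEN)
    {
      /* Align the destination to the vector length: copy one partial
         vector, then advance both pointers by the same amount.  */
      size_t skew = -(uintptr_t) dst & (VLEN - 1);
      memcpy (dst, src, skew);
      dst += skew;
      src += skew;
      n -= skew;

      /* Single unrolled loop used for all large sizes; the real code keeps
         8 vectors in flight and software-pipelines loads and stores.  */
      while (n > 8 * VLEN)
        {
          memcpy (dst, src, 8 * VLEN);
          dst += 8 * VLEN;
          src += 8 * VLEN;
          n -= 8 * VLEN;
        }
    }

  /* Copy the last 0-8 vectors (the real code uses predicated and
     overlapping vector accesses instead of a plain byte copy).  */
  memcpy (dst, src, n);
  return dstin;
}

Aligning the destination first means every store in the main loop is
vector-aligned, which is what gives the higher and more consistent write
performance mentioned in the code comments.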
 sysdeps/aarch64/multiarch/memcpy_a64fx.S | 546 ++++++++++-------------
 1 file changed, 225 insertions(+), 321 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
index ae7464e09f..0b306925e6 100644
--- a/sysdeps/aarch64/multiarch/memcpy_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
@@ -28,20 +28,15 @@
  *
  */
 
-#define L2_SIZE		(8*1024*1024)/2	// L2 8MB/2
-#define CACHE_LINE_SIZE	256
-#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
-#define dest		x0
-#define src		x1
-#define n		x2	// size
-#define tmp1		x3
-#define tmp2		x4
-#define tmp3		x5
-#define rest		x6
-#define dest_ptr	x7
-#define src_ptr		x8
-#define vector_length	x9
-#define cl_remainder	x10	// CACHE_LINE_SIZE remainder
+#define dstin	x0
+#define src	x1
+#define n	x2
+#define dst	x3
+#define dstend	x4
+#define srcend	x5
+#define tmp	x6
+#define vlen	x7
+#define vlen8	x8
 
 #if HAVE_AARCH64_SVE_ASM
 # if IS_IN (libc)
@@ -50,45 +45,37 @@
 
 	.arch armv8.2-a+sve
 
-	.macro dc_zva times
-	dc	zva, tmp1
-	add	tmp1, tmp1, CACHE_LINE_SIZE
-	.if \times-1
-	dc_zva "(\times-1)"
-	.endif
-	.endm
-
 	.macro ld1b_unroll8
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p0/z, [src_ptr, #1, mul vl]
-	ld1b	z2.b, p0/z, [src_ptr, #2, mul vl]
-	ld1b	z3.b, p0/z, [src_ptr, #3, mul vl]
-	ld1b	z4.b, p0/z, [src_ptr, #4, mul vl]
-	ld1b	z5.b, p0/z, [src_ptr, #5, mul vl]
-	ld1b	z6.b, p0/z, [src_ptr, #6, mul vl]
-	ld1b	z7.b, p0/z, [src_ptr, #7, mul vl]
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [src, 2, mul vl]
+	ld1b	z3.b, p0/z, [src, 3, mul vl]
+	ld1b	z4.b, p0/z, [src, 4, mul vl]
+	ld1b	z5.b, p0/z, [src, 5, mul vl]
+	ld1b	z6.b, p0/z, [src, 6, mul vl]
+	ld1b	z7.b, p0/z, [src, 7, mul vl]
 	.endm
 
 	.macro stld1b_unroll4a
-	st1b	z0.b, p0,   [dest_ptr, #0, mul vl]
-	st1b	z1.b, p0,   [dest_ptr, #1, mul vl]
-	ld1b	z0.b, p0/z, [src_ptr,  #0, mul vl]
-	ld1b	z1.b, p0/z, [src_ptr,  #1, mul vl]
-	st1b	z2.b, p0,   [dest_ptr, #2, mul vl]
-	st1b	z3.b, p0,   [dest_ptr, #3, mul vl]
-	ld1b	z2.b, p0/z, [src_ptr,  #2, mul vl]
-	ld1b	z3.b, p0/z, [src_ptr,  #3, mul vl]
+	st1b	z0.b, p0,   [dst, 0, mul vl]
+	st1b	z1.b, p0,   [dst, 1, mul vl]
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	st1b	z2.b, p0,   [dst, 2, mul vl]
+	st1b	z3.b, p0,   [dst, 3, mul vl]
+	ld1b	z2.b, p0/z, [src, 2, mul vl]
+	ld1b	z3.b, p0/z, [src, 3, mul vl]
 	.endm
 
 	.macro stld1b_unroll4b
-	st1b	z4.b, p0,   [dest_ptr, #4, mul vl]
-	st1b	z5.b, p0,   [dest_ptr, #5, mul vl]
-	ld1b	z4.b, p0/z, [src_ptr,  #4, mul vl]
-	ld1b	z5.b, p0/z, [src_ptr,  #5, mul vl]
-	st1b	z6.b, p0,   [dest_ptr, #6, mul vl]
-	st1b	z7.b, p0,   [dest_ptr, #7, mul vl]
-	ld1b	z6.b, p0/z, [src_ptr,  #6, mul vl]
-	ld1b	z7.b, p0/z, [src_ptr,  #7, mul vl]
+	st1b	z4.b, p0,   [dst, 4, mul vl]
+	st1b	z5.b, p0,   [dst, 5, mul vl]
+	ld1b	z4.b, p0/z, [src, 4, mul vl]
+	ld1b	z5.b, p0/z, [src, 5, mul vl]
+	st1b	z6.b, p0,   [dst, 6, mul vl]
+	st1b	z7.b, p0,   [dst, 7, mul vl]
+	ld1b	z6.b, p0/z, [src, 6, mul vl]
+	ld1b	z7.b, p0/z, [src, 7, mul vl]
 	.endm
 
 	.macro stld1b_unroll8
@@ -97,87 +84,18 @@
 	.endm
 
 	.macro st1b_unroll8
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p0, [dest_ptr, #1, mul vl]
-	st1b	z2.b, p0, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p0, [dest_ptr, #3, mul vl]
-	st1b	z4.b, p0, [dest_ptr, #4, mul vl]
-	st1b	z5.b, p0, [dest_ptr, #5, mul vl]
-	st1b	z6.b, p0, [dest_ptr, #6, mul vl]
-	st1b	z7.b, p0, [dest_ptr, #7, mul vl]
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p0, [dst, 1, mul vl]
+	st1b	z2.b, p0, [dst, 2, mul vl]
+	st1b	z3.b, p0, [dst, 3, mul vl]
+	st1b	z4.b, p0, [dst, 4, mul vl]
+	st1b	z5.b, p0, [dst, 5, mul vl]
+	st1b	z6.b, p0, [dst, 6, mul vl]
+	st1b	z7.b, p0, [dst, 7, mul vl]
 	.endm
 
-	.macro shortcut_for_small_size exit
-	// if rest <= vector_length * 2
-	whilelo	p0.b, xzr, n
-	whilelo	p1.b, vector_length, n
-	b.last	1f
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	ret
-1:	// if rest > vector_length * 8
-	cmp	n, vector_length, lsl 3 // vector_length * 8
-	b.hi	\exit
-	// if rest <= vector_length * 4
-	lsl	tmp1, vector_length, 1  // vector_length * 2
-	whilelo	p2.b, tmp1, n
-	incb	tmp1
-	whilelo	p3.b, tmp1, n
-	b.last	1f
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	ld1b	z2.b, p2/z, [src, #2, mul vl]
-	ld1b	z3.b, p3/z, [src, #3, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	st1b	z2.b, p2, [dest, #2, mul vl]
-	st1b	z3.b, p3, [dest, #3, mul vl]
-	ret
-1:	// if rest <= vector_length * 8
-	lsl	tmp1, vector_length, 2  // vector_length * 4
-	whilelo	p4.b, tmp1, n
-	incb	tmp1
-	whilelo	p5.b, tmp1, n
-	b.last	1f
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	ld1b	z2.b, p2/z, [src, #2, mul vl]
-	ld1b	z3.b, p3/z, [src, #3, mul vl]
-	ld1b	z4.b, p4/z, [src, #4, mul vl]
-	ld1b	z5.b, p5/z, [src, #5, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	st1b	z2.b, p2, [dest, #2, mul vl]
-	st1b	z3.b, p3, [dest, #3, mul vl]
-	st1b	z4.b, p4, [dest, #4, mul vl]
-	st1b	z5.b, p5, [dest, #5, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	incb	tmp1			// vector_length * 5
-	incb	tmp1			// vector_length * 6
-	whilelo	p6.b, tmp1, n
-	incb	tmp1
-	whilelo	p7.b, tmp1, n
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	ld1b	z2.b, p2/z, [src, #2, mul vl]
-	ld1b	z3.b, p3/z, [src, #3, mul vl]
-	ld1b	z4.b, p4/z, [src, #4, mul vl]
-	ld1b	z5.b, p5/z, [src, #5, mul vl]
-	ld1b	z6.b, p6/z, [src, #6, mul vl]
-	ld1b	z7.b, p7/z, [src, #7, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	st1b	z2.b, p2, [dest, #2, mul vl]
-	st1b	z3.b, p3, [dest, #3, mul vl]
-	st1b	z4.b, p4, [dest, #4, mul vl]
-	st1b	z5.b, p5, [dest, #5, mul vl]
-	st1b	z6.b, p6, [dest, #6, mul vl]
-	st1b	z7.b, p7, [dest, #7, mul vl]
-	ret
-	.endm
+#undef BTI_C
+#define BTI_C
 
 ENTRY (MEMCPY)
 
@@ -185,223 +103,209 @@ ENTRY (MEMCPY)
 	PTR_ARG (1)
 	SIZE_ARG (2)
 
-L(memcpy):
-	cntb	vector_length
-	// shortcut for less than vector_length * 8
-	// gives a free ptrue to p0.b for n >= vector_length
-	shortcut_for_small_size L(vl_agnostic)
-	// end of shortcut
-
-L(vl_agnostic): // VL Agnostic
-	mov	rest, n
-	mov	dest_ptr, dest
-	mov	src_ptr, src
-	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
-	mov	tmp1, 64
-	cmp	rest, L2_SIZE
-	ccmp	vector_length, tmp1, 0, cs
-	b.eq	L(L2)
-
-L(unroll8): // unrolling and software pipeline
-	lsl	tmp1, vector_length, 3	// vector_length * 8
-	.p2align 3
-	cmp	 rest, tmp1
-	b.cc	L(last)
+	cntb	vlen
+	cmp	n, vlen, lsl 1
+	b.hi	L(copy_small)
+	whilelo	p1.b, vlen, n
+	whilelo	p0.b, xzr, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p1, [dstin, 1, mul vl]
+	ret
+
+	.p2align 4
+
+L(copy_small):
+	cmp	n, vlen, lsl 3
+	b.hi	L(copy_large)
+	add	dstend, dstin, n
+	add	srcend, src, n
+	cmp	n, vlen, lsl 2
+	b.hi	1f
+
+	/* Copy 2-4 vectors.  */
+	ptrue	p0.b
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
+	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p0, [dstin, 1, mul vl]
+	st1b	z2.b, p0, [dstend, -2, mul vl]
+	st1b	z3.b, p0, [dstend, -1, mul vl]
+	ret
+
+	.p2align 4
+	/* Copy 4-8 vectors.  */
+1:	ptrue	p0.b
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [src, 2, mul vl]
+	ld1b	z3.b, p0/z, [src, 3, mul vl]
+	ld1b	z4.b, p0/z, [srcend, -4, mul vl]
+	ld1b	z5.b, p0/z, [srcend, -3, mul vl]
+	ld1b	z6.b, p0/z, [srcend, -2, mul vl]
+	ld1b	z7.b, p0/z, [srcend, -1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p0, [dstin, 1, mul vl]
+	st1b	z2.b, p0, [dstin, 2, mul vl]
+	st1b	z3.b, p0, [dstin, 3, mul vl]
+	st1b	z4.b, p0, [dstend, -4, mul vl]
+	st1b	z5.b, p0, [dstend, -3, mul vl]
+	st1b	z6.b, p0, [dstend, -2, mul vl]
+	st1b	z7.b, p0, [dstend, -1, mul vl]
+	ret
+
+	.p2align 4
+	/* At least 8 vectors - always align to vector length for
+	   higher and consistent write performance.  */
+L(copy_large):
+	sub	tmp, vlen, 1
+	and	tmp, dstin, tmp
+	sub	tmp, vlen, tmp
+	whilelo	p1.b, xzr, tmp
+	ld1b	z1.b, p1/z, [src]
+	st1b	z1.b, p1, [dstin]
+	add	dst, dstin, tmp
+	add	src, src, tmp
+	sub	n, n, tmp
+	ptrue	p0.b
+
+	lsl	vlen8, vlen, 3
+	subs	n, n, vlen8
+	b.ls	3f
 	ld1b_unroll8
-	add	src_ptr, src_ptr, tmp1
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.cc	2f
-	.p2align 3
+	add	src, src, vlen8
+	subs	n, n, vlen8
+	b.ls	2f
+
+	.p2align 4
+	/* 8x unrolled and software pipelined loop.  */
 1:	stld1b_unroll8
-	add	dest_ptr, dest_ptr, tmp1
-	add	src_ptr, src_ptr, tmp1
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.ge	1b
+	add	dst, dst, vlen8
+	add	src, src, vlen8
+	subs	n, n, vlen8
+	b.hi	1b
 2:	st1b_unroll8
-	add	dest_ptr, dest_ptr, tmp1
-
-	.p2align 3
-L(last):
-	whilelo	p0.b, xzr, rest
-	whilelo	p1.b, vector_length, rest
-	b.last	1f
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, rest
-	incb	tmp1
-	whilelo	p3.b, tmp1, rest
-	b.last	1f
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
-	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+	add	dst, dst, vlen8
+3:	add	n, n, vlen8
+
+	/* Move last 0-8 vectors.  */
+L(last_bytes):
+	cmp	n, vlen, lsl 1
+	b.hi	1f
+	whilelo	p0.b, xzr, n
+	whilelo	p1.b, vlen, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p1, [dst, 1, mul vl]
 	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, rest
-	incb	tmp1
-	whilelo	p5.b, tmp1, rest
-	incb	tmp1
-	whilelo	p6.b, tmp1, rest
-	incb	tmp1
-	whilelo	p7.b, tmp1, rest
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
-	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
-	ld1b	z4.b, p4/z, [src_ptr, #4, mul vl]
-	ld1b	z5.b, p5/z, [src_ptr, #5, mul vl]
-	ld1b	z6.b, p6/z, [src_ptr, #6, mul vl]
-	ld1b	z7.b, p7/z, [src_ptr, #7, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
-	st1b	z4.b, p4, [dest_ptr, #4, mul vl]
-	st1b	z5.b, p5, [dest_ptr, #5, mul vl]
-	st1b	z6.b, p6, [dest_ptr, #6, mul vl]
-	st1b	z7.b, p7, [dest_ptr, #7, mul vl]
+
+	.p2align 4
+
+1:	add	srcend, src, n
+	add	dstend, dst, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
+	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
+	cmp	n, vlen, lsl 2
+	b.hi	1f
+
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p0, [dst, 1, mul vl]
+	st1b	z2.b, p0, [dstend, -2, mul vl]
+	st1b	z3.b, p0, [dstend, -1, mul vl]
 	ret
 
-L(L2):
-	// align dest address at CACHE_LINE_SIZE byte boundary
-	mov	tmp1, CACHE_LINE_SIZE
-	ands	tmp2, dest_ptr, CACHE_LINE_SIZE - 1
-	// if cl_remainder == 0
-	b.eq	L(L2_dc_zva)
-	sub	cl_remainder, tmp1, tmp2
-	// process remainder until the first CACHE_LINE_SIZE boundary
-	whilelo	p1.b, xzr, cl_remainder	// keep p0.b all true
-	whilelo	p2.b, vector_length, cl_remainder
-	b.last	1f
-	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
-	b	2f
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p3.b, tmp1, cl_remainder
-	incb	tmp1
-	whilelo	p4.b, tmp1, cl_remainder
-	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
-	ld1b	z3.b, p3/z, [src_ptr, #2, mul vl]
-	ld1b	z4.b, p4/z, [src_ptr, #3, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #2, mul vl]
-	st1b	z4.b, p4, [dest_ptr, #3, mul vl]
-2:	add	dest_ptr, dest_ptr, cl_remainder
-	add	src_ptr, src_ptr, cl_remainder
-	sub	rest, rest, cl_remainder
-
-L(L2_dc_zva):
-	// zero fill
-	and	tmp1, dest, 0xffffffffffffff
-	and	tmp2, src, 0xffffffffffffff
-	subs	tmp1, tmp1, tmp2	// diff
-	b.ge	1f
-	neg	tmp1, tmp1
-1:	mov	tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
-	cmp	tmp1, tmp3
-	b.lo	L(unroll8)
-	mov	tmp1, dest_ptr
-	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
-	// unroll
-	ld1b_unroll8	// this line has to be after "b.lo L(unroll8)"
-	add	 src_ptr, src_ptr, CACHE_LINE_SIZE * 2
-	sub	 rest, rest, CACHE_LINE_SIZE * 2
-	mov	 tmp1, ZF_DIST
-	.p2align 3
-1:	stld1b_unroll4a
-	add	tmp2, dest_ptr, tmp1	// dest_ptr + ZF_DIST
-	dc	zva, tmp2
-	stld1b_unroll4b
-	add	tmp2, tmp2, CACHE_LINE_SIZE
-	dc	zva, tmp2
-	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
-	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
-	sub	rest, rest, CACHE_LINE_SIZE * 2
-	cmp	rest, tmp3	// ZF_DIST + CACHE_LINE_SIZE * 2
-	b.ge	1b
-	st1b_unroll8
-	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
-	b	L(unroll8)
+1:	ld1b	z4.b, p0/z, [src, 2, mul vl]
+	ld1b	z5.b, p0/z, [src, 3, mul vl]
+	ld1b	z6.b, p0/z, [srcend, -4, mul vl]
+	ld1b	z7.b, p0/z, [srcend, -3, mul vl]
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p0, [dst, 1, mul vl]
+	st1b	z4.b, p0, [dst, 2, mul vl]
+	st1b	z5.b, p0, [dst, 3, mul vl]
+	st1b	z6.b, p0, [dstend, -4, mul vl]
+	st1b	z7.b, p0, [dstend, -3, mul vl]
+	st1b	z2.b, p0, [dstend, -2, mul vl]
+	st1b	z3.b, p0, [dstend, -1, mul vl]
+	ret
 
 END (MEMCPY)
 libc_hidden_builtin_def (MEMCPY)
 
 
-ENTRY (MEMMOVE)
+ENTRY_ALIGN (MEMMOVE, 4)
 
 	PTR_ARG (0)
 	PTR_ARG (1)
 	SIZE_ARG (2)
 
-	// remove tag address
-	// dest has to be immutable because it is the return value
-	// src has to be immutable because it is used in L(bwd_last)
-	and	tmp2, dest, 0xffffffffffffff	// save dest_notag into tmp2
-	and	tmp3, src, 0xffffffffffffff	// save src_notag intp tmp3
-	cmp	n, 0
-	ccmp	tmp2, tmp3, 4, ne
-	b.ne	1f
+	/* Fast case for up to 2 vectors.  */
+	cntb	vlen
+	cmp	n, vlen, lsl 1
+	b.hi	1f
+	whilelo	p0.b, xzr, n
+	whilelo	p1.b, vlen, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p1, [dstin, 1, mul vl]
+L(full_overlap):
 	ret
-1:	cntb	vector_length
-	// shortcut for less than vector_length * 8
-	// gives a free ptrue to p0.b for n >= vector_length
-	// tmp2 and tmp3 should not be used in this macro to keep
-	// notag addresses
-	shortcut_for_small_size L(dispatch)
-	// end of shortcut
-
-L(dispatch):
-	// tmp2 = dest_notag, tmp3 = src_notag
-	// diff = dest_notag - src_notag
-	sub	tmp1, tmp2, tmp3
-	// if diff <= 0 || diff >= n then memcpy
-	cmp	tmp1, 0
-	ccmp	tmp1, n, 2, gt
-	b.cs	L(vl_agnostic)
-
-L(bwd_start):
-	mov	rest, n
-	add	dest_ptr, dest, n	// dest_end
-	add	src_ptr, src, n		// src_end
-
-L(bwd_unroll8): // unrolling and software pipeline
-	lsl	tmp1, vector_length, 3	// vector_length * 8
-	.p2align 3
-	cmp	rest, tmp1
-	b.cc	L(bwd_last)
-	sub	src_ptr, src_ptr, tmp1
+
+	.p2align 4
+	/* Check for overlapping moves. Return if there is a full overlap.
+	   Small moves up to 8 vectors use the overlap-safe copy_small code.
+	   Non-overlapping or overlapping moves with dst < src use memcpy.
+	   Overlapping moves with dst > src use a backward copy loop.  */
+1:	sub	tmp, dstin, src
+	ands	tmp, tmp, 0xffffffffffffff	/* Clear special tag bits.  */
+	b.eq	L(full_overlap)
+	cmp	n, vlen, lsl 3
+	b.ls	L(copy_small)
+	cmp	tmp, n
+	b.hs	L(copy_large)
+
+	/* Align to vector length.  */
+	add	dst, dstin, n
+	sub	tmp, vlen, 1
+	ands	tmp, dst, tmp
+	csel	tmp, tmp, vlen, ne
+	whilelo	p1.b, xzr, tmp
+	sub	n, n, tmp
+	ld1b	z1.b, p1/z, [src, n]
+	st1b	z1.b, p1, [dstin, n]
+	add	src, src, n
+	add	dst, dstin, n
+
+	ptrue	p0.b
+	lsl	vlen8, vlen, 3
+	subs	n, n, vlen8
+	b.ls	3f
+	sub	src, src, vlen8
 	ld1b_unroll8
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.cc	2f
-	.p2align 3
-1:	sub	src_ptr, src_ptr, tmp1
-	sub	dest_ptr, dest_ptr, tmp1
+	subs	n, n, vlen8
+	b.ls	2f
+
+	.p2align 4
+	/* 8x unrolled and software pipelined backward copy loop.  */
+1:	sub	src, src, vlen8
+	sub	dst, dst, vlen8
 	stld1b_unroll8
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.ge	1b
-2:	sub	dest_ptr, dest_ptr, tmp1
+	subs	n, n, vlen8
+	b.hi	1b
+2:	sub	dst, dst, vlen8
 	st1b_unroll8
+3:	add	n, n, vlen8
 
-L(bwd_last):
-	mov	dest_ptr, dest
-	mov	src_ptr, src
-	b	L(last)
+	/* Adjust src/dst for last 0-8 vectors.  */
+	sub	src, src, n
+	mov	dst, dstin
+	b	L(last_bytes)
 
 END (MEMMOVE)
 libc_hidden_builtin_def (MEMMOVE)
-- 
2.31.1