| From b31bd11454fade731e5158b1aea40b133ae19926 Mon Sep 17 00:00:00 2001 |
| From: Wilco Dijkstra <wdijkstr@arm.com> |
| Date: Thu, 2 Dec 2021 18:33:26 +0000 |
| Subject: [PATCH] AArch64: Improve A64FX memcpy |
| |
| v2 is a complete rewrite of the A64FX memcpy. Performance is improved |
| by streamlining the code, aligning all large copies and using a single |
| unrolled loop for all sizes. The code size for memcpy and memmove goes |
| down from 1796 bytes to 868 bytes. Performance is better in all cases: |
| bench-memcpy-random is 2.3% faster overall, bench-memcpy-large is ~33% |
| faster for large sizes, bench-memcpy-walk is 25% faster for small sizes |
| and 20% faster for the largest sizes. The geomean of all tests in bench-memcpy |
| is 5.1% faster, and total time is reduced by 4%. |
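| |
| The resulting structure is roughly the following C-level sketch (illustrative |
| only: VL stands for the runtime SVE vector length read with cntb, copy_vec |
| for a single predicated ld1b/st1b pair, and sketch_memcpy is a made-up name; |
| the real implementation is the SVE assembly below): |
| |
|   #include <stddef.h> |
|   #include <stdint.h> |
| |
|   enum { VL = 64 };  /* A64FX implements 512-bit SVE */ |
| |
|   /* Stands for one predicated ld1b/st1b pair; len is 0..VL bytes.  */ |
|   static void |
|   copy_vec (char *dst, const char *src, size_t len) |
|   { |
|     for (size_t i = 0; i < len; i++) |
|       dst[i] = src[i]; |
|   } |
| |
|   static void * |
|   sketch_memcpy (void *dstin, const void *srcin, size_t n) |
|   { |
|     char *dst = dstin; |
|     const char *src = srcin; |
| |
|     if (n <= 8 * VL) |
|       { |
|         /* Small/medium copies; the real code splits this into the <=2, |
|            <=4 and <=8 vector cases and copies the tail as full vectors |
|            ending at dst + n, which may overlap the leading vectors.  */ |
|         while (n > VL) |
|           { |
|             copy_vec (dst, src, VL); |
|             dst += VL; src += VL; n -= VL; |
|           } |
|         copy_vec (dst, src, n); |
|         return dstin; |
|       } |
| |
|     /* Large copies: align dst to the vector length (a full vector is |
|        written even if dst is already aligned)...  */ |
|     size_t head = VL - ((uintptr_t) dst & (VL - 1)); |
|     copy_vec (dst, src, head); |
|     dst += head; src += head; n -= head; |
| |
|     /* ...then run a single 8x unrolled, software pipelined loop...  */ |
|     while (n > 8 * VL) |
|       { |
|         for (int i = 0; i < 8; i++) |
|           copy_vec (dst + i * VL, src + i * VL, VL); |
|         dst += 8 * VL; src += 8 * VL; n -= 8 * VL; |
|       } |
| |
|     /* ...and handle the remaining 0-8 vectors like a small copy.  */ |
|     while (n > VL) |
|       { |
|         copy_vec (dst, src, VL); |
|         dst += VL; src += VL; n -= VL; |
|       } |
|     copy_vec (dst, src, n); |
|     return dstin; |
|   } |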
| |
| Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> |
| --- |
| sysdeps/aarch64/multiarch/memcpy_a64fx.S | 546 ++++++++++------------- |
| 1 file changed, 225 insertions(+), 321 deletions(-) |
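| |
| The memmove entry point dispatches between the forward and backward paths |
| roughly as in this C-level sketch (illustrative only: VL and the copy_* |
| helpers stand for the predicated and 8x unrolled SVE sequences in the patch, |
| sketch_memmove is a made-up name, and the masking of the address tag bits |
| from the pointer difference is omitted): |
| |
|   #include <stddef.h> |
|   #include <stdint.h> |
| |
|   enum { VL = 64 };  /* stand-in for the SVE vector length */ |
| |
|   /* Stands for the <=8 vector copies: all loads are issued before all |
|      stores, so overlapping buffers are handled correctly.  */ |
|   static void |
|   copy_small (char *dst, const char *src, size_t n)  /* n <= 8 * VL */ |
|   { |
|     char tmp[8 * VL]; |
|     for (size_t i = 0; i < n; i++) |
|       tmp[i] = src[i]; |
|     for (size_t i = 0; i < n; i++) |
|       dst[i] = tmp[i]; |
|   } |
| |
|   /* Stands for the aligned 8x unrolled forward (memcpy) loop.  */ |
|   static void |
|   copy_forward (char *dst, const char *src, size_t n) |
|   { |
|     for (size_t i = 0; i < n; i++) |
|       dst[i] = src[i]; |
|   } |
| |
|   /* Stands for the aligned 8x unrolled backward loop.  */ |
|   static void |
|   copy_backward (char *dst, const char *src, size_t n) |
|   { |
|     while (n-- > 0) |
|       dst[n] = src[n]; |
|   } |
| |
|   static void * |
|   sketch_memmove (void *dstin, const void *srcin, size_t n) |
|   { |
|     char *dst = dstin; |
|     const char *src = srcin; |
|     uintptr_t diff = (uintptr_t) dst - (uintptr_t) src; |
| |
|     if (diff == 0)       /* full overlap: nothing to do */ |
|       return dstin; |
|     if (n <= 8 * VL)     /* small moves: overlap-safe copy_small */ |
|       copy_small (dst, src, n); |
|     else if (diff >= n)  /* dst < src, or no overlap: forward copy */ |
|       copy_forward (dst, src, n); |
|     else                 /* overlap with dst > src: backward copy */ |
|       copy_backward (dst, src, n); |
|     return dstin; |
|   } |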
| |
| diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S |
| index ae7464e09f..0b306925e6 100644 |
| --- a/sysdeps/aarch64/multiarch/memcpy_a64fx.S |
| +++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S |
| @@ -28,20 +28,15 @@ |
| * |
| */ |
| |
| -#define L2_SIZE (8*1024*1024)/2 // L2 8MB/2 |
| -#define CACHE_LINE_SIZE 256 |
| -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance |
| -#define dest x0 |
| -#define src x1 |
| -#define n x2 // size |
| -#define tmp1 x3 |
| -#define tmp2 x4 |
| -#define tmp3 x5 |
| -#define rest x6 |
| -#define dest_ptr x7 |
| -#define src_ptr x8 |
| -#define vector_length x9 |
| -#define cl_remainder x10 // CACHE_LINE_SIZE remainder |
| +#define dstin x0 |
| +#define src x1 |
| +#define n x2 |
| +#define dst x3 |
| +#define dstend x4 |
| +#define srcend x5 |
| +#define tmp x6 |
| +#define vlen x7 |
| +#define vlen8 x8 |
| |
| #if HAVE_AARCH64_SVE_ASM |
| # if IS_IN (libc) |
| @@ -50,45 +45,37 @@ |
| |
| .arch armv8.2-a+sve |
| |
| - .macro dc_zva times |
| - dc zva, tmp1 |
| - add tmp1, tmp1, CACHE_LINE_SIZE |
| - .if \times-1 |
| - dc_zva "(\times-1)" |
| - .endif |
| - .endm |
| - |
| .macro ld1b_unroll8 |
| - ld1b z0.b, p0/z, [src_ptr, #0, mul vl] |
| - ld1b z1.b, p0/z, [src_ptr, #1, mul vl] |
| - ld1b z2.b, p0/z, [src_ptr, #2, mul vl] |
| - ld1b z3.b, p0/z, [src_ptr, #3, mul vl] |
| - ld1b z4.b, p0/z, [src_ptr, #4, mul vl] |
| - ld1b z5.b, p0/z, [src_ptr, #5, mul vl] |
| - ld1b z6.b, p0/z, [src_ptr, #6, mul vl] |
| - ld1b z7.b, p0/z, [src_ptr, #7, mul vl] |
| + ld1b z0.b, p0/z, [src, 0, mul vl] |
| + ld1b z1.b, p0/z, [src, 1, mul vl] |
| + ld1b z2.b, p0/z, [src, 2, mul vl] |
| + ld1b z3.b, p0/z, [src, 3, mul vl] |
| + ld1b z4.b, p0/z, [src, 4, mul vl] |
| + ld1b z5.b, p0/z, [src, 5, mul vl] |
| + ld1b z6.b, p0/z, [src, 6, mul vl] |
| + ld1b z7.b, p0/z, [src, 7, mul vl] |
| .endm |
| |
| .macro stld1b_unroll4a |
| - st1b z0.b, p0, [dest_ptr, #0, mul vl] |
| - st1b z1.b, p0, [dest_ptr, #1, mul vl] |
| - ld1b z0.b, p0/z, [src_ptr, #0, mul vl] |
| - ld1b z1.b, p0/z, [src_ptr, #1, mul vl] |
| - st1b z2.b, p0, [dest_ptr, #2, mul vl] |
| - st1b z3.b, p0, [dest_ptr, #3, mul vl] |
| - ld1b z2.b, p0/z, [src_ptr, #2, mul vl] |
| - ld1b z3.b, p0/z, [src_ptr, #3, mul vl] |
| + st1b z0.b, p0, [dst, 0, mul vl] |
| + st1b z1.b, p0, [dst, 1, mul vl] |
| + ld1b z0.b, p0/z, [src, 0, mul vl] |
| + ld1b z1.b, p0/z, [src, 1, mul vl] |
| + st1b z2.b, p0, [dst, 2, mul vl] |
| + st1b z3.b, p0, [dst, 3, mul vl] |
| + ld1b z2.b, p0/z, [src, 2, mul vl] |
| + ld1b z3.b, p0/z, [src, 3, mul vl] |
| .endm |
| |
| .macro stld1b_unroll4b |
| - st1b z4.b, p0, [dest_ptr, #4, mul vl] |
| - st1b z5.b, p0, [dest_ptr, #5, mul vl] |
| - ld1b z4.b, p0/z, [src_ptr, #4, mul vl] |
| - ld1b z5.b, p0/z, [src_ptr, #5, mul vl] |
| - st1b z6.b, p0, [dest_ptr, #6, mul vl] |
| - st1b z7.b, p0, [dest_ptr, #7, mul vl] |
| - ld1b z6.b, p0/z, [src_ptr, #6, mul vl] |
| - ld1b z7.b, p0/z, [src_ptr, #7, mul vl] |
| + st1b z4.b, p0, [dst, 4, mul vl] |
| + st1b z5.b, p0, [dst, 5, mul vl] |
| + ld1b z4.b, p0/z, [src, 4, mul vl] |
| + ld1b z5.b, p0/z, [src, 5, mul vl] |
| + st1b z6.b, p0, [dst, 6, mul vl] |
| + st1b z7.b, p0, [dst, 7, mul vl] |
| + ld1b z6.b, p0/z, [src, 6, mul vl] |
| + ld1b z7.b, p0/z, [src, 7, mul vl] |
| .endm |
| |
| .macro stld1b_unroll8 |
| @@ -97,87 +84,18 @@ |
| .endm |
| |
| .macro st1b_unroll8 |
| - st1b z0.b, p0, [dest_ptr, #0, mul vl] |
| - st1b z1.b, p0, [dest_ptr, #1, mul vl] |
| - st1b z2.b, p0, [dest_ptr, #2, mul vl] |
| - st1b z3.b, p0, [dest_ptr, #3, mul vl] |
| - st1b z4.b, p0, [dest_ptr, #4, mul vl] |
| - st1b z5.b, p0, [dest_ptr, #5, mul vl] |
| - st1b z6.b, p0, [dest_ptr, #6, mul vl] |
| - st1b z7.b, p0, [dest_ptr, #7, mul vl] |
| + st1b z0.b, p0, [dst, 0, mul vl] |
| + st1b z1.b, p0, [dst, 1, mul vl] |
| + st1b z2.b, p0, [dst, 2, mul vl] |
| + st1b z3.b, p0, [dst, 3, mul vl] |
| + st1b z4.b, p0, [dst, 4, mul vl] |
| + st1b z5.b, p0, [dst, 5, mul vl] |
| + st1b z6.b, p0, [dst, 6, mul vl] |
| + st1b z7.b, p0, [dst, 7, mul vl] |
| .endm |
| |
| - .macro shortcut_for_small_size exit |
| - // if rest <= vector_length * 2 |
| - whilelo p0.b, xzr, n |
| - whilelo p1.b, vector_length, n |
| - b.last 1f |
| - ld1b z0.b, p0/z, [src, #0, mul vl] |
| - ld1b z1.b, p1/z, [src, #1, mul vl] |
| - st1b z0.b, p0, [dest, #0, mul vl] |
| - st1b z1.b, p1, [dest, #1, mul vl] |
| - ret |
| -1: // if rest > vector_length * 8 |
| - cmp n, vector_length, lsl 3 // vector_length * 8 |
| - b.hi \exit |
| - // if rest <= vector_length * 4 |
| - lsl tmp1, vector_length, 1 // vector_length * 2 |
| - whilelo p2.b, tmp1, n |
| - incb tmp1 |
| - whilelo p3.b, tmp1, n |
| - b.last 1f |
| - ld1b z0.b, p0/z, [src, #0, mul vl] |
| - ld1b z1.b, p1/z, [src, #1, mul vl] |
| - ld1b z2.b, p2/z, [src, #2, mul vl] |
| - ld1b z3.b, p3/z, [src, #3, mul vl] |
| - st1b z0.b, p0, [dest, #0, mul vl] |
| - st1b z1.b, p1, [dest, #1, mul vl] |
| - st1b z2.b, p2, [dest, #2, mul vl] |
| - st1b z3.b, p3, [dest, #3, mul vl] |
| - ret |
| -1: // if rest <= vector_length * 8 |
| - lsl tmp1, vector_length, 2 // vector_length * 4 |
| - whilelo p4.b, tmp1, n |
| - incb tmp1 |
| - whilelo p5.b, tmp1, n |
| - b.last 1f |
| - ld1b z0.b, p0/z, [src, #0, mul vl] |
| - ld1b z1.b, p1/z, [src, #1, mul vl] |
| - ld1b z2.b, p2/z, [src, #2, mul vl] |
| - ld1b z3.b, p3/z, [src, #3, mul vl] |
| - ld1b z4.b, p4/z, [src, #4, mul vl] |
| - ld1b z5.b, p5/z, [src, #5, mul vl] |
| - st1b z0.b, p0, [dest, #0, mul vl] |
| - st1b z1.b, p1, [dest, #1, mul vl] |
| - st1b z2.b, p2, [dest, #2, mul vl] |
| - st1b z3.b, p3, [dest, #3, mul vl] |
| - st1b z4.b, p4, [dest, #4, mul vl] |
| - st1b z5.b, p5, [dest, #5, mul vl] |
| - ret |
| -1: lsl tmp1, vector_length, 2 // vector_length * 4 |
| - incb tmp1 // vector_length * 5 |
| - incb tmp1 // vector_length * 6 |
| - whilelo p6.b, tmp1, n |
| - incb tmp1 |
| - whilelo p7.b, tmp1, n |
| - ld1b z0.b, p0/z, [src, #0, mul vl] |
| - ld1b z1.b, p1/z, [src, #1, mul vl] |
| - ld1b z2.b, p2/z, [src, #2, mul vl] |
| - ld1b z3.b, p3/z, [src, #3, mul vl] |
| - ld1b z4.b, p4/z, [src, #4, mul vl] |
| - ld1b z5.b, p5/z, [src, #5, mul vl] |
| - ld1b z6.b, p6/z, [src, #6, mul vl] |
| - ld1b z7.b, p7/z, [src, #7, mul vl] |
| - st1b z0.b, p0, [dest, #0, mul vl] |
| - st1b z1.b, p1, [dest, #1, mul vl] |
| - st1b z2.b, p2, [dest, #2, mul vl] |
| - st1b z3.b, p3, [dest, #3, mul vl] |
| - st1b z4.b, p4, [dest, #4, mul vl] |
| - st1b z5.b, p5, [dest, #5, mul vl] |
| - st1b z6.b, p6, [dest, #6, mul vl] |
| - st1b z7.b, p7, [dest, #7, mul vl] |
| - ret |
| - .endm |
| +#undef BTI_C |
| +#define BTI_C |
| |
| ENTRY (MEMCPY) |
| |
| @@ -185,223 +103,209 @@ ENTRY (MEMCPY) |
| PTR_ARG (1) |
| SIZE_ARG (2) |
| |
| -L(memcpy): |
| - cntb vector_length |
| - // shortcut for less than vector_length * 8 |
| - // gives a free ptrue to p0.b for n >= vector_length |
| - shortcut_for_small_size L(vl_agnostic) |
| - // end of shortcut |
| - |
| -L(vl_agnostic): // VL Agnostic |
| - mov rest, n |
| - mov dest_ptr, dest |
| - mov src_ptr, src |
| - // if rest >= L2_SIZE && vector_length == 64 then L(L2) |
| - mov tmp1, 64 |
| - cmp rest, L2_SIZE |
| - ccmp vector_length, tmp1, 0, cs |
| - b.eq L(L2) |
| - |
| -L(unroll8): // unrolling and software pipeline |
| - lsl tmp1, vector_length, 3 // vector_length * 8 |
| - .p2align 3 |
| - cmp rest, tmp1 |
| - b.cc L(last) |
| + cntb vlen |
| + cmp n, vlen, lsl 1 |
| + b.hi L(copy_small) |
| + whilelo p1.b, vlen, n |
| + whilelo p0.b, xzr, n |
| + ld1b z0.b, p0/z, [src, 0, mul vl] |
| + ld1b z1.b, p1/z, [src, 1, mul vl] |
| + st1b z0.b, p0, [dstin, 0, mul vl] |
| + st1b z1.b, p1, [dstin, 1, mul vl] |
| + ret |
| + |
| + .p2align 4 |
| + |
| +L(copy_small): |
| + cmp n, vlen, lsl 3 |
| + b.hi L(copy_large) |
| + add dstend, dstin, n |
| + add srcend, src, n |
| + cmp n, vlen, lsl 2 |
| + b.hi 1f |
| + |
| + /* Copy 2-4 vectors. */ |
| + ptrue p0.b |
| + ld1b z0.b, p0/z, [src, 0, mul vl] |
| + ld1b z1.b, p0/z, [src, 1, mul vl] |
| + ld1b z2.b, p0/z, [srcend, -2, mul vl] |
| + ld1b z3.b, p0/z, [srcend, -1, mul vl] |
| + st1b z0.b, p0, [dstin, 0, mul vl] |
| + st1b z1.b, p0, [dstin, 1, mul vl] |
| + st1b z2.b, p0, [dstend, -2, mul vl] |
| + st1b z3.b, p0, [dstend, -1, mul vl] |
| + ret |
| + |
| + .p2align 4 |
| + /* Copy 4-8 vectors. */ |
| +1: ptrue p0.b |
| + ld1b z0.b, p0/z, [src, 0, mul vl] |
| + ld1b z1.b, p0/z, [src, 1, mul vl] |
| + ld1b z2.b, p0/z, [src, 2, mul vl] |
| + ld1b z3.b, p0/z, [src, 3, mul vl] |
| + ld1b z4.b, p0/z, [srcend, -4, mul vl] |
| + ld1b z5.b, p0/z, [srcend, -3, mul vl] |
| + ld1b z6.b, p0/z, [srcend, -2, mul vl] |
| + ld1b z7.b, p0/z, [srcend, -1, mul vl] |
| + st1b z0.b, p0, [dstin, 0, mul vl] |
| + st1b z1.b, p0, [dstin, 1, mul vl] |
| + st1b z2.b, p0, [dstin, 2, mul vl] |
| + st1b z3.b, p0, [dstin, 3, mul vl] |
| + st1b z4.b, p0, [dstend, -4, mul vl] |
| + st1b z5.b, p0, [dstend, -3, mul vl] |
| + st1b z6.b, p0, [dstend, -2, mul vl] |
| + st1b z7.b, p0, [dstend, -1, mul vl] |
| + ret |
| + |
| + .p2align 4 |
| + /* At least 8 vectors - always align to vector length for |
| + higher and consistent write performance. */ |
| +L(copy_large): |
| + sub tmp, vlen, 1 |
| + and tmp, dstin, tmp |
| + sub tmp, vlen, tmp |
| + whilelo p1.b, xzr, tmp |
| + ld1b z1.b, p1/z, [src] |
| + st1b z1.b, p1, [dstin] |
| + add dst, dstin, tmp |
| + add src, src, tmp |
| + sub n, n, tmp |
| + ptrue p0.b |
| + |
| + lsl vlen8, vlen, 3 |
| + subs n, n, vlen8 |
| + b.ls 3f |
| ld1b_unroll8 |
| - add src_ptr, src_ptr, tmp1 |
| - sub rest, rest, tmp1 |
| - cmp rest, tmp1 |
| - b.cc 2f |
| - .p2align 3 |
| + add src, src, vlen8 |
| + subs n, n, vlen8 |
| + b.ls 2f |
| + |
| + .p2align 4 |
| + /* 8x unrolled and software pipelined loop. */ |
| 1: stld1b_unroll8 |
| - add dest_ptr, dest_ptr, tmp1 |
| - add src_ptr, src_ptr, tmp1 |
| - sub rest, rest, tmp1 |
| - cmp rest, tmp1 |
| - b.ge 1b |
| + add dst, dst, vlen8 |
| + add src, src, vlen8 |
| + subs n, n, vlen8 |
| + b.hi 1b |
| 2: st1b_unroll8 |
| - add dest_ptr, dest_ptr, tmp1 |
| - |
| - .p2align 3 |
| -L(last): |
| - whilelo p0.b, xzr, rest |
| - whilelo p1.b, vector_length, rest |
| - b.last 1f |
| - ld1b z0.b, p0/z, [src_ptr, #0, mul vl] |
| - ld1b z1.b, p1/z, [src_ptr, #1, mul vl] |
| - st1b z0.b, p0, [dest_ptr, #0, mul vl] |
| - st1b z1.b, p1, [dest_ptr, #1, mul vl] |
| - ret |
| -1: lsl tmp1, vector_length, 1 // vector_length * 2 |
| - whilelo p2.b, tmp1, rest |
| - incb tmp1 |
| - whilelo p3.b, tmp1, rest |
| - b.last 1f |
| - ld1b z0.b, p0/z, [src_ptr, #0, mul vl] |
| - ld1b z1.b, p1/z, [src_ptr, #1, mul vl] |
| - ld1b z2.b, p2/z, [src_ptr, #2, mul vl] |
| - ld1b z3.b, p3/z, [src_ptr, #3, mul vl] |
| - st1b z0.b, p0, [dest_ptr, #0, mul vl] |
| - st1b z1.b, p1, [dest_ptr, #1, mul vl] |
| - st1b z2.b, p2, [dest_ptr, #2, mul vl] |
| - st1b z3.b, p3, [dest_ptr, #3, mul vl] |
| + add dst, dst, vlen8 |
| +3: add n, n, vlen8 |
| + |
| + /* Move last 0-8 vectors. */ |
| +L(last_bytes): |
| + cmp n, vlen, lsl 1 |
| + b.hi 1f |
| + whilelo p0.b, xzr, n |
| + whilelo p1.b, vlen, n |
| + ld1b z0.b, p0/z, [src, 0, mul vl] |
| + ld1b z1.b, p1/z, [src, 1, mul vl] |
| + st1b z0.b, p0, [dst, 0, mul vl] |
| + st1b z1.b, p1, [dst, 1, mul vl] |
| ret |
| -1: lsl tmp1, vector_length, 2 // vector_length * 4 |
| - whilelo p4.b, tmp1, rest |
| - incb tmp1 |
| - whilelo p5.b, tmp1, rest |
| - incb tmp1 |
| - whilelo p6.b, tmp1, rest |
| - incb tmp1 |
| - whilelo p7.b, tmp1, rest |
| - ld1b z0.b, p0/z, [src_ptr, #0, mul vl] |
| - ld1b z1.b, p1/z, [src_ptr, #1, mul vl] |
| - ld1b z2.b, p2/z, [src_ptr, #2, mul vl] |
| - ld1b z3.b, p3/z, [src_ptr, #3, mul vl] |
| - ld1b z4.b, p4/z, [src_ptr, #4, mul vl] |
| - ld1b z5.b, p5/z, [src_ptr, #5, mul vl] |
| - ld1b z6.b, p6/z, [src_ptr, #6, mul vl] |
| - ld1b z7.b, p7/z, [src_ptr, #7, mul vl] |
| - st1b z0.b, p0, [dest_ptr, #0, mul vl] |
| - st1b z1.b, p1, [dest_ptr, #1, mul vl] |
| - st1b z2.b, p2, [dest_ptr, #2, mul vl] |
| - st1b z3.b, p3, [dest_ptr, #3, mul vl] |
| - st1b z4.b, p4, [dest_ptr, #4, mul vl] |
| - st1b z5.b, p5, [dest_ptr, #5, mul vl] |
| - st1b z6.b, p6, [dest_ptr, #6, mul vl] |
| - st1b z7.b, p7, [dest_ptr, #7, mul vl] |
| + |
| + .p2align 4 |
| + |
| +1: add srcend, src, n |
| + add dstend, dst, n |
| + ld1b z0.b, p0/z, [src, 0, mul vl] |
| + ld1b z1.b, p0/z, [src, 1, mul vl] |
| + ld1b z2.b, p0/z, [srcend, -2, mul vl] |
| + ld1b z3.b, p0/z, [srcend, -1, mul vl] |
| + cmp n, vlen, lsl 2 |
| + b.hi 1f |
| + |
| + st1b z0.b, p0, [dst, 0, mul vl] |
| + st1b z1.b, p0, [dst, 1, mul vl] |
| + st1b z2.b, p0, [dstend, -2, mul vl] |
| + st1b z3.b, p0, [dstend, -1, mul vl] |
| ret |
| |
| -L(L2): |
| - // align dest address at CACHE_LINE_SIZE byte boundary |
| - mov tmp1, CACHE_LINE_SIZE |
| - ands tmp2, dest_ptr, CACHE_LINE_SIZE - 1 |
| - // if cl_remainder == 0 |
| - b.eq L(L2_dc_zva) |
| - sub cl_remainder, tmp1, tmp2 |
| - // process remainder until the first CACHE_LINE_SIZE boundary |
| - whilelo p1.b, xzr, cl_remainder // keep p0.b all true |
| - whilelo p2.b, vector_length, cl_remainder |
| - b.last 1f |
| - ld1b z1.b, p1/z, [src_ptr, #0, mul vl] |
| - ld1b z2.b, p2/z, [src_ptr, #1, mul vl] |
| - st1b z1.b, p1, [dest_ptr, #0, mul vl] |
| - st1b z2.b, p2, [dest_ptr, #1, mul vl] |
| - b 2f |
| -1: lsl tmp1, vector_length, 1 // vector_length * 2 |
| - whilelo p3.b, tmp1, cl_remainder |
| - incb tmp1 |
| - whilelo p4.b, tmp1, cl_remainder |
| - ld1b z1.b, p1/z, [src_ptr, #0, mul vl] |
| - ld1b z2.b, p2/z, [src_ptr, #1, mul vl] |
| - ld1b z3.b, p3/z, [src_ptr, #2, mul vl] |
| - ld1b z4.b, p4/z, [src_ptr, #3, mul vl] |
| - st1b z1.b, p1, [dest_ptr, #0, mul vl] |
| - st1b z2.b, p2, [dest_ptr, #1, mul vl] |
| - st1b z3.b, p3, [dest_ptr, #2, mul vl] |
| - st1b z4.b, p4, [dest_ptr, #3, mul vl] |
| -2: add dest_ptr, dest_ptr, cl_remainder |
| - add src_ptr, src_ptr, cl_remainder |
| - sub rest, rest, cl_remainder |
| - |
| -L(L2_dc_zva): |
| - // zero fill |
| - and tmp1, dest, 0xffffffffffffff |
| - and tmp2, src, 0xffffffffffffff |
| - subs tmp1, tmp1, tmp2 // diff |
| - b.ge 1f |
| - neg tmp1, tmp1 |
| -1: mov tmp3, ZF_DIST + CACHE_LINE_SIZE * 2 |
| - cmp tmp1, tmp3 |
| - b.lo L(unroll8) |
| - mov tmp1, dest_ptr |
| - dc_zva (ZF_DIST / CACHE_LINE_SIZE) - 1 |
| - // unroll |
| - ld1b_unroll8 // this line has to be after "b.lo L(unroll8)" |
| - add src_ptr, src_ptr, CACHE_LINE_SIZE * 2 |
| - sub rest, rest, CACHE_LINE_SIZE * 2 |
| - mov tmp1, ZF_DIST |
| - .p2align 3 |
| -1: stld1b_unroll4a |
| - add tmp2, dest_ptr, tmp1 // dest_ptr + ZF_DIST |
| - dc zva, tmp2 |
| - stld1b_unroll4b |
| - add tmp2, tmp2, CACHE_LINE_SIZE |
| - dc zva, tmp2 |
| - add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2 |
| - add src_ptr, src_ptr, CACHE_LINE_SIZE * 2 |
| - sub rest, rest, CACHE_LINE_SIZE * 2 |
| - cmp rest, tmp3 // ZF_DIST + CACHE_LINE_SIZE * 2 |
| - b.ge 1b |
| - st1b_unroll8 |
| - add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2 |
| - b L(unroll8) |
| +1: ld1b z4.b, p0/z, [src, 2, mul vl] |
| + ld1b z5.b, p0/z, [src, 3, mul vl] |
| + ld1b z6.b, p0/z, [srcend, -4, mul vl] |
| + ld1b z7.b, p0/z, [srcend, -3, mul vl] |
| + st1b z0.b, p0, [dst, 0, mul vl] |
| + st1b z1.b, p0, [dst, 1, mul vl] |
| + st1b z4.b, p0, [dst, 2, mul vl] |
| + st1b z5.b, p0, [dst, 3, mul vl] |
| + st1b z6.b, p0, [dstend, -4, mul vl] |
| + st1b z7.b, p0, [dstend, -3, mul vl] |
| + st1b z2.b, p0, [dstend, -2, mul vl] |
| + st1b z3.b, p0, [dstend, -1, mul vl] |
| + ret |
| |
| END (MEMCPY) |
| libc_hidden_builtin_def (MEMCPY) |
| |
| |
| -ENTRY (MEMMOVE) |
| +ENTRY_ALIGN (MEMMOVE, 4) |
| |
| PTR_ARG (0) |
| PTR_ARG (1) |
| SIZE_ARG (2) |
| |
| - // remove tag address |
| - // dest has to be immutable because it is the return value |
| - // src has to be immutable because it is used in L(bwd_last) |
| - and tmp2, dest, 0xffffffffffffff // save dest_notag into tmp2 |
| - and tmp3, src, 0xffffffffffffff // save src_notag intp tmp3 |
| - cmp n, 0 |
| - ccmp tmp2, tmp3, 4, ne |
| - b.ne 1f |
| + /* Fast case for up to 2 vectors. */ |
| + cntb vlen |
| + cmp n, vlen, lsl 1 |
| + b.hi 1f |
| + whilelo p0.b, xzr, n |
| + whilelo p1.b, vlen, n |
| + ld1b z0.b, p0/z, [src, 0, mul vl] |
| + ld1b z1.b, p1/z, [src, 1, mul vl] |
| + st1b z0.b, p0, [dstin, 0, mul vl] |
| + st1b z1.b, p1, [dstin, 1, mul vl] |
| +L(full_overlap): |
| ret |
| -1: cntb vector_length |
| - // shortcut for less than vector_length * 8 |
| - // gives a free ptrue to p0.b for n >= vector_length |
| - // tmp2 and tmp3 should not be used in this macro to keep |
| - // notag addresses |
| - shortcut_for_small_size L(dispatch) |
| - // end of shortcut |
| - |
| -L(dispatch): |
| - // tmp2 = dest_notag, tmp3 = src_notag |
| - // diff = dest_notag - src_notag |
| - sub tmp1, tmp2, tmp3 |
| - // if diff <= 0 || diff >= n then memcpy |
| - cmp tmp1, 0 |
| - ccmp tmp1, n, 2, gt |
| - b.cs L(vl_agnostic) |
| - |
| -L(bwd_start): |
| - mov rest, n |
| - add dest_ptr, dest, n // dest_end |
| - add src_ptr, src, n // src_end |
| - |
| -L(bwd_unroll8): // unrolling and software pipeline |
| - lsl tmp1, vector_length, 3 // vector_length * 8 |
| - .p2align 3 |
| - cmp rest, tmp1 |
| - b.cc L(bwd_last) |
| - sub src_ptr, src_ptr, tmp1 |
| + |
| + .p2align 4 |
| + /* Check for overlapping moves. Return if there is a full overlap. |
| + Small moves up to 8 vectors use the overlap-safe copy_small code. |
| + Non-overlapping or overlapping moves with dst < src use memcpy. |
| + Overlapping moves with dst > src use a backward copy loop. */ |
| +1: sub tmp, dstin, src |
| + ands tmp, tmp, 0xffffffffffffff /* Clear special tag bits. */ |
| + b.eq L(full_overlap) |
| + cmp n, vlen, lsl 3 |
| + b.ls L(copy_small) |
| + cmp tmp, n |
| + b.hs L(copy_large) |
| + |
| + /* Align to vector length. */ |
| + add dst, dstin, n |
| + sub tmp, vlen, 1 |
| + ands tmp, dst, tmp |
| + csel tmp, tmp, vlen, ne |
| + whilelo p1.b, xzr, tmp |
| + sub n, n, tmp |
| + ld1b z1.b, p1/z, [src, n] |
| + st1b z1.b, p1, [dstin, n] |
| + add src, src, n |
| + add dst, dstin, n |
| + |
| + ptrue p0.b |
| + lsl vlen8, vlen, 3 |
| + subs n, n, vlen8 |
| + b.ls 3f |
| + sub src, src, vlen8 |
| ld1b_unroll8 |
| - sub rest, rest, tmp1 |
| - cmp rest, tmp1 |
| - b.cc 2f |
| - .p2align 3 |
| -1: sub src_ptr, src_ptr, tmp1 |
| - sub dest_ptr, dest_ptr, tmp1 |
| + subs n, n, vlen8 |
| + b.ls 2f |
| + |
| + .p2align 4 |
| + /* 8x unrolled and software pipelined backward copy loop. */ |
| +1: sub src, src, vlen8 |
| + sub dst, dst, vlen8 |
| stld1b_unroll8 |
| - sub rest, rest, tmp1 |
| - cmp rest, tmp1 |
| - b.ge 1b |
| -2: sub dest_ptr, dest_ptr, tmp1 |
| + subs n, n, vlen8 |
| + b.hi 1b |
| +2: sub dst, dst, vlen8 |
| st1b_unroll8 |
| +3: add n, n, vlen8 |
| |
| -L(bwd_last): |
| - mov dest_ptr, dest |
| - mov src_ptr, src |
| - b L(last) |
| + /* Adjust src/dst for last 0-8 vectors. */ |
| + sub src, src, n |
| + mov dst, dstin |
| + b L(last_bytes) |
| |
| END (MEMMOVE) |
| libc_hidden_builtin_def (MEMMOVE) |
| -- |
| 2.31.1 |
| |