From 91272636c23028e55554be4e677bf40ac22b1adc Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 20 May 2021 13:13:51 -0400
Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S

No bug. This commit makes a few small improvements to
memset-vec-unaligned-erms.S:

1) Only align to 64 instead of 128. Either alignment performs equally
   well in the loop, and 128 just increases the odds of needing an
   extra iteration, which can be significant overhead for small
   values.
2) Align some branch targets and the loop.
3) Remove an ALU instruction from the alignment process.
4) Reorder the last 4x VEC so that they are stored after the loop.
5) Move the check for length <= 8x VEC to before the alignment
   process.

test-memset and test-wmemset are both passing.
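
For reference, the control flow this leaves behind can be sketched in C
roughly as follows. This is an illustrative model only, not glibc code:
VEC_SIZE is shown as 32 (the AVX2 case), and vec_store()/more_2x_vec()
are made-up helpers standing in for the VMOVU/VMOVA stores of the
VEC_SIZE-byte splat held in %VEC(0).

    #include <stddef.h>
    #include <stdint.h>

    #define VEC_SIZE 32   /* illustrative: AVX2 path; 16/64 elsewhere.  */

    /* Stand-in for one VMOVU/VMOVA of the splatted byte.  */
    static void vec_store (unsigned char *p, int c)
    {
      for (size_t i = 0; i < VEC_SIZE; i++)
        p[i] = (unsigned char) c;
    }

    /* Models L(more_2x_vec) and below; caller guarantees
       len > 2 * VEC_SIZE.  */
    static void more_2x_vec (unsigned char *ptr, int c, size_t len)
    {
      /* First 2x VEC stored before the compare: every path needs them.  */
      vec_store (ptr, c);
      vec_store (ptr + VEC_SIZE, c);
      if (len <= 4 * VEC_SIZE)
        {
          /* L(return): the tail stores may overlap the head stores.  */
          vec_store (ptr + len - 2 * VEC_SIZE, c);
          vec_store (ptr + len - VEC_SIZE, c);
          return;
        }
      /* L(loop_start): VEC 3 and 4, then the <= 8x VEC check comes
         before any alignment work is paid for.  */
      vec_store (ptr + 2 * VEC_SIZE, c);
      vec_store (ptr + 3 * VEC_SIZE, c);
      if (len > 8 * VEC_SIZE)
        {
          /* Align down to 64 (2x VEC), not 128, then step past the
             4x VEC already written.  */
          unsigned char *dst = (unsigned char *)
            ((uintptr_t) ptr & -(uintptr_t) (2 * VEC_SIZE)) + 4 * VEC_SIZE;
          unsigned char *end = ptr + len - 4 * VEC_SIZE;
          while (dst < end)
            {
              /* 4x aligned VEC stores per iteration (VMOVA in the
                 real code).  */
              vec_store (dst, c);
              vec_store (dst + VEC_SIZE, c);
              vec_store (dst + 2 * VEC_SIZE, c);
              vec_store (dst + 3 * VEC_SIZE, c);
              dst += 4 * VEC_SIZE;
            }
        }
      /* L(loop_end): the last 4x VEC are stored after the loop,
         addressed from ptr + len, so they may overlap the loop's
         final iteration.  */
      vec_store (ptr + len - 4 * VEC_SIZE, c);
      vec_store (ptr + len - 3 * VEC_SIZE, c);
      vec_store (ptr + len - 2 * VEC_SIZE, c);
      vec_store (ptr + len - VEC_SIZE, c);
    }

Because the trailing 4x VEC stores overlap whatever the loop last
wrote, the loop exit can stay a simple lower-bound compare (jb against
rcx = ptr + len - 4 * VEC_SIZE in the assembly) with no remainder
handling.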

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 6abf27980a947f9b6e514d6b33b83059d39566ae)
---
 .../multiarch/memset-vec-unaligned-erms.S | 50 +++++++++++--------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index f877ac9d..909c33f6 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	VMOVU	%VEC(0), (%rdi)
 	VZEROUPPER_RETURN
 
+	.p2align 4
 L(stosb_more_2x_vec):
 	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
 	ja	L(stosb)
+#else
+	.p2align 4
 #endif
 L(more_2x_vec):
-	cmpq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_start)
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_start)
 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 L(return):
 #if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
@@ -192,28 +197,29 @@ L(return):
 #endif
 
 L(loop_start):
-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
-	VMOVU	%VEC(0), (%rdi)
-	andq	$-(VEC_SIZE * 4), %rcx
-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
-	addq	%rdi, %rdx
-	andq	$-(VEC_SIZE * 4), %rdx
-	cmpq	%rdx, %rcx
-	je	L(return)
+	cmpq	$(VEC_SIZE * 8), %rdx
+	jbe	L(loop_end)
+	andq	$-(VEC_SIZE * 2), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
+	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
+	.p2align 4
 L(loop):
-	VMOVA	%VEC(0), (%rcx)
-	VMOVA	%VEC(0), VEC_SIZE(%rcx)
-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
-	addq	$(VEC_SIZE * 4), %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(0), VEC_SIZE(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rcx, %rdi
+	jb	L(loop)
+L(loop_end):
+	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+	       rdx as length is also unchanged.  */
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 	VZEROUPPER_SHORT_RETURN
 
 	.p2align 4
--
GitLab
