| commit a7392db2ff2b9dd906500941ac6361dbe2211b0d |
| Author: Noah Goldstein <goldstein.w.n@gmail.com> |
| Date: Mon Nov 1 00:49:51 2021 -0500 |
| |
| x86: Optimize memmove-vec-unaligned-erms.S |
| |
| No bug. |
| |
| The optimizations are as follows: |
| |
| 1) Always align entry to 64 bytes. This makes behavior more |
| predictable and makes other frontend optimizations easier. |
| |
| 2) Make the L(more_8x_vec) cases 4k aliasing aware (see the first |
| sketch below). This can have significant benefits in the case |
| that: |
| 0 < (dst - src) < [256, 512] |
| |
| 3) Align before `rep movsb` (see the second sketch below). For ERMS |
| this is roughly a [0, 30%] improvement and for FSRM [-10%, 25%]. |
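| |
| For reference, the copy-direction decision in (2) can be written |
| out in C roughly as below. The sketch is illustrative only and not |
| part of the patch; the function name is made up and PAGE_SIZE is |
| 4096 as in the assembly. |
| |
|     #include <stddef.h> |
|     #include <stdint.h> |
| |
|     #define PAGE_SIZE 4096 |
| |
|     /* Nonzero when the 4x-VEC loop should copy backward.  */ |
|     static int |
|     prefer_backward_copy (uintptr_t dst, uintptr_t src, size_t len) |
|     { |
|       uintptr_t diff = dst - src;  /* wraps mod 2^64, as in the asm */ |
| |
|       /* dst starts inside [src, src + len): a forward copy would |
|          clobber source bytes it has not read yet, so go backward.  */ |
|       if (diff < len) |
|         return 1; |
| |
|       /* src starts inside [dst, dst + len): correctness requires a |
|          forward copy.  The assembly detects this by the sign bit of |
|          (dst - src) differing from that of (dst - src + len).  */ |
|       if ((intptr_t) (diff ^ (diff + len)) < 0) |
|         return 0; |
| |
|       /* No overlap: go backward only when (dst - src) mod 4096 is |
|          below 256, where the forward loop's loads from src would |
|          falsely alias its earlier stores to dst (4k aliasing).  */ |
|       return (diff & (PAGE_SIZE - 256)) == 0; |
|     } |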
| |
| In addition to these primary changes there is general cleanup |
| throughout to optimize the aligning routines and control flow logic. |
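| |
| The alignment in (3) is sketched below, again illustrative only and |
| not part of the patch: memcpy stands in for `rep movsb` and the |
| vector loads/stores, the function name is made up, and it assumes |
| len >= 64 and non-overlapping buffers. |
| |
|     #include <stdint.h> |
|     #include <string.h> |
| |
|     #define MOVSB_ALIGN_TO 64 |
| |
|     static void |
|     copy_align_dst (char *dst, const char *src, size_t len) |
|     { |
|       /* Load the unaligned head up front (VEC(0)/VEC(1) in the |
|          assembly).  */ |
|       char head[MOVSB_ALIGN_TO]; |
|       memcpy (head, src, MOVSB_ALIGN_TO); |
| |
|       /* Round dst up to the next 64-byte boundary.  */ |
|       char *dst_aligned |
|         = (char *) (((uintptr_t) dst + MOVSB_ALIGN_TO - 1) |
|                     & -(uintptr_t) MOVSB_ALIGN_TO); |
|       size_t skip = (size_t) (dst_aligned - dst); |
| |
|       /* Bulk copy starting at the aligned destination; this is the |
|          `rep movsb` in the assembly.  */ |
|       memcpy (dst_aligned, src + skip, len - skip); |
| |
|       /* Store the head last; it covers the skipped unaligned |
|          bytes.  */ |
|       memcpy (dst, head, MOVSB_ALIGN_TO); |
|     } |
| |
| When the CPU lacks FSRM and dst/src 4k-alias, the patch aligns the |
| source instead of the destination for the same reason; see |
| L(skip_short_movsb_check) in the diff. |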
| |
| Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> |
| Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
| (cherry picked from commit a6b7502ec0c2da89a7437f43171f160d713e39c6) |
| |
| diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S |
| index db106a7a1f23f268..b2b318084823dceb 100644 |
| |
| |
| @@ -25,7 +25,7 @@ |
| /* Use movups and movaps for smaller code sizes. */ |
| #define VMOVU movups |
| #define VMOVA movaps |
| - |
| +#define MOV_SIZE 3 |
| #define SECTION(p) p |
| |
| #ifdef USE_MULTIARCH |
| diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S |
| index 1ec1962e861dbf63..67a55f0c85af841c 100644 |
| |
| |
| @@ -4,7 +4,7 @@ |
| # define VMOVNT vmovntdq |
| # define VMOVU vmovdqu |
| # define VMOVA vmovdqa |
| - |
| +# define MOV_SIZE 4 |
| # define ZERO_UPPER_VEC_REGISTERS_RETURN \ |
| ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST |
| |
| diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S |
| index e195e93f153c9512..975ae6c0515b83cb 100644 |
| |
| |
| @@ -4,7 +4,7 @@ |
| # define VMOVNT vmovntdq |
| # define VMOVU vmovdqu |
| # define VMOVA vmovdqa |
| - |
| +# define MOV_SIZE 4 |
| # define SECTION(p) p##.avx |
| # define MEMMOVE_SYMBOL(p,s) p##_avx_##s |
| |
| diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S |
| index 848848ab39ff9326..0fa7126830af7acb 100644 |
| |
| |
| @@ -25,7 +25,7 @@ |
| # define VMOVU vmovdqu64 |
| # define VMOVA vmovdqa64 |
| # define VZEROUPPER |
| - |
| +# define MOV_SIZE 6 |
| # define SECTION(p) p##.evex512 |
| # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s |
| |
| diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S |
| index 0cbce8f944da51a0..88715441feaaccf5 100644 |
| |
| |
| @@ -25,7 +25,7 @@ |
| # define VMOVU vmovdqu64 |
| # define VMOVA vmovdqa64 |
| # define VZEROUPPER |
| - |
| +# define MOV_SIZE 6 |
| # define SECTION(p) p##.evex |
| # define MEMMOVE_SYMBOL(p,s) p##_evex_##s |
| |
| diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S |
| index abde8438d41f2320..7b27cbdda5fb99f7 100644 |
| |
| |
| @@ -76,6 +76,25 @@ |
| # endif |
| #endif |
| |
| +/* Whether to align before movsb. Ultimately we want 64 byte |
| + alignment and it is not worth loading 4x VEC for VEC_SIZE == 16. */ |
| +#define ALIGN_MOVSB (VEC_SIZE > 16) |
| +/* Number of bytes to align movsb to. */ |
| +#define MOVSB_ALIGN_TO 64 |
| + |
| +#define SMALL_MOV_SIZE (MOV_SIZE <= 4) |
| +#define LARGE_MOV_SIZE (MOV_SIZE > 4) |
| + |
| +#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1 |
| +# error MOV_SIZE Unknown |
| +#endif |
| + |
| +#if LARGE_MOV_SIZE |
| +# define SMALL_SIZE_OFFSET (4) |
| +#else |
| +# define SMALL_SIZE_OFFSET (0) |
| +#endif |
| + |
| #ifndef PAGE_SIZE |
| # define PAGE_SIZE 4096 |
| #endif |
| @@ -199,25 +218,21 @@ L(start): |
| # endif |
| cmp $VEC_SIZE, %RDX_LP |
| jb L(less_vec) |
| + /* Load regardless. */ |
| + VMOVU (%rsi), %VEC(0) |
| cmp $(VEC_SIZE * 2), %RDX_LP |
| ja L(more_2x_vec) |
| -#if !defined USE_MULTIARCH || !IS_IN (libc) |
| -L(last_2x_vec): |
| -#endif |
| /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ |
| - VMOVU (%rsi), %VEC(0) |
| VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
| VMOVU %VEC(0), (%rdi) |
| VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
| -#if !defined USE_MULTIARCH || !IS_IN (libc) |
| -L(nop): |
| - ret |
| +#if !(defined USE_MULTIARCH && IS_IN (libc)) |
| + ZERO_UPPER_VEC_REGISTERS_RETURN |
| #else |
| VZEROUPPER_RETURN |
| #endif |
| #if defined USE_MULTIARCH && IS_IN (libc) |
| END (MEMMOVE_SYMBOL (__memmove, unaligned)) |
| - |
| # if VEC_SIZE == 16 |
| ENTRY (__mempcpy_chk_erms) |
| cmp %RDX_LP, %RCX_LP |
| @@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
| END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
| # endif |
| |
| -ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
| +ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6) |
| movq %rdi, %rax |
| L(start_erms): |
| # ifdef __ILP32__ |
| @@ -298,310 +313,448 @@ L(start_erms): |
| # endif |
| cmp $VEC_SIZE, %RDX_LP |
| jb L(less_vec) |
| + /* Load regardless. */ |
| + VMOVU (%rsi), %VEC(0) |
| cmp $(VEC_SIZE * 2), %RDX_LP |
| ja L(movsb_more_2x_vec) |
| -L(last_2x_vec): |
| - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ |
| - VMOVU (%rsi), %VEC(0) |
| - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
| + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. |
| + */ |
| + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1) |
| VMOVU %VEC(0), (%rdi) |
| - VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
| + VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx) |
| L(return): |
| -#if VEC_SIZE > 16 |
| +# if VEC_SIZE > 16 |
| ZERO_UPPER_VEC_REGISTERS_RETURN |
| -#else |
| +# else |
| ret |
| +# endif |
| #endif |
| |
| -L(movsb): |
| - cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP |
| - jae L(more_8x_vec) |
| - cmpq %rsi, %rdi |
| - jb 1f |
| - /* Source == destination is less common. */ |
| - je L(nop) |
| - leaq (%rsi,%rdx), %r9 |
| - cmpq %r9, %rdi |
| - /* Avoid slow backward REP MOVSB. */ |
| - jb L(more_8x_vec_backward) |
| -# if AVOID_SHORT_DISTANCE_REP_MOVSB |
| - testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) |
| - jz 3f |
| - movq %rdi, %rcx |
| - subq %rsi, %rcx |
| - jmp 2f |
| -# endif |
| -1: |
| -# if AVOID_SHORT_DISTANCE_REP_MOVSB |
| - testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) |
| - jz 3f |
| - movq %rsi, %rcx |
| - subq %rdi, %rcx |
| -2: |
| -/* Avoid "rep movsb" if RCX, the distance between source and destination, |
| - is N*4GB + [1..63] with N >= 0. */ |
| - cmpl $63, %ecx |
| - jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */ |
| -3: |
| -# endif |
| - mov %RDX_LP, %RCX_LP |
| - rep movsb |
| -L(nop): |
| +#if LARGE_MOV_SIZE |
| + /* If LARGE_MOV_SIZE this fits in the aligning bytes between the |
| + ENTRY block and L(less_vec). */ |
| + .p2align 4,, 8 |
| +L(between_4_7): |
| + /* From 4 to 7. No branch when size == 4. */ |
| + movl (%rsi), %ecx |
| + movl (%rsi, %rdx), %esi |
| + movl %ecx, (%rdi) |
| + movl %esi, (%rdi, %rdx) |
| ret |
| #endif |
| |
| + .p2align 4 |
| L(less_vec): |
| /* Less than 1 VEC. */ |
| #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 |
| # error Unsupported VEC_SIZE! |
| #endif |
| #if VEC_SIZE > 32 |
| - cmpb $32, %dl |
| + cmpl $32, %edx |
| jae L(between_32_63) |
| #endif |
| #if VEC_SIZE > 16 |
| - cmpb $16, %dl |
| + cmpl $16, %edx |
| jae L(between_16_31) |
| #endif |
| - cmpb $8, %dl |
| + cmpl $8, %edx |
| jae L(between_8_15) |
| - cmpb $4, %dl |
| +#if SMALL_MOV_SIZE |
| + cmpl $4, %edx |
| +#else |
| + subq $4, %rdx |
| +#endif |
| jae L(between_4_7) |
| - cmpb $1, %dl |
| - ja L(between_2_3) |
| - jb 1f |
| - movzbl (%rsi), %ecx |
| + cmpl $(1 - SMALL_SIZE_OFFSET), %edx |
| + jl L(copy_0) |
| + movb (%rsi), %cl |
| + je L(copy_1) |
| + movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi |
| + movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx) |
| +L(copy_1): |
| movb %cl, (%rdi) |
| -1: |
| +L(copy_0): |
| ret |
| + |
| +#if SMALL_MOV_SIZE |
| + .p2align 4,, 8 |
| +L(between_4_7): |
| + /* From 4 to 7. No branch when size == 4. */ |
| + movl -4(%rsi, %rdx), %ecx |
| + movl (%rsi), %esi |
| + movl %ecx, -4(%rdi, %rdx) |
| + movl %esi, (%rdi) |
| + ret |
| +#endif |
| + |
| +#if VEC_SIZE > 16 |
| + /* From 16 to 31. No branch when size == 16. */ |
| + .p2align 4,, 8 |
| +L(between_16_31): |
| + vmovdqu (%rsi), %xmm0 |
| + vmovdqu -16(%rsi, %rdx), %xmm1 |
| + vmovdqu %xmm0, (%rdi) |
| + vmovdqu %xmm1, -16(%rdi, %rdx) |
| + /* No ymm registers have been touched. */ |
| + ret |
| +#endif |
| + |
| #if VEC_SIZE > 32 |
| + .p2align 4,, 10 |
| L(between_32_63): |
| /* From 32 to 63. No branch when size == 32. */ |
| VMOVU (%rsi), %YMM0 |
| - VMOVU -32(%rsi,%rdx), %YMM1 |
| + VMOVU -32(%rsi, %rdx), %YMM1 |
| VMOVU %YMM0, (%rdi) |
| - VMOVU %YMM1, -32(%rdi,%rdx) |
| - VZEROUPPER_RETURN |
| -#endif |
| -#if VEC_SIZE > 16 |
| - /* From 16 to 31. No branch when size == 16. */ |
| -L(between_16_31): |
| - VMOVU (%rsi), %XMM0 |
| - VMOVU -16(%rsi,%rdx), %XMM1 |
| - VMOVU %XMM0, (%rdi) |
| - VMOVU %XMM1, -16(%rdi,%rdx) |
| + VMOVU %YMM1, -32(%rdi, %rdx) |
| VZEROUPPER_RETURN |
| #endif |
| + |
| + .p2align 4,, 10 |
| L(between_8_15): |
| /* From 8 to 15. No branch when size == 8. */ |
| - movq -8(%rsi,%rdx), %rcx |
| + movq -8(%rsi, %rdx), %rcx |
| movq (%rsi), %rsi |
| - movq %rcx, -8(%rdi,%rdx) |
| movq %rsi, (%rdi) |
| + movq %rcx, -8(%rdi, %rdx) |
| ret |
| -L(between_4_7): |
| - /* From 4 to 7. No branch when size == 4. */ |
| - movl -4(%rsi,%rdx), %ecx |
| - movl (%rsi), %esi |
| - movl %ecx, -4(%rdi,%rdx) |
| - movl %esi, (%rdi) |
| - ret |
| -L(between_2_3): |
| - /* From 2 to 3. No branch when size == 2. */ |
| - movzwl -2(%rsi,%rdx), %ecx |
| - movzwl (%rsi), %esi |
| - movw %cx, -2(%rdi,%rdx) |
| - movw %si, (%rdi) |
| - ret |
| |
| + .p2align 4,, 10 |
| +L(last_4x_vec): |
| + /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ |
| + |
| + /* VEC(0) and VEC(1) have already been loaded. */ |
| + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2) |
| + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3) |
| + VMOVU %VEC(0), (%rdi) |
| + VMOVU %VEC(1), VEC_SIZE(%rdi) |
| + VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx) |
| + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx) |
| + VZEROUPPER_RETURN |
| + |
| + .p2align 4 |
| #if defined USE_MULTIARCH && IS_IN (libc) |
| L(movsb_more_2x_vec): |
| cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
| ja L(movsb) |
| #endif |
| L(more_2x_vec): |
| - /* More than 2 * VEC and there may be overlap between destination |
| - and source. */ |
| + /* More than 2 * VEC and there may be overlap between |
| + destination and source. */ |
| cmpq $(VEC_SIZE * 8), %rdx |
| ja L(more_8x_vec) |
| + /* Load VEC(1) regardless. VEC(0) has already been loaded. */ |
| + VMOVU VEC_SIZE(%rsi), %VEC(1) |
| cmpq $(VEC_SIZE * 4), %rdx |
| jbe L(last_4x_vec) |
| - /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ |
| - VMOVU (%rsi), %VEC(0) |
| - VMOVU VEC_SIZE(%rsi), %VEC(1) |
| + /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ |
| VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
| VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
| - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) |
| - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) |
| - VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) |
| - VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) |
| + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4) |
| + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5) |
| + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6) |
| + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7) |
| VMOVU %VEC(0), (%rdi) |
| VMOVU %VEC(1), VEC_SIZE(%rdi) |
| VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
| VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
| - VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) |
| - VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) |
| - VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) |
| - VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) |
| - VZEROUPPER_RETURN |
| -L(last_4x_vec): |
| - /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ |
| - VMOVU (%rsi), %VEC(0) |
| - VMOVU VEC_SIZE(%rsi), %VEC(1) |
| - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) |
| - VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) |
| - VMOVU %VEC(0), (%rdi) |
| - VMOVU %VEC(1), VEC_SIZE(%rdi) |
| - VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) |
| - VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) |
| + VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx) |
| + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx) |
| + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx) |
| + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx) |
| VZEROUPPER_RETURN |
| |
| + .p2align 4,, 4 |
| L(more_8x_vec): |
| + movq %rdi, %rcx |
| + subq %rsi, %rcx |
| + /* Go to backwards temporal copy if overlap no matter what as |
| + backward REP MOVSB is slow and we don't want to use NT stores if |
| + there is overlap. */ |
| + cmpq %rdx, %rcx |
| + /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ |
| + jb L(more_8x_vec_backward_check_nop) |
| /* Check if non-temporal move candidate. */ |
| #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
| /* Check non-temporal store threshold. */ |
| - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
| + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
| ja L(large_memcpy_2x) |
| #endif |
| - /* Entry if rdx is greater than non-temporal threshold but there |
| - is overlap. */ |
| + /* To reach this point there cannot be overlap and dst > src. So |
| + check for overlap and src > dst in which case correctness |
| + requires forward copy. Otherwise decide between backward/forward |
| + copy depending on address aliasing. */ |
| + |
| + /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold |
| + but less than __x86_shared_non_temporal_threshold. */ |
| L(more_8x_vec_check): |
| - cmpq %rsi, %rdi |
| - ja L(more_8x_vec_backward) |
| - /* Source == destination is less common. */ |
| - je L(nop) |
| - /* Load the first VEC and last 4 * VEC to support overlapping |
| - addresses. */ |
| - VMOVU (%rsi), %VEC(4) |
| + /* rcx contains dst - src. Add back length (rdx). */ |
| + leaq (%rcx, %rdx), %r8 |
| + /* If r8 has different sign than rcx then there is overlap so we |
| + must do forward copy. */ |
| + xorq %rcx, %r8 |
| + /* Isolate just sign bit of r8. */ |
| + shrq $63, %r8 |
| + /* Get 4k difference dst - src. */ |
| + andl $(PAGE_SIZE - 256), %ecx |
| + /* If r8 is non-zero must do forward for correctness. Otherwise |
| + if ecx is non-zero there is 4k False Aliasing so do backward |
| + copy. */ |
| + addl %r8d, %ecx |
| + jz L(more_8x_vec_backward) |
| + |
| + /* if rdx is greater than __x86_shared_non_temporal_threshold |
| + but there is overlap, or from short distance movsb. */ |
| +L(more_8x_vec_forward): |
| + /* Load first and last 4 * VEC to support overlapping addresses. |
| + */ |
| + |
| + /* First vec was already loaded into VEC(0). */ |
| VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) |
| VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) |
| + /* Save beginning of dst. */ |
| + movq %rdi, %rcx |
| + /* Align dst to VEC_SIZE - 1. */ |
| + orq $(VEC_SIZE - 1), %rdi |
| VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) |
| VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) |
| - /* Save start and stop of the destination buffer. */ |
| - movq %rdi, %r11 |
| - leaq -VEC_SIZE(%rdi, %rdx), %rcx |
| - /* Align destination for aligned stores in the loop. Compute |
| - how much destination is misaligned. */ |
| - movq %rdi, %r8 |
| - andq $(VEC_SIZE - 1), %r8 |
| - /* Get the negative of offset for alignment. */ |
| - subq $VEC_SIZE, %r8 |
| - /* Adjust source. */ |
| - subq %r8, %rsi |
| - /* Adjust destination which should be aligned now. */ |
| - subq %r8, %rdi |
| - /* Adjust length. */ |
| - addq %r8, %rdx |
| |
| - .p2align 4 |
| + /* Subtract dst from src. Add back after dst aligned. */ |
| + subq %rcx, %rsi |
| + /* Finish aligning dst. */ |
| + incq %rdi |
| + /* Restore src adjusted with new value for aligned dst. */ |
| + addq %rdi, %rsi |
| + /* Store end of buffer minus tail in rdx. */ |
| + leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx |
| + |
| + /* Don't use multi-byte nop to align. */ |
| + .p2align 4,, 11 |
| L(loop_4x_vec_forward): |
| /* Copy 4 * VEC a time forward. */ |
| - VMOVU (%rsi), %VEC(0) |
| - VMOVU VEC_SIZE(%rsi), %VEC(1) |
| - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
| - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
| + VMOVU (%rsi), %VEC(1) |
| + VMOVU VEC_SIZE(%rsi), %VEC(2) |
| + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3) |
| + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4) |
| subq $-(VEC_SIZE * 4), %rsi |
| - addq $-(VEC_SIZE * 4), %rdx |
| - VMOVA %VEC(0), (%rdi) |
| - VMOVA %VEC(1), VEC_SIZE(%rdi) |
| - VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
| - VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
| + VMOVA %VEC(1), (%rdi) |
| + VMOVA %VEC(2), VEC_SIZE(%rdi) |
| + VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi) |
| + VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi) |
| subq $-(VEC_SIZE * 4), %rdi |
| - cmpq $(VEC_SIZE * 4), %rdx |
| + cmpq %rdi, %rdx |
| ja L(loop_4x_vec_forward) |
| /* Store the last 4 * VEC. */ |
| - VMOVU %VEC(5), (%rcx) |
| - VMOVU %VEC(6), -VEC_SIZE(%rcx) |
| - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) |
| - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) |
| + VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx) |
| + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx) |
| + VMOVU %VEC(7), VEC_SIZE(%rdx) |
| + VMOVU %VEC(8), (%rdx) |
| /* Store the first VEC. */ |
| - VMOVU %VEC(4), (%r11) |
| + VMOVU %VEC(0), (%rcx) |
| + /* Keep L(nop_backward) target close to jmp for 2-byte encoding. |
| + */ |
| +L(nop_backward): |
| VZEROUPPER_RETURN |
| |
| + .p2align 4,, 8 |
| +L(more_8x_vec_backward_check_nop): |
| + /* rcx contains dst - src. Test for dst == src to skip all of |
| + memmove. */ |
| + testq %rcx, %rcx |
| + jz L(nop_backward) |
| L(more_8x_vec_backward): |
| /* Load the first 4 * VEC and last VEC to support overlapping |
| addresses. */ |
| - VMOVU (%rsi), %VEC(4) |
| + |
| + /* First vec was also loaded into VEC(0). */ |
| VMOVU VEC_SIZE(%rsi), %VEC(5) |
| VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) |
| + /* Beginning of region for 4x backward copy stored in rcx. */ |
| + leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx |
| VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) |
| - VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) |
| - /* Save stop of the destination buffer. */ |
| - leaq -VEC_SIZE(%rdi, %rdx), %r11 |
| - /* Align destination end for aligned stores in the loop. Compute |
| - how much destination end is misaligned. */ |
| - leaq -VEC_SIZE(%rsi, %rdx), %rcx |
| - movq %r11, %r9 |
| - movq %r11, %r8 |
| - andq $(VEC_SIZE - 1), %r8 |
| - /* Adjust source. */ |
| - subq %r8, %rcx |
| - /* Adjust the end of destination which should be aligned now. */ |
| - subq %r8, %r9 |
| - /* Adjust length. */ |
| - subq %r8, %rdx |
| - |
| - .p2align 4 |
| + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8) |
| + /* Subtract dst from src. Add back after dst aligned. */ |
| + subq %rdi, %rsi |
| + /* Align dst. */ |
| + andq $-(VEC_SIZE), %rcx |
| + /* Restore src. */ |
| + addq %rcx, %rsi |
| + |
| + /* Don't use multi-byte nop to align. */ |
| + .p2align 4,, 11 |
| L(loop_4x_vec_backward): |
| /* Copy 4 * VEC a time backward. */ |
| - VMOVU (%rcx), %VEC(0) |
| - VMOVU -VEC_SIZE(%rcx), %VEC(1) |
| - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) |
| - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) |
| - addq $-(VEC_SIZE * 4), %rcx |
| - addq $-(VEC_SIZE * 4), %rdx |
| - VMOVA %VEC(0), (%r9) |
| - VMOVA %VEC(1), -VEC_SIZE(%r9) |
| - VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) |
| - VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) |
| - addq $-(VEC_SIZE * 4), %r9 |
| - cmpq $(VEC_SIZE * 4), %rdx |
| - ja L(loop_4x_vec_backward) |
| + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1) |
| + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
| + VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3) |
| + VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4) |
| + addq $(VEC_SIZE * -4), %rsi |
| + VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx) |
| + VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx) |
| + VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx) |
| + VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx) |
| + addq $(VEC_SIZE * -4), %rcx |
| + cmpq %rcx, %rdi |
| + jb L(loop_4x_vec_backward) |
| /* Store the first 4 * VEC. */ |
| - VMOVU %VEC(4), (%rdi) |
| + VMOVU %VEC(0), (%rdi) |
| VMOVU %VEC(5), VEC_SIZE(%rdi) |
| VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) |
| VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) |
| /* Store the last VEC. */ |
| - VMOVU %VEC(8), (%r11) |
| + VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi) |
| + VZEROUPPER_RETURN |
| + |
| +#if defined USE_MULTIARCH && IS_IN (libc) |
| + /* L(skip_short_movsb_check) is only used with ERMS. Not for |
| + FSRM. */ |
| + .p2align 5,, 16 |
| +# if ALIGN_MOVSB |
| +L(skip_short_movsb_check): |
| +# if MOVSB_ALIGN_TO > VEC_SIZE |
| + VMOVU VEC_SIZE(%rsi), %VEC(1) |
| +# endif |
| +# if MOVSB_ALIGN_TO > (VEC_SIZE * 2) |
| +# error Unsupported MOVSB_ALIGN_TO |
| +# endif |
| + /* If CPU does not have FSRM two options for aligning. Align src |
| + if dst and src 4k alias. Otherwise align dst. */ |
| + testl $(PAGE_SIZE - 512), %ecx |
| + jnz L(movsb_align_dst) |
| + /* Fall through. dst and src 4k alias. It's better to align src |
| + here because the bottleneck will be loads due to the false |
| + dependency on dst. */ |
| + |
| + /* rcx already has dst - src. */ |
| + movq %rcx, %r9 |
| + /* Add src to len. Subtract back after src aligned. -1 because |
| + src is initially aligned to MOVSB_ALIGN_TO - 1. */ |
| + leaq -1(%rsi, %rdx), %rcx |
| + /* Inclusively align src to MOVSB_ALIGN_TO - 1. */ |
| + orq $(MOVSB_ALIGN_TO - 1), %rsi |
| + /* Restore dst and len adjusted with new values for aligned src. |
| + */ |
| + leaq 1(%rsi, %r9), %rdi |
| + subq %rsi, %rcx |
| + /* Finish aligning src. */ |
| + incq %rsi |
| + |
| + rep movsb |
| + |
| + VMOVU %VEC(0), (%r8) |
| +# if MOVSB_ALIGN_TO > VEC_SIZE |
| + VMOVU %VEC(1), VEC_SIZE(%r8) |
| +# endif |
| VZEROUPPER_RETURN |
| +# endif |
| + |
| + .p2align 4,, 12 |
| +L(movsb): |
| + movq %rdi, %rcx |
| + subq %rsi, %rcx |
| + /* Go to backwards temporal copy if overlap no matter what as |
| + backward REP MOVSB is slow and we don't want to use NT stores if |
| + there is overlap. */ |
| + cmpq %rdx, %rcx |
| + /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ |
| + jb L(more_8x_vec_backward_check_nop) |
| +# if ALIGN_MOVSB |
| + /* Save dest for storing aligning VECs later. */ |
| + movq %rdi, %r8 |
| +# endif |
| + /* If above __x86_rep_movsb_stop_threshold most likely is |
| + candidate for NT moves as well. */ |
| + cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP |
| + jae L(large_memcpy_2x_check) |
| +# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB |
| + /* Only avoid short movsb if CPU has FSRM. */ |
| + testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) |
| + jz L(skip_short_movsb_check) |
| +# if AVOID_SHORT_DISTANCE_REP_MOVSB |
| + /* Avoid "rep movsb" if RCX, the distance between source and |
| + destination, is N*4GB + [1..63] with N >= 0. */ |
| + |
| + /* ecx contains dst - src. The early check for backward copy |
| + conditions means the only remaining case of slow movsb, |
| + src = dst + [0, 63], corresponds to ecx in [-63, 0]. Use an |
| + unsigned comparison with -64 to check for that case. */ |
| + cmpl $-64, %ecx |
| + ja L(more_8x_vec_forward) |
| +# endif |
| +# endif |
| +# if ALIGN_MOVSB |
| +# if MOVSB_ALIGN_TO > VEC_SIZE |
| + VMOVU VEC_SIZE(%rsi), %VEC(1) |
| +# endif |
| +# if MOVSB_ALIGN_TO > (VEC_SIZE * 2) |
| +# error Unsupported MOVSB_ALIGN_TO |
| +# endif |
| + /* Fall through means cpu has FSRM. In that case exclusively |
| + align destination. */ |
| +L(movsb_align_dst): |
| + /* Subtract dst from src. Add back after dst aligned. */ |
| + subq %rdi, %rsi |
| + /* Exclusively align dst to MOVSB_ALIGN_TO (64). */ |
| + addq $(MOVSB_ALIGN_TO - 1), %rdi |
| + /* Add dst to len. Subtract back after dst aligned. */ |
| + leaq (%r8, %rdx), %rcx |
| + /* Finish aligning dst. */ |
| + andq $-(MOVSB_ALIGN_TO), %rdi |
| + /* Restore src and len adjusted with new values for aligned dst. |
| + */ |
| + addq %rdi, %rsi |
| + subq %rdi, %rcx |
| + |
| + rep movsb |
| + |
| + /* Store VECs loaded for aligning. */ |
| + VMOVU %VEC(0), (%r8) |
| +# if MOVSB_ALIGN_TO > VEC_SIZE |
| + VMOVU %VEC(1), VEC_SIZE(%r8) |
| +# endif |
| + VZEROUPPER_RETURN |
| +# else /* !ALIGN_MOVSB. */ |
| +L(skip_short_movsb_check): |
| + mov %RDX_LP, %RCX_LP |
| + rep movsb |
| + ret |
| +# endif |
| +#endif |
| |
| + .p2align 4,, 10 |
| #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
| - .p2align 4 |
| +L(large_memcpy_2x_check): |
| + cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
| + jb L(more_8x_vec_check) |
| L(large_memcpy_2x): |
| - /* Compute absolute value of difference between source and |
| - destination. */ |
| - movq %rdi, %r9 |
| - subq %rsi, %r9 |
| - movq %r9, %r8 |
| - leaq -1(%r9), %rcx |
| - sarq $63, %r8 |
| - xorq %r8, %r9 |
| - subq %r8, %r9 |
| - /* Don't use non-temporal store if there is overlap between |
| - destination and source since destination may be in cache when |
| - source is loaded. */ |
| - cmpq %r9, %rdx |
| - ja L(more_8x_vec_check) |
| + /* To reach this point it is impossible for dst > src and |
| + overlap. Remaining to check is src > dst and overlap. rcx |
| + already contains dst - src. Negate rcx to get src - dst. If |
| + length > rcx then there is overlap and forward copy is best. */ |
| + negq %rcx |
| + cmpq %rcx, %rdx |
| + ja L(more_8x_vec_forward) |
| |
| /* Cache align destination. First store the first 64 bytes then |
| adjust alignments. */ |
| - VMOVU (%rsi), %VEC(8) |
| -#if VEC_SIZE < 64 |
| - VMOVU VEC_SIZE(%rsi), %VEC(9) |
| -#if VEC_SIZE < 32 |
| - VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) |
| - VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) |
| -#endif |
| -#endif |
| - VMOVU %VEC(8), (%rdi) |
| -#if VEC_SIZE < 64 |
| - VMOVU %VEC(9), VEC_SIZE(%rdi) |
| -#if VEC_SIZE < 32 |
| - VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) |
| - VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) |
| -#endif |
| -#endif |
| + |
| + /* First vec was also loaded into VEC(0). */ |
| +# if VEC_SIZE < 64 |
| + VMOVU VEC_SIZE(%rsi), %VEC(1) |
| +# if VEC_SIZE < 32 |
| + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
| + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
| +# endif |
| +# endif |
| + VMOVU %VEC(0), (%rdi) |
| +# if VEC_SIZE < 64 |
| + VMOVU %VEC(1), VEC_SIZE(%rdi) |
| +# if VEC_SIZE < 32 |
| + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
| + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
| +# endif |
| +# endif |
| + |
| /* Adjust source, destination, and size. */ |
| movq %rdi, %r8 |
| andq $63, %r8 |
| @@ -614,9 +767,13 @@ L(large_memcpy_2x): |
| /* Adjust length. */ |
| addq %r8, %rdx |
| |
| - /* Test if source and destination addresses will alias. If they do |
| - the larger pipeline in large_memcpy_4x alleviated the |
| + /* Test if source and destination addresses will alias. If they |
| + do, the larger pipeline in large_memcpy_4x alleviates the |
| performance drop. */ |
| + |
| + /* ecx contains -(dst - src). not ecx will return dst - src - 1 |
| + which works for testing aliasing. */ |
| + notl %ecx |
| testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx |
| jz L(large_memcpy_4x) |
| |
| @@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer): |
| /* ecx stores inner loop counter. */ |
| movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
| L(loop_large_memcpy_4x_inner): |
| - /* Only one prefetch set per page as doing 4 pages give more time |
| - for prefetcher to keep up. */ |
| + /* Only one prefetch set per page as doing 4 pages gives more |
| + time for the prefetcher to keep up. */ |
| PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
| PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
| PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) |