| commit ea19c490a3f5628d55ded271cbb753e66b2f05e8 |
| Author: Noah Goldstein <goldstein.w.n@gmail.com> |
| Date: Sun Feb 6 00:54:18 2022 -0600 |
| |
| x86: Improve vec generation in memset-vec-unaligned-erms.S |
| |
| No bug. |
| |
| Split vec generation into multiple steps. This allows the |
| broadcast in AVX2 to use 'xmm' registers for the L(less_vec) |
| case. This saves an expensive lane-cross instruction and removes |
| the need for 'vzeroupper'. |
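| |
| For illustration only (not part of the patch text), the AVX2 memset entry |
| now does roughly the following; all names are the macros introduced below: |
| |
| MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) /* vmovd only, no broadcast */ |
| cmpq $VEC_SIZE, %rdx |
| jb L(less_vec) |
| MEMSET_VDUP_TO_VEC0_HIGH () /* vpbroadcastb %xmm0, %ymm0 */ |
| ... |
| L(less_vec): |
| MEMSET_VDUP_TO_VEC0_LOW () /* vpbroadcastb %xmm0, %xmm0: stays in xmm, |
| the upper ymm bits remain zero, so the |
| small paths can use a plain ret without |
| vzeroupper */ |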
| |
| For SSE2, replace the 2x 'punpck' instructions and the trailing 'pshufd' |
| with a zero-idiom 'pxor' feeding a 'pshufb' byte broadcast. |
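| |
| For reference, a sketch of the two SSE2 byte-broadcast sequences (operands |
| as used by the patch; not a complete routine): |
| |
| /* Old: widen byte -> word -> dword, then splat the low dword. */ |
| movd %esi, %xmm0 |
| punpcklbw %xmm0, %xmm0 |
| punpcklwd %xmm0, %xmm0 |
| pshufd $0, %xmm0, %xmm0 |
| |
| /* New: zero-idiom pxor yields an all-zero shuffle mask, and pshufb with |
| that mask replicates byte 0 into all 16 bytes. */ |
| movd %esi, %xmm0 |
| pxor %xmm1, %xmm1 |
| pshufb %xmm1, %xmm0 |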
| |
| Results for memset-avx2 small (geomean of N = 20 benchset runs). |
| |
| size, Old Time, New Time, New / Old |
| 0, 4.100, 3.831, 0.934 |
| 1, 5.074, 4.399, 0.867 |
| 2, 4.433, 4.411, 0.995 |
| 4, 4.487, 4.415, 0.984 |
| 8, 4.454, 4.396, 0.987 |
| 16, 4.502, 4.443, 0.987 |
| |
| All relevant string/wcsmbs tests are passing. |
| Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
| |
| (cherry picked from commit b62ace2740a106222e124cc86956448fa07abf4d) |
| |
| diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S |
| index 0137eba4cdd9f830..34ee0bfdcb81fb39 100644 |
| --- a/sysdeps/x86_64/memset.S |
| +++ b/sysdeps/x86_64/memset.S |
| @@ -28,17 +28,22 @@ |
| #define VMOVU movups |
| #define VMOVA movaps |
| |
| -#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ |
| +#define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ |
| movd d, %xmm0; \ |
| - movq r, %rax; \ |
| - punpcklbw %xmm0, %xmm0; \ |
| - punpcklwd %xmm0, %xmm0; \ |
| - pshufd $0, %xmm0, %xmm0 |
| + pxor %xmm1, %xmm1; \ |
| + pshufb %xmm1, %xmm0; \ |
| + movq r, %rax |
| |
| -#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ |
| +#define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ |
| movd d, %xmm0; \ |
| - movq r, %rax; \ |
| - pshufd $0, %xmm0, %xmm0 |
| + pshufd $0, %xmm0, %xmm0; \ |
| + movq r, %rax |
| + |
| +#define MEMSET_VDUP_TO_VEC0_HIGH() |
| +#define MEMSET_VDUP_TO_VEC0_LOW() |
| + |
| +#define WMEMSET_VDUP_TO_VEC0_HIGH() |
| +#define WMEMSET_VDUP_TO_VEC0_LOW() |
| |
| #define SECTION(p) p |
| |
| diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S |
| index 1af668af0aeda59e..c0bf2875d03d51ab 100644 |
| --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S |
| +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S |
| @@ -10,15 +10,18 @@ |
| # define VMOVU vmovdqu |
| # define VMOVA vmovdqa |
| |
| -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ |
| +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ |
| vmovd d, %xmm0; \ |
| - movq r, %rax; \ |
| - vpbroadcastb %xmm0, %ymm0 |
| + movq r, %rax; |
| |
| -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ |
| - vmovd d, %xmm0; \ |
| - movq r, %rax; \ |
| - vpbroadcastd %xmm0, %ymm0 |
| +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ |
| + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) |
| + |
| +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 |
| +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 |
| + |
| +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 |
| +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 |
| |
| # ifndef SECTION |
| # define SECTION(p) p##.avx |
| @@ -30,5 +33,6 @@ |
| # define WMEMSET_SYMBOL(p,s) p##_avx2_##s |
| # endif |
| |
| +# define USE_XMM_LESS_VEC |
| # include "memset-vec-unaligned-erms.S" |
| #endif |
| diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S |
| index f14d6f8493c21a36..5241216a77bf72b7 100644 |
| --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S |
| +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S |
| @@ -15,13 +15,19 @@ |
| |
| # define VZEROUPPER |
| |
| -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ |
| - movq r, %rax; \ |
| - vpbroadcastb d, %VEC0 |
| +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ |
| + vpbroadcastb d, %VEC0; \ |
| + movq r, %rax |
| |
| -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ |
| - movq r, %rax; \ |
| - vpbroadcastd d, %VEC0 |
| +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ |
| + vpbroadcastd d, %VEC0; \ |
| + movq r, %rax |
| + |
| +# define MEMSET_VDUP_TO_VEC0_HIGH() |
| +# define MEMSET_VDUP_TO_VEC0_LOW() |
| + |
| +# define WMEMSET_VDUP_TO_VEC0_HIGH() |
| +# define WMEMSET_VDUP_TO_VEC0_LOW() |
| |
| # define SECTION(p) p##.evex512 |
| # define MEMSET_SYMBOL(p,s) p##_avx512_##s |
| diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S |
| index 64b09e77cc20cc42..637002150659123c 100644 |
| --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S |
| +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S |
| @@ -15,13 +15,19 @@ |
| |
| # define VZEROUPPER |
| |
| -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ |
| - movq r, %rax; \ |
| - vpbroadcastb d, %VEC0 |
| +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ |
| + vpbroadcastb d, %VEC0; \ |
| + movq r, %rax |
| |
| -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ |
| - movq r, %rax; \ |
| - vpbroadcastd d, %VEC0 |
| +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ |
| + vpbroadcastd d, %VEC0; \ |
| + movq r, %rax |
| + |
| +# define MEMSET_VDUP_TO_VEC0_HIGH() |
| +# define MEMSET_VDUP_TO_VEC0_LOW() |
| + |
| +# define WMEMSET_VDUP_TO_VEC0_HIGH() |
| +# define WMEMSET_VDUP_TO_VEC0_LOW() |
| |
| # define SECTION(p) p##.evex |
| # define MEMSET_SYMBOL(p,s) p##_evex_##s |
| diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S |
| index e723413a664c088f..c8db87dcbf69f0d8 100644 |
| --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S |
| +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S |
| @@ -58,8 +58,10 @@ |
| #ifndef MOVQ |
| # if VEC_SIZE > 16 |
| # define MOVQ vmovq |
| +# define MOVD vmovd |
| # else |
| # define MOVQ movq |
| +# define MOVD movd |
| # endif |
| #endif |
| |
| @@ -72,9 +74,17 @@ |
| #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 |
| # define END_REG rcx |
| # define LOOP_REG rdi |
| +# define LESS_VEC_REG rax |
| #else |
| # define END_REG rdi |
| # define LOOP_REG rdx |
| +# define LESS_VEC_REG rdi |
| +#endif |
| + |
| +#ifdef USE_XMM_LESS_VEC |
| +# define XMM_SMALL 1 |
| +#else |
| +# define XMM_SMALL 0 |
| #endif |
| |
| #define PAGE_SIZE 4096 |
| @@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) |
| |
| ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) |
| shl $2, %RDX_LP |
| - WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) |
| - jmp L(entry_from_bzero) |
| + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) |
| + WMEMSET_VDUP_TO_VEC0_LOW() |
| + cmpq $VEC_SIZE, %rdx |
| + jb L(less_vec_no_vdup) |
| + WMEMSET_VDUP_TO_VEC0_HIGH() |
| + jmp L(entry_from_wmemset) |
| END (WMEMSET_SYMBOL (__wmemset, unaligned)) |
| #endif |
| |
| @@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) |
| #endif |
| |
| ENTRY (MEMSET_SYMBOL (__memset, unaligned)) |
| - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) |
| + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) |
| # ifdef __ILP32__ |
| /* Clear the upper 32 bits. */ |
| mov %edx, %edx |
| @@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) |
| L(entry_from_bzero): |
| cmpq $VEC_SIZE, %rdx |
| jb L(less_vec) |
| + MEMSET_VDUP_TO_VEC0_HIGH() |
| +L(entry_from_wmemset): |
| cmpq $(VEC_SIZE * 2), %rdx |
| ja L(more_2x_vec) |
| /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ |
| @@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) |
| # endif |
| |
| ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) |
| - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) |
| + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) |
| # ifdef __ILP32__ |
| /* Clear the upper 32 bits. */ |
| mov %edx, %edx |
| # endif |
| cmp $VEC_SIZE, %RDX_LP |
| jb L(less_vec) |
| + MEMSET_VDUP_TO_VEC0_HIGH () |
| cmp $(VEC_SIZE * 2), %RDX_LP |
| ja L(stosb_more_2x_vec) |
| - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. |
| - */ |
| - VMOVU %VEC(0), (%rax) |
| - VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) |
| + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ |
| + VMOVU %VEC(0), (%rdi) |
| + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) |
| VZEROUPPER_RETURN |
| #endif |
| |
| - .p2align 4,, 10 |
| + .p2align 4,, 4 |
| L(last_2x_vec): |
| #ifdef USE_LESS_VEC_MASK_STORE |
| - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) |
| - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) |
| + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) |
| + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) |
| #else |
| VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) |
| VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) |
| @@ -212,6 +228,7 @@ L(last_2x_vec): |
| #ifdef USE_LESS_VEC_MASK_STORE |
| .p2align 4,, 10 |
| L(less_vec): |
| +L(less_vec_no_vdup): |
| /* Less than 1 VEC. */ |
| # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 |
| # error Unsupported VEC_SIZE! |
| @@ -262,28 +279,18 @@ L(stosb_more_2x_vec): |
| /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] |
| and (4x, 8x] jump to target. */ |
| L(more_2x_vec): |
| - |
| - /* Two different methods of setting up pointers / compare. The |
| - two methods are based on the fact that EVEX/AVX512 mov |
| - instructions take more bytes then AVX2/SSE2 mov instructions. As |
| - well that EVEX/AVX512 machines also have fast LEA_BID. Both |
| - setup and END_REG to avoid complex address mode. For EVEX/AVX512 |
| - this saves code size and keeps a few targets in one fetch block. |
| - For AVX2/SSE2 this helps prevent AGU bottlenecks. */ |
| -#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 |
| - /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + |
| - LOOP_4X_OFFSET) with LEA_BID. */ |
| - |
| - /* END_REG is rcx for EVEX/AVX512. */ |
| - leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG |
| -#endif |
| - |
| - /* Stores to first 2x VEC before cmp as any path forward will |
| - require it. */ |
| - VMOVU %VEC(0), (%rax) |
| - VMOVU %VEC(0), VEC_SIZE(%rax) |
| + /* Store next 2x vec regardless. */ |
| + VMOVU %VEC(0), (%rdi) |
| + VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) |
| |
| |
| + /* Two different methods of setting up pointers / compare. The two |
| + methods are based on the fact that EVEX/AVX512 mov instructions take |
| + more bytes than AVX2/SSE2 mov instructions. As well that EVEX/AVX512 |
| + machines also have fast LEA_BID. Both setup and END_REG to avoid complex |
| + address mode. For EVEX/AVX512 this saves code size and keeps a few |
| + targets in one fetch block. For AVX2/SSE2 this helps prevent AGU |
| + bottlenecks. */ |
| #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) |
| /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ |
| addq %rdx, %END_REG |
| @@ -292,6 +299,15 @@ L(more_2x_vec): |
| cmpq $(VEC_SIZE * 4), %rdx |
| jbe L(last_2x_vec) |
| |
| + |
| +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 |
| + /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with |
| + LEA_BID. */ |
| + |
| + /* END_REG is rcx for EVEX/AVX512. */ |
| + leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG |
| +#endif |
| + |
| /* Store next 2x vec regardless. */ |
| VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) |
| VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) |
| @@ -355,65 +371,93 @@ L(stosb_local): |
| /* Define L(less_vec) only if not otherwise defined. */ |
| .p2align 4 |
| L(less_vec): |
| + /* Broadcast esi to a partial register (i.e. for VEC_SIZE == 32, broadcast |
| + to xmm). This only does anything for AVX2. */ |
| + MEMSET_VDUP_TO_VEC0_LOW () |
| +L(less_vec_no_vdup): |
| #endif |
| L(cross_page): |
| #if VEC_SIZE > 32 |
| cmpl $32, %edx |
| - jae L(between_32_63) |
| + jge L(between_32_63) |
| #endif |
| #if VEC_SIZE > 16 |
| cmpl $16, %edx |
| - jae L(between_16_31) |
| + jge L(between_16_31) |
| +#endif |
| +#ifndef USE_XMM_LESS_VEC |
| + MOVQ %XMM0, %rcx |
| #endif |
| - MOVQ %XMM0, %rdi |
| cmpl $8, %edx |
| - jae L(between_8_15) |
| + jge L(between_8_15) |
| cmpl $4, %edx |
| - jae L(between_4_7) |
| + jge L(between_4_7) |
| cmpl $1, %edx |
| - ja L(between_2_3) |
| - jb L(return) |
| - movb %sil, (%rax) |
| - VZEROUPPER_RETURN |
| + jg L(between_2_3) |
| + jl L(between_0_0) |
| + movb %sil, (%LESS_VEC_REG) |
| +L(between_0_0): |
| + ret |
| |
| - /* Align small targets only if not doing so would cross a fetch |
| - line. */ |
| + /* Align small targets only if not doing so would cross a fetch line. |
| + */ |
| #if VEC_SIZE > 32 |
| .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) |
| /* From 32 to 63. No branch when size == 32. */ |
| L(between_32_63): |
| - VMOVU %YMM0, (%rax) |
| - VMOVU %YMM0, -32(%rax, %rdx) |
| + VMOVU %YMM0, (%LESS_VEC_REG) |
| + VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) |
| VZEROUPPER_RETURN |
| #endif |
| |
| #if VEC_SIZE >= 32 |
| - .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) |
| + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) |
| L(between_16_31): |
| /* From 16 to 31. No branch when size == 16. */ |
| - VMOVU %XMM0, (%rax) |
| - VMOVU %XMM0, -16(%rax, %rdx) |
| - VZEROUPPER_RETURN |
| + VMOVU %XMM0, (%LESS_VEC_REG) |
| + VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) |
| + ret |
| #endif |
| |
| - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) |
| + /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. |
| + */ |
| + .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1) |
| L(between_8_15): |
| /* From 8 to 15. No branch when size == 8. */ |
| - movq %rdi, (%rax) |
| - movq %rdi, -8(%rax, %rdx) |
| - VZEROUPPER_RETURN |
| +#ifdef USE_XMM_LESS_VEC |
| + MOVQ %XMM0, (%rdi) |
| + MOVQ %XMM0, -8(%rdi, %rdx) |
| +#else |
| + movq %rcx, (%LESS_VEC_REG) |
| + movq %rcx, -8(%LESS_VEC_REG, %rdx) |
| +#endif |
| + ret |
| |
| - .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) |
| + /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. |
| + */ |
| + .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1) |
| L(between_4_7): |
| /* From 4 to 7. No branch when size == 4. */ |
| - movl %edi, (%rax) |
| - movl %edi, -4(%rax, %rdx) |
| - VZEROUPPER_RETURN |
| +#ifdef USE_XMM_LESS_VEC |
| + MOVD %XMM0, (%rdi) |
| + MOVD %XMM0, -4(%rdi, %rdx) |
| +#else |
| + movl %ecx, (%LESS_VEC_REG) |
| + movl %ecx, -4(%LESS_VEC_REG, %rdx) |
| +#endif |
| + ret |
| |
| - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) |
| + /* 4 * XMM_SMALL for the third mov for AVX2. */ |
| + .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1) |
| L(between_2_3): |
| /* From 2 to 3. No branch when size == 2. */ |
| - movw %di, (%rax) |
| - movb %dil, -1(%rax, %rdx) |
| - VZEROUPPER_RETURN |
| +#ifdef USE_XMM_LESS_VEC |
| + movb %sil, (%rdi) |
| + movb %sil, 1(%rdi) |
| + movb %sil, -1(%rdi, %rdx) |
| +#else |
| + movw %cx, (%LESS_VEC_REG) |
| + movb %sil, -1(%LESS_VEC_REG, %rdx) |
| +#endif |
| + ret |
| END (MEMSET_SYMBOL (__memset, unaligned_erms)) |