| From 8bb82fd62bdb2d8741998a5a1be38388890452da Mon Sep 17 00:00:00 2001 |
| From: Noah Goldstein <goldstein.w.n@gmail.com> |
| Date: Mon, 3 May 2021 03:03:19 -0400 |
| Subject: [PATCH] x86: Optimize memchr-evex.S |
| |
| No bug. This commit optimizes memchr-evex.S. The optimizations include |
| replacing some branches with cmovcc, avoiding some branches entirely |
| in the less_4x_vec case, making the page cross logic less strict, |
| saving some ALU in the alignment process, and most importantly |
| increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and |
| test-wmemchr are all passing. |
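| |
| As an illustration of the cmovcc point: L(first_vec_x0) now computes |
| the candidate return address unconditionally and uses cmovle to |
| replace it with NULL when the match falls at or past the length |
| limit, rather than branching. A minimal C sketch of that pattern |
| (hypothetical helper, byte/memchr case; compilers typically lower |
| the conditional to a cmov themselves): |
| |
|     #include <stddef.h> |
| |
|     static const char * |
|     first_match_bounded (const char *base, unsigned int mask, size_t len) |
|     { |
|       /* tzcnt of 0 is 32, so a vector with no match also fails the |
|          bound check below.  */ |
|       unsigned int idx = mask ? (unsigned int) __builtin_ctz (mask) : 32; |
|       /* Branchless select, as with cmovle in the patch.  */ |
|       return idx < len ? base + idx : NULL; |
|     } |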
| |
| Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> |
| Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
| (cherry picked from commit 2a76821c3081d2c0231ecd2618f52662cb48fccd) |
| |
| sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++---------- |
| 1 file changed, 322 insertions(+), 225 deletions(-) |
| |
| diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S |
| index 6dd5d67b..81d5cd64 100644 |
| --- a/sysdeps/x86_64/multiarch/memchr-evex.S |
| +++ b/sysdeps/x86_64/multiarch/memchr-evex.S |
| @@ -26,14 +26,28 @@ |
| |
| # ifdef USE_AS_WMEMCHR |
| # define VPBROADCAST vpbroadcastd |
| -# define VPCMP vpcmpd |
| -# define SHIFT_REG r8d |
| +# define VPMINU vpminud |
| +# define VPCMP vpcmpd |
| +# define VPCMPEQ vpcmpeqd |
| +# define CHAR_SIZE 4 |
| # else |
| # define VPBROADCAST vpbroadcastb |
| -# define VPCMP vpcmpb |
| -# define SHIFT_REG ecx |
| +# define VPMINU vpminub |
| +# define VPCMP vpcmpb |
| +# define VPCMPEQ vpcmpeqb |
| +# define CHAR_SIZE 1 |
| # endif |
| |
| +# ifdef USE_AS_RAWMEMCHR |
| +# define RAW_PTR_REG rcx |
| +# define ALGN_PTR_REG rdi |
| +# else |
| +# define RAW_PTR_REG rdi |
| +# define ALGN_PTR_REG rcx |
| +# endif |
| + |
| +# define XMMZERO xmm23 |
| +# define YMMZERO ymm23 |
| # define XMMMATCH xmm16 |
| # define YMMMATCH ymm16 |
| # define YMM1 ymm17 |
| @@ -44,6 +58,8 @@ |
| # define YMM6 ymm22 |
| |
| # define VEC_SIZE 32 |
| +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) |
| +# define PAGE_SIZE 4096 |
| |
| .section .text.evex,"ax",@progbits |
| ENTRY (MEMCHR) |
| @@ -51,11 +67,7 @@ ENTRY (MEMCHR) |
| /* Check for zero length. */ |
| test %RDX_LP, %RDX_LP |
| jz L(zero) |
| -# endif |
| - movl %edi, %ecx |
| -# ifdef USE_AS_WMEMCHR |
| - shl $2, %RDX_LP |
| -# else |
| + |
| # ifdef __ILP32__ |
| /* Clear the upper 32 bits. */ |
| movl %edx, %edx |
| @@ -64,318 +76,403 @@ ENTRY (MEMCHR) |
| /* Broadcast CHAR to YMMMATCH. */ |
| VPBROADCAST %esi, %YMMMATCH |
| /* Check if we may cross page boundary with one vector load. */ |
| - andl $(2 * VEC_SIZE - 1), %ecx |
| - cmpl $VEC_SIZE, %ecx |
| - ja L(cros_page_boundary) |
| + movl %edi, %eax |
| + andl $(PAGE_SIZE - 1), %eax |
| + cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
| + ja L(cross_page_boundary) |
| |
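| |
| This is the "less strict" page cross logic from the commit message: |
| it branches only when a VEC_SIZE load starting at rdi could actually |
| touch the next page, instead of whenever the pointer was within |
| VEC_SIZE of a 2 * VEC_SIZE boundary. A C sketch of the test, |
| assuming 4 KiB pages as the PAGE_SIZE define does: |
| |
|     #include <stdint.h> |
| |
|     #define PAGE_SIZE 4096 |
|     #define VEC_SIZE  32 |
| |
|     static int |
|     may_cross_page (const void *p) |
|     { |
|       /* A VEC_SIZE load can only spill into the next page when the |
|          offset within the page exceeds PAGE_SIZE - VEC_SIZE.  */ |
|       return ((uintptr_t) p & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE); |
|     } |
| |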
| /* Check the first VEC_SIZE bytes. */ |
| - VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| - kmovd %k1, %eax |
| - testl %eax, %eax |
| - |
| + VPCMP $0, (%rdi), %YMMMATCH, %k0 |
| + kmovd %k0, %eax |
| # ifndef USE_AS_RAWMEMCHR |
| - jnz L(first_vec_x0_check) |
| - /* Adjust length and check the end of data. */ |
| - subq $VEC_SIZE, %rdx |
| - jbe L(zero) |
| + /* If length < CHAR_PER_VEC, handle specially. */ |
| + cmpq $CHAR_PER_VEC, %rdx |
| + jbe L(first_vec_x0) |
| +# endif |
| + testl %eax, %eax |
| + jz L(aligned_more) |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */ |
| + leaq (%rdi, %rax, CHAR_SIZE), %rax |
| # else |
| - jnz L(first_vec_x0) |
| + addq %rdi, %rax |
| # endif |
| - |
| - /* Align data for aligned loads in the loop. */ |
| - addq $VEC_SIZE, %rdi |
| - andl $(VEC_SIZE - 1), %ecx |
| - andq $-VEC_SIZE, %rdi |
| + ret |
| |
| # ifndef USE_AS_RAWMEMCHR |
| - /* Adjust length. */ |
| - addq %rcx, %rdx |
| - |
| - subq $(VEC_SIZE * 4), %rdx |
| - jbe L(last_4x_vec_or_less) |
| -# endif |
| - jmp L(more_4x_vec) |
| +L(zero): |
| + xorl %eax, %eax |
| + ret |
| |
| + .p2align 5 |
| +L(first_vec_x0): |
| + /* Check if first match was before length. */ |
| + tzcntl %eax, %eax |
| + xorl %ecx, %ecx |
| + cmpl %eax, %edx |
| + leaq (%rdi, %rax, CHAR_SIZE), %rax |
| + cmovle %rcx, %rax |
| + ret |
| +# else |
| + /* NB: first_vec_x0 is 17 bytes, which leaves |
| + cross_page_boundary (which is relatively cold) close enough |
| + to ideal alignment, so only realign L(cross_page_boundary) for |
| + rawmemchr. */ |
| .p2align 4 |
| -L(cros_page_boundary): |
| - andl $(VEC_SIZE - 1), %ecx |
| +# endif |
| +L(cross_page_boundary): |
| + /* Save the pointer before aligning, as its original value is |
| + needed to compute the return address if the byte is found, or |
| + to adjust the length if it is not and this is memchr. */ |
| + movq %rdi, %rcx |
| + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi |
| + for rawmemchr. */ |
| + andq $-VEC_SIZE, %ALGN_PTR_REG |
| + VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 |
| + kmovd %k0, %r8d |
| # ifdef USE_AS_WMEMCHR |
| - /* NB: Divide shift count by 4 since each bit in K1 represent 4 |
| + /* NB: Divide shift count by 4 since each bit in K0 represents 4 |
| bytes. */ |
| - movl %ecx, %SHIFT_REG |
| - sarl $2, %SHIFT_REG |
| + sarl $2, %eax |
| +# endif |
| +# ifndef USE_AS_RAWMEMCHR |
| + movl $(PAGE_SIZE / CHAR_SIZE), %esi |
| + subl %eax, %esi |
| # endif |
| - andq $-VEC_SIZE, %rdi |
| - VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| - kmovd %k1, %eax |
| - /* Remove the leading bytes. */ |
| - sarxl %SHIFT_REG, %eax, %eax |
| - testl %eax, %eax |
| - jz L(aligned_more) |
| - tzcntl %eax, %eax |
| # ifdef USE_AS_WMEMCHR |
| - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| - sall $2, %eax |
| + andl $(CHAR_PER_VEC - 1), %eax |
| # endif |
| + /* Remove the leading bytes. */ |
| + sarxl %eax, %r8d, %eax |
| # ifndef USE_AS_RAWMEMCHR |
| /* Check the end of data. */ |
| - cmpq %rax, %rdx |
| - jbe L(zero) |
| + cmpq %rsi, %rdx |
| + jbe L(first_vec_x0) |
| +# endif |
| + testl %eax, %eax |
| + jz L(cross_page_continue) |
| + tzcntl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */ |
| + leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax |
| +# else |
| + addq %RAW_PTR_REG, %rax |
| # endif |
| - addq %rdi, %rax |
| - addq %rcx, %rax |
| ret |
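| |
| The sarxl above is how the page-cross path discards match bits for |
| bytes that precede the original pointer: the compare is done on the |
| aligned-down vector, then the mask is shifted right by the pointer's |
| offset within it (for wmemchr the offset is first divided by 4, one |
| mask bit per wchar_t). A C sketch of the byte case, with |
| hypothetical names: |
| |
|     #include <stdint.h> |
| |
|     #define VEC_SIZE 32 |
| |
|     static uint32_t |
|     trim_leading_matches (uint32_t mask, uintptr_t orig_ptr) |
|     { |
|       /* Bits below the pointer's offset within the aligned vector |
|          belong to bytes before the buffer start.  */ |
|       return mask >> (orig_ptr & (VEC_SIZE - 1)); |
|     } |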
| |
| .p2align 4 |
| -L(aligned_more): |
| -# ifndef USE_AS_RAWMEMCHR |
| - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" |
| - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition |
| - overflow. */ |
| - negq %rcx |
| - addq $VEC_SIZE, %rcx |
| +L(first_vec_x1): |
| + tzcntl %eax, %eax |
| + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax |
| + ret |
| |
| - /* Check the end of data. */ |
| - subq %rcx, %rdx |
| - jbe L(zero) |
| -# endif |
| + .p2align 4 |
| +L(first_vec_x2): |
| + tzcntl %eax, %eax |
| + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax |
| + ret |
| |
| - addq $VEC_SIZE, %rdi |
| + .p2align 4 |
| +L(first_vec_x3): |
| + tzcntl %eax, %eax |
| + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax |
| + ret |
| |
| -# ifndef USE_AS_RAWMEMCHR |
| - subq $(VEC_SIZE * 4), %rdx |
| - jbe L(last_4x_vec_or_less) |
| -# endif |
| + .p2align 4 |
| +L(first_vec_x4): |
| + tzcntl %eax, %eax |
| + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax |
| + ret |
| |
| -L(more_4x_vec): |
| + .p2align 5 |
| +L(aligned_more): |
| /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time |
| since data is only aligned to VEC_SIZE. */ |
| - VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| - kmovd %k1, %eax |
| - testl %eax, %eax |
| - jnz L(first_vec_x0) |
| |
| - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 |
| - kmovd %k1, %eax |
| +# ifndef USE_AS_RAWMEMCHR |
| + /* Align data to VEC_SIZE. */ |
| +L(cross_page_continue): |
| + xorl %ecx, %ecx |
| + subl %edi, %ecx |
| + andq $-VEC_SIZE, %rdi |
| + /* esi is used to adjust the length to check if near the end. */ |
| + leal (VEC_SIZE * 5)(%rdi, %rcx), %esi |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Divide bytes by 4 to get the wchar_t count. */ |
| + sarl $2, %esi |
| +# endif |
| +# else |
| + andq $-VEC_SIZE, %rdi |
| +L(cross_page_continue): |
| +# endif |
| + /* Load first VEC regardless. */ |
| + VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 |
| + kmovd %k0, %eax |
| +# ifndef USE_AS_RAWMEMCHR |
| + /* Adjust length. If near the end, handle specially. */ |
| + subq %rsi, %rdx |
| + jbe L(last_4x_vec_or_less) |
| +# endif |
| testl %eax, %eax |
| jnz L(first_vec_x1) |
| |
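| The esi computation above folds the alignment adjustment into one |
| lea: with rcx = -rdi (original) and rdi aligned down, esi ends up as |
| VEC_SIZE * 5 - (rdi % VEC_SIZE), i.e. the number of chars from the |
| original pointer through the end of the fourth vector checked after |
| alignment, so a single sub/jbe detects the near-the-end case. A |
| hypothetical C sketch of the byte case: |
| |
|     #include <stdint.h> |
|     #include <stddef.h> |
| |
|     #define VEC_SIZE 32 |
| |
|     static size_t |
|     chars_through_vec4 (uintptr_t orig) |
|     { |
|       uintptr_t aligned = orig & ~(uintptr_t) (VEC_SIZE - 1); |
|       /* Equals VEC_SIZE * 5 - (orig % VEC_SIZE).  */ |
|       return (aligned + 5 * VEC_SIZE) - orig; |
|     } |
| |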
| - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 |
| - kmovd %k1, %eax |
| + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 |
| + kmovd %k0, %eax |
| testl %eax, %eax |
| jnz L(first_vec_x2) |
| |
| - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 |
| - kmovd %k1, %eax |
| + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 |
| + kmovd %k0, %eax |
| testl %eax, %eax |
| jnz L(first_vec_x3) |
| |
| - addq $(VEC_SIZE * 4), %rdi |
| + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(first_vec_x4) |
| + |
| |
| # ifndef USE_AS_RAWMEMCHR |
| - subq $(VEC_SIZE * 4), %rdx |
| - jbe L(last_4x_vec_or_less) |
| -# endif |
| + /* Check if within the last CHAR_PER_VEC * 4 characters of length. */ |
| + subq $(CHAR_PER_VEC * 4), %rdx |
| + jbe L(last_4x_vec_or_less_cmpeq) |
| + addq $VEC_SIZE, %rdi |
| |
| - /* Align data to 4 * VEC_SIZE. */ |
| - movq %rdi, %rcx |
| - andl $(4 * VEC_SIZE - 1), %ecx |
| + /* Align data to VEC_SIZE * 4 for the loop and readjust length. |
| + */ |
| +# ifdef USE_AS_WMEMCHR |
| + movl %edi, %ecx |
| andq $-(4 * VEC_SIZE), %rdi |
| - |
| -# ifndef USE_AS_RAWMEMCHR |
| - /* Adjust length. */ |
| + andl $(VEC_SIZE * 4 - 1), %ecx |
| + /* NB: Divide bytes by 4 to get the wchar_t count. */ |
| + sarl $2, %ecx |
| addq %rcx, %rdx |
| +# else |
| + addq %rdi, %rdx |
| + andq $-(4 * VEC_SIZE), %rdi |
| + subq %rdi, %rdx |
| +# endif |
| +# else |
| + addq $VEC_SIZE, %rdi |
| + andq $-(4 * VEC_SIZE), %rdi |
| # endif |
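| |
| Both branches above add the chars skipped by the align-down back to |
| the remaining length: rdx += (rdi_before - rdi_aligned) / CHAR_SIZE, |
| with wmemchr doing the divide via sarl and memchr folding it into |
| the add/sub pair. A hypothetical C sketch: |
| |
|     #include <stdint.h> |
|     #include <stddef.h> |
| |
|     #define VEC_SIZE 32 |
| |
|     static size_t |
|     readjust_len (size_t len, uintptr_t ptr, unsigned int char_size) |
|     { |
|       uintptr_t aligned = ptr & ~(uintptr_t) (4 * VEC_SIZE - 1); |
|       /* Chars between the aligned base and the current pointer get |
|          rescanned by the loop, so count them back into the length.  */ |
|       return len + (ptr - aligned) / char_size; |
|     } |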
| |
| + vpxorq %XMMZERO, %XMMZERO, %XMMZERO |
| + |
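| |
| The loop below is where most of the added ILP comes from: VEC1 is |
| compared with predicate 4 (not-equal) so k1 holds the complement of |
| its matches, VEC2/VEC3 are xored with the needle so matching chars |
| become zero, and one zero-masked min folds all three vectors into a |
| single compare against zero. A sketch of the same folding with |
| AVX-512 intrinsics (hypothetical helper, byte/memchr case): |
| |
|     #include <immintrin.h> |
| |
|     /* Nonzero when any of the four 32-byte vectors at p contains the |
|        needle byte.  */ |
|     static int |
|     any_match_4x (const __m256i *p, __m256i needle) |
|     { |
|       /* k1 = NOT of matches in VEC1 (predicate 4 == not-equal).  */ |
|       __mmask32 k1 = _mm256_cmp_epu8_mask (p[0], needle, 4); |
|       /* XOR turns matching bytes of VEC2/VEC3 into zero.  */ |
|       __m256i v2 = _mm256_xor_si256 (p[1], needle); |
|       __m256i v3 = _mm256_xor_si256 (p[2], needle); |
|       __mmask32 k3 = _mm256_cmpeq_epu8_mask (p[3], needle); |
|       /* Zero-masking forces lanes that matched in VEC1 to zero, and |
|          min (v2, v3) is zero wherever VEC2 or VEC3 matched.  */ |
|       __m256i m = _mm256_maskz_min_epu8 (k1, v2, v3); |
|       __mmask32 k2 = _mm256_cmpeq_epu8_mask (m, _mm256_setzero_si256 ()); |
|       return (k2 | k3) != 0;   /* kortestd %k2, %k3 */ |
|     } |
| |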
| + /* Compare 4 * VEC at a time forward. */ |
| .p2align 4 |
| L(loop_4x_vec): |
| - /* Compare 4 * VEC at a time forward. */ |
| - VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 |
| - kord %k1, %k2, %k5 |
| - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 |
| - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 |
| - |
| - kord %k3, %k4, %k6 |
| - kortestd %k5, %k6 |
| - jnz L(4x_vec_end) |
| - |
| - addq $(VEC_SIZE * 4), %rdi |
| - |
| + /* It would be possible to save some instructions using 4x VPCMP, |
| + but the bottleneck on port 5 makes it not worth it. */ |
| + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 |
| + /* xor will set bytes matching esi to zero. */ |
| + vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 |
| + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 |
| + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 |
| + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ |
| + VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z} |
| + VPCMP $0, %YMM3, %YMMZERO, %k2 |
| # ifdef USE_AS_RAWMEMCHR |
| - jmp L(loop_4x_vec) |
| + subq $-(VEC_SIZE * 4), %rdi |
| + kortestd %k2, %k3 |
| + jz L(loop_4x_vec) |
| # else |
| - subq $(VEC_SIZE * 4), %rdx |
| + kortestd %k2, %k3 |
| + jnz L(loop_4x_vec_end) |
| + |
| + subq $-(VEC_SIZE * 4), %rdi |
| + |
| + subq $(CHAR_PER_VEC * 4), %rdx |
| ja L(loop_4x_vec) |
| |
| + /* Fall through into the case where fewer than 4 vectors of |
| + length remain. */ |
| + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 |
| + kmovd %k0, %eax |
| + addq $(VEC_SIZE * 3), %rdi |
| + .p2align 4 |
| L(last_4x_vec_or_less): |
| - /* Less than 4 * VEC and aligned to VEC_SIZE. */ |
| - addl $(VEC_SIZE * 2), %edx |
| - jle L(last_2x_vec) |
| - |
| - VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| - kmovd %k1, %eax |
| + /* Check if the first VEC contained a match. */ |
| testl %eax, %eax |
| - jnz L(first_vec_x0) |
| + jnz L(first_vec_x1_check) |
| |
| - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 |
| - kmovd %k1, %eax |
| - testl %eax, %eax |
| - jnz L(first_vec_x1) |
| + /* If remaining length > CHAR_PER_VEC * 2. */ |
| + addl $(CHAR_PER_VEC * 2), %edx |
| + jg L(last_4x_vec) |
| |
| - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 |
| - kmovd %k1, %eax |
| - testl %eax, %eax |
| +L(last_2x_vec): |
| + /* If remaining length < CHAR_PER_VEC. */ |
| + addl $CHAR_PER_VEC, %edx |
| + jle L(zero_end) |
| |
| - jnz L(first_vec_x2_check) |
| - subl $VEC_SIZE, %edx |
| - jle L(zero) |
| + /* Check VEC2 and compare any match with remaining length. */ |
| + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 |
| + kmovd %k0, %eax |
| + tzcntl %eax, %eax |
| + cmpl %eax, %edx |
| + jbe L(set_zero_end) |
| + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax |
| +L(zero_end): |
| + ret |
| |
| - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 |
| - kmovd %k1, %eax |
| - testl %eax, %eax |
| |
| - jnz L(first_vec_x3_check) |
| + .p2align 4 |
| +L(first_vec_x1_check): |
| + tzcntl %eax, %eax |
| + /* Adjust length. */ |
| + subl $-(CHAR_PER_VEC * 4), %edx |
| + /* Check if match within remaining length. */ |
| + cmpl %eax, %edx |
| + jbe L(set_zero_end) |
| + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */ |
| + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax |
| + ret |
| +L(set_zero_end): |
| xorl %eax, %eax |
| ret |
| |
| .p2align 4 |
| -L(last_2x_vec): |
| - addl $(VEC_SIZE * 2), %edx |
| - VPCMP $0, (%rdi), %YMMMATCH, %k1 |
| +L(loop_4x_vec_end): |
| +# endif |
| + /* rawmemchr will fall through into this if a match was found |
| + in the loop. */ |
| + |
| + /* k1 holds the NOT of the matches with VEC1. */ |
| kmovd %k1, %eax |
| - testl %eax, %eax |
| +# ifdef USE_AS_WMEMCHR |
| + subl $((1 << CHAR_PER_VEC) - 1), %eax |
| +# else |
| + incl %eax |
| +# endif |
| + jnz L(last_vec_x1_return) |
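| |
| Since k1 came from a not-equal compare, "no match in VEC1" means all |
| meaningful mask bits are set. The byte case tests that with incl |
| (eax + 1 wraps to zero exactly when all 32 bits were set); wmemchr |
| only has CHAR_PER_VEC = 8 meaningful bits, so it subtracts the |
| all-ones value instead. A hypothetical C sketch of both tests: |
| |
|     #include <stdint.h> |
| |
|     static int |
|     vec1_has_match_b (uint32_t not_mask)   /* memchr: incl; jnz */ |
|     { |
|       return (uint32_t) (not_mask + 1) != 0; |
|     } |
| |
|     static int |
|     vec1_has_match_w (uint32_t not_mask)   /* wmemchr: subl; jnz */ |
|     { |
|       return not_mask != (1u << 8) - 1; |
|     } |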
| |
| - jnz L(first_vec_x0_check) |
| - subl $VEC_SIZE, %edx |
| - jle L(zero) |
| + VPCMP $0, %YMM2, %YMMZERO, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x2_return) |
| |
| - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 |
| - kmovd %k1, %eax |
| + kmovd %k2, %eax |
| testl %eax, %eax |
| - jnz L(first_vec_x1_check) |
| - xorl %eax, %eax |
| - ret |
| + jnz L(last_vec_x3_return) |
| |
| - .p2align 4 |
| -L(first_vec_x0_check): |
| + kmovd %k3, %eax |
| tzcntl %eax, %eax |
| -# ifdef USE_AS_WMEMCHR |
| - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| - sall $2, %eax |
| +# ifdef USE_AS_RAWMEMCHR |
| + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax |
| +# else |
| + leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax |
| # endif |
| - /* Check the end of data. */ |
| - cmpq %rax, %rdx |
| - jbe L(zero) |
| - addq %rdi, %rax |
| ret |
| |
| .p2align 4 |
| -L(first_vec_x1_check): |
| +L(last_vec_x1_return): |
| tzcntl %eax, %eax |
| -# ifdef USE_AS_WMEMCHR |
| - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| - sall $2, %eax |
| -# endif |
| - /* Check the end of data. */ |
| - cmpq %rax, %rdx |
| - jbe L(zero) |
| - addq $VEC_SIZE, %rax |
| +# ifdef USE_AS_RAWMEMCHR |
| +# ifdef USE_AS_WMEMCHR |
| + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */ |
| + leaq (%rdi, %rax, CHAR_SIZE), %rax |
| +# else |
| addq %rdi, %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(first_vec_x2_check): |
| - tzcntl %eax, %eax |
| -# ifdef USE_AS_WMEMCHR |
| - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| - sall $2, %eax |
| +# endif |
| +# else |
| + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */ |
| + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax |
| # endif |
| - /* Check the end of data. */ |
| - cmpq %rax, %rdx |
| - jbe L(zero) |
| - addq $(VEC_SIZE * 2), %rax |
| - addq %rdi, %rax |
| ret |
| |
| .p2align 4 |
| -L(first_vec_x3_check): |
| +L(last_vec_x2_return): |
| tzcntl %eax, %eax |
| -# ifdef USE_AS_WMEMCHR |
| - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| - sall $2, %eax |
| +# ifdef USE_AS_RAWMEMCHR |
| + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */ |
| + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax |
| +# else |
| + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */ |
| + leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax |
| # endif |
| - /* Check the end of data. */ |
| - cmpq %rax, %rdx |
| - jbe L(zero) |
| - addq $(VEC_SIZE * 3), %rax |
| - addq %rdi, %rax |
| ret |
| |
| .p2align 4 |
| -L(zero): |
| - xorl %eax, %eax |
| - ret |
| -# endif |
| - |
| - .p2align 4 |
| -L(first_vec_x0): |
| +L(last_vec_x3_return): |
| tzcntl %eax, %eax |
| -# ifdef USE_AS_WMEMCHR |
| - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| - leaq (%rdi, %rax, 4), %rax |
| +# ifdef USE_AS_RAWMEMCHR |
| + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */ |
| + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax |
| # else |
| - addq %rdi, %rax |
| + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */ |
| + leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax |
| # endif |
| ret |
| |
| + |
| +# ifndef USE_AS_RAWMEMCHR |
| +L(last_4x_vec_or_less_cmpeq): |
| + VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 |
| + kmovd %k0, %eax |
| + subq $-(VEC_SIZE * 4), %rdi |
| + /* Check first VEC regardless. */ |
| + testl %eax, %eax |
| + jnz L(first_vec_x1_check) |
| + |
| + /* If remaining length <= CHAR_PER_VEC * 2. */ |
| + addl $(CHAR_PER_VEC * 2), %edx |
| + jle L(last_2x_vec) |
| + |
| .p2align 4 |
| -L(first_vec_x1): |
| +L(last_4x_vec): |
| + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 |
| + kmovd %k0, %eax |
| + testl %eax, %eax |
| + jnz L(last_vec_x2) |
| + |
| + |
| + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 |
| + kmovd %k0, %eax |
| + /* Create mask for possible matches within remaining length. */ |
| +# ifdef USE_AS_WMEMCHR |
| + movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx |
| + bzhil %edx, %ecx, %ecx |
| +# else |
| + movq $-1, %rcx |
| + bzhiq %rdx, %rcx, %rcx |
| +# endif |
| + /* Test the data matches against the length mask. */ |
| + andl %ecx, %eax |
| + jnz L(last_vec_x3) |
| + |
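| |
| The bzhi above builds the "in-bounds" mask in a single instruction: |
| it keeps the low rdx (remaining length) bits of an all-ones value, |
| so matches past the end of the buffer are discarded with one and. A |
| C sketch of the byte case (BMI2, hypothetical names): |
| |
|     #include <immintrin.h> |
|     #include <stdint.h> |
| |
|     static uint32_t |
|     mask_inbounds (uint32_t matches, uint32_t remaining) |
|     { |
|       /* bzhi with an index >= 64 keeps the whole value, matching |
|          the saturating behavior relied on here.  */ |
|       uint64_t keep = _bzhi_u64 (~0ULL, remaining); |
|       return matches & (uint32_t) keep; |
|     } |
| |
| The same mask is then shifted right by CHAR_PER_VEC further down to |
| bound the matches of the final vector. |
| |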
| + /* If remaining length <= CHAR_PER_VEC * 3 (note this is after |
| + remaining length was found to be > CHAR_PER_VEC * 2). */ |
| + subl $CHAR_PER_VEC, %edx |
| + jbe L(zero_end2) |
| + |
| + |
| + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 |
| + kmovd %k0, %eax |
| + /* Shift remaining length mask for last VEC. */ |
| +# ifdef USE_AS_WMEMCHR |
| + shrl $CHAR_PER_VEC, %ecx |
| +# else |
| + shrq $CHAR_PER_VEC, %rcx |
| +# endif |
| + andl %ecx, %eax |
| + jz L(zero_end2) |
| tzcntl %eax, %eax |
| -# ifdef USE_AS_WMEMCHR |
| - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| - leaq VEC_SIZE(%rdi, %rax, 4), %rax |
| -# else |
| - addq $VEC_SIZE, %rax |
| - addq %rdi, %rax |
| -# endif |
| + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax |
| +L(zero_end2): |
| ret |
| |
| - .p2align 4 |
| -L(first_vec_x2): |
| +L(last_vec_x2): |
| tzcntl %eax, %eax |
| -# ifdef USE_AS_WMEMCHR |
| - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax |
| -# else |
| - addq $(VEC_SIZE * 2), %rax |
| - addq %rdi, %rax |
| -# endif |
| + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax |
| ret |
| |
| .p2align 4 |
| -L(4x_vec_end): |
| - kmovd %k1, %eax |
| - testl %eax, %eax |
| - jnz L(first_vec_x0) |
| - kmovd %k2, %eax |
| - testl %eax, %eax |
| - jnz L(first_vec_x1) |
| - kmovd %k3, %eax |
| - testl %eax, %eax |
| - jnz L(first_vec_x2) |
| - kmovd %k4, %eax |
| - testl %eax, %eax |
| -L(first_vec_x3): |
| +L(last_vec_x3): |
| tzcntl %eax, %eax |
| -# ifdef USE_AS_WMEMCHR |
| - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ |
| - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax |
| -# else |
| - addq $(VEC_SIZE * 3), %rax |
| - addq %rdi, %rax |
| -# endif |
| + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax |
| ret |
| +# endif |
| |
| END (MEMCHR) |
| #endif |
| -- |
| GitLab |
| |