| commit 0a11305416e287d85c64f04337cfd64b6b350e0c |
| Author: Noah Goldstein <goldstein.w.n@gmail.com> |
| Date: Thu Apr 21 20:52:28 2022 -0500 |
| |
| x86: Optimize {str|wcs}rchr-sse2 |
| |
| The new code unrolls the main loop slightly without adding too much |
| overhead and minimizes the comparisons for the search CHAR. |
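| |
| As an illustration only (not part of the patch), here is a minimal |
| C sketch of the loop strategy, using SSE2 intrinsics; the names are |
| invented, and the assembly's header and page-cross handling is |
| replaced by an alignment assumption: |
| |
| #include <emmintrin.h> |
| #include <stddef.h> |
| |
| char * |
| strrchr_sketch (const char *s, int c) |
| { |
|   /* Assumes S is 32-byte aligned so the aligned loads below never |
|      cross a page; the real code reaches this state via its header |
|      and page-cross paths.  */ |
|   const __m128i zero = _mm_setzero_si128 (); |
|   const __m128i ch = _mm_set1_epi8 ((char) c); |
|   const char *last_base = NULL; |
|   unsigned int last_mask = 0; |
| |
|   for (;; s += 32) |
|     { |
|       __m128i v0 = _mm_load_si128 ((const __m128i *) s); |
|       __m128i v1 = _mm_load_si128 ((const __m128i *) (s + 16)); |
|       /* A byte of pminub (v0, v1) is zero iff either input byte is |
|          zero, so one compare against zero covers both vectors.  */ |
|       unsigned int z = _mm_movemask_epi8 |
|         (_mm_cmpeq_epi8 (_mm_min_epu8 (v0, v1), zero)); |
|       unsigned int m0 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, ch)); |
|       unsigned int m1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, ch)); |
|       unsigned int m = m0 | (m1 << 16); |
|       if (z == 0) |
|         { |
|           /* No null term: just remember the most recent block with |
|              a match; no result work happens in the hot loop.  */ |
|           if (m != 0) |
|             { |
|               last_base = s; |
|               last_mask = m; |
|             } |
|           continue; |
|         } |
|       /* Null term is in this block: drop matches past the first |
|          null, then the highest surviving bit is the answer.  */ |
|       unsigned int z0 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, zero)); |
|       unsigned int z1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, zero)); |
|       unsigned int zz = z0 | (z1 << 16); |
|       m &= zz ^ (zz - 1);  /* Bits up to/including first null.  */ |
|       if (m != 0) |
|         return (char *) (s + (31 - __builtin_clz (m))); |
|       if (last_mask != 0) |
|         return (char *) (last_base + (31 - __builtin_clz (last_mask))); |
|       return NULL; |
|     } |
| } |
| |
| The assembly keeps the same loop state (a saved pointer plus raw |
| match masks) and likewise defers all bsr/result work until a null |
| is seen. |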
| |
| Geometric Mean of all benchmarks New / Old: 0.741 |
| See email for all results. |
| |
| Full xcheck passes on x86_64 with and without multiarch enabled. |
| Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
| |
| (cherry picked from commit 5307aa9c1800f36a64c183c091c9af392c1fa75c) |
| |
| diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S |
| index 67c30d0260cef8a3..a56300bc1830dedd 100644 |
| |
| |
| @@ -17,7 +17,7 @@ |
| <https://www.gnu.org/licenses/>. */ |
| |
| #if IS_IN (libc) |
| -# define strrchr __strrchr_sse2 |
| +# define STRRCHR __strrchr_sse2 |
| |
| # undef weak_alias |
| # define weak_alias(strrchr, rindex) |
| diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S |
| index a36034b40afe8d3d..00f69f2be77a43a0 100644 |
| |
| |
| @@ -17,7 +17,6 @@ |
| <https://www.gnu.org/licenses/>. */ |
| |
| #if IS_IN (libc) |
| -# define wcsrchr __wcsrchr_sse2 |
| +# define STRRCHR __wcsrchr_sse2 |
| #endif |
| - |
| #include "../wcsrchr.S" |
| diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S |
| index dfd09fe9508cb5bc..fc1598bb11417fd5 100644 |
| |
| |
| @@ -19,210 +19,360 @@ |
| |
| #include <sysdep.h> |
| |
| +#ifndef STRRCHR |
| +# define STRRCHR strrchr |
| +#endif |
| + |
| +#ifdef USE_AS_WCSRCHR |
| +# define PCMPEQ pcmpeqd |
| +# define CHAR_SIZE 4 |
| +# define PMINU pminud |
| +#else |
| +# define PCMPEQ pcmpeqb |
| +# define CHAR_SIZE 1 |
| +# define PMINU pminub |
| +#endif |
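| + |
| +/* wcsrchr.S reuses this source to build wcsrchr: it defines |
| + USE_AS_WCSRCHR and NO_PMINU before including this file. */ |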
| + |
| +#define PAGE_SIZE 4096 |
| +#define VEC_SIZE 16 |
| + |
| .text |
| -ENTRY (strrchr) |
| - movd %esi, %xmm1 |
| +ENTRY(STRRCHR) |
| + movd %esi, %xmm0 |
| movq %rdi, %rax |
| - andl $4095, %eax |
| - punpcklbw %xmm1, %xmm1 |
| - cmpq $4032, %rax |
| - punpcklwd %xmm1, %xmm1 |
| - pshufd $0, %xmm1, %xmm1 |
| + andl $(PAGE_SIZE - 1), %eax |
| +#ifndef USE_AS_WCSRCHR |
| + punpcklbw %xmm0, %xmm0 |
| + punpcklwd %xmm0, %xmm0 |
| +#endif |
| + pshufd $0, %xmm0, %xmm0 |
| + cmpl $(PAGE_SIZE - VEC_SIZE), %eax |
| ja L(cross_page) |
| - movdqu (%rdi), %xmm0 |
| + |
| +L(cross_page_continue): |
| + movups (%rdi), %xmm1 |
| pxor %xmm2, %xmm2 |
| - movdqa %xmm0, %xmm3 |
| - pcmpeqb %xmm1, %xmm0 |
| - pcmpeqb %xmm2, %xmm3 |
| - pmovmskb %xmm0, %ecx |
| - pmovmskb %xmm3, %edx |
| - testq %rdx, %rdx |
| - je L(next_48_bytes) |
| - leaq -1(%rdx), %rax |
| - xorq %rdx, %rax |
| - andq %rcx, %rax |
| - je L(exit) |
| - bsrq %rax, %rax |
| + PCMPEQ %xmm1, %xmm2 |
| + pmovmskb %xmm2, %ecx |
| + testl %ecx, %ecx |
| + jz L(aligned_more) |
| + |
| + PCMPEQ %xmm0, %xmm1 |
| + pmovmskb %xmm1, %eax |
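| + /* ecx ^ (ecx - 1) keeps only the bits up to and including the |
| + first null-match bit, so the following `andl` drops CHAR |
| + matches past the end of the string. */ |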
| + leal -1(%rcx), %edx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(ret0) |
| + bsrl %eax, %eax |
| addq %rdi, %rax |
| + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If |
| + search CHAR is zero we are correct. Either way `andq |
| + $-CHAR_SIZE, %rax` gets the correct result. */ |
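| + /* (For wcsrchr, pcmpeqd + pmovmskb set four mask bits per |
| + matching wchar, so bsrl lands on the wchar's last byte, 3 |
| + past its start.) */ |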
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| +L(ret0): |
| ret |
| |
| + /* The return paths for the first VEC x1/x2 have a hard-coded |
| + backward search for earlier matches. */ |
| .p2align 4 |
| -L(next_48_bytes): |
| - movdqu 16(%rdi), %xmm4 |
| - movdqa %xmm4, %xmm5 |
| - movdqu 32(%rdi), %xmm3 |
| - pcmpeqb %xmm1, %xmm4 |
| - pcmpeqb %xmm2, %xmm5 |
| - movdqu 48(%rdi), %xmm0 |
| - pmovmskb %xmm5, %edx |
| - movdqa %xmm3, %xmm5 |
| - pcmpeqb %xmm1, %xmm3 |
| - pcmpeqb %xmm2, %xmm5 |
| - pcmpeqb %xmm0, %xmm2 |
| - salq $16, %rdx |
| - pmovmskb %xmm3, %r8d |
| - pmovmskb %xmm5, %eax |
| - pmovmskb %xmm2, %esi |
| - salq $32, %r8 |
| - salq $32, %rax |
| - pcmpeqb %xmm1, %xmm0 |
| - orq %rdx, %rax |
| - movq %rsi, %rdx |
| - pmovmskb %xmm4, %esi |
| - salq $48, %rdx |
| - salq $16, %rsi |
| - orq %r8, %rsi |
| - orq %rcx, %rsi |
| - pmovmskb %xmm0, %ecx |
| - salq $48, %rcx |
| - orq %rcx, %rsi |
| - orq %rdx, %rax |
| - je L(loop_header2) |
| - leaq -1(%rax), %rcx |
| - xorq %rax, %rcx |
| - andq %rcx, %rsi |
| - je L(exit) |
| - bsrq %rsi, %rsi |
| - leaq (%rdi,%rsi), %rax |
| +L(first_vec_x0_test): |
| + PCMPEQ %xmm0, %xmm1 |
| + pmovmskb %xmm1, %eax |
| + testl %eax, %eax |
| + jz L(ret0) |
| + bsrl %eax, %eax |
| + addq %r8, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| ret |
| |
| .p2align 4 |
| -L(loop_header2): |
| - testq %rsi, %rsi |
| - movq %rdi, %rcx |
| - je L(no_c_found) |
| -L(loop_header): |
| - addq $64, %rdi |
| - pxor %xmm7, %xmm7 |
| - andq $-64, %rdi |
| - jmp L(loop_entry) |
| +L(first_vec_x1): |
| + PCMPEQ %xmm0, %xmm2 |
| + pmovmskb %xmm2, %eax |
| + leal -1(%rcx), %edx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(first_vec_x0_test) |
| + bsrl %eax, %eax |
| + leaq (VEC_SIZE)(%rdi, %rax), %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| + ret |
| |
| .p2align 4 |
| -L(loop64): |
| - testq %rdx, %rdx |
| - cmovne %rdx, %rsi |
| - cmovne %rdi, %rcx |
| - addq $64, %rdi |
| -L(loop_entry): |
| - movdqa 32(%rdi), %xmm3 |
| - pxor %xmm6, %xmm6 |
| - movdqa 48(%rdi), %xmm2 |
| - movdqa %xmm3, %xmm0 |
| - movdqa 16(%rdi), %xmm4 |
| - pminub %xmm2, %xmm0 |
| - movdqa (%rdi), %xmm5 |
| - pminub %xmm4, %xmm0 |
| - pminub %xmm5, %xmm0 |
| - pcmpeqb %xmm7, %xmm0 |
| - pmovmskb %xmm0, %eax |
| - movdqa %xmm5, %xmm0 |
| - pcmpeqb %xmm1, %xmm0 |
| - pmovmskb %xmm0, %r9d |
| - movdqa %xmm4, %xmm0 |
| - pcmpeqb %xmm1, %xmm0 |
| - pmovmskb %xmm0, %edx |
| - movdqa %xmm3, %xmm0 |
| - pcmpeqb %xmm1, %xmm0 |
| - salq $16, %rdx |
| - pmovmskb %xmm0, %r10d |
| - movdqa %xmm2, %xmm0 |
| - pcmpeqb %xmm1, %xmm0 |
| - salq $32, %r10 |
| - orq %r10, %rdx |
| - pmovmskb %xmm0, %r8d |
| - orq %r9, %rdx |
| - salq $48, %r8 |
| - orq %r8, %rdx |
| +L(first_vec_x1_test): |
| + PCMPEQ %xmm0, %xmm2 |
| + pmovmskb %xmm2, %eax |
| testl %eax, %eax |
| - je L(loop64) |
| - pcmpeqb %xmm6, %xmm4 |
| - pcmpeqb %xmm6, %xmm3 |
| - pcmpeqb %xmm6, %xmm5 |
| - pmovmskb %xmm4, %eax |
| - pmovmskb %xmm3, %r10d |
| - pcmpeqb %xmm6, %xmm2 |
| - pmovmskb %xmm5, %r9d |
| - salq $32, %r10 |
| - salq $16, %rax |
| - pmovmskb %xmm2, %r8d |
| - orq %r10, %rax |
| - orq %r9, %rax |
| - salq $48, %r8 |
| - orq %r8, %rax |
| - leaq -1(%rax), %r8 |
| - xorq %rax, %r8 |
| - andq %r8, %rdx |
| - cmovne %rdi, %rcx |
| - cmovne %rdx, %rsi |
| - bsrq %rsi, %rsi |
| - leaq (%rcx,%rsi), %rax |
| + jz L(first_vec_x0_test) |
| + bsrl %eax, %eax |
| + leaq (VEC_SIZE)(%rdi, %rax), %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| + ret |
| + |
| + .p2align 4 |
| +L(first_vec_x2): |
| + PCMPEQ %xmm0, %xmm3 |
| + pmovmskb %xmm3, %eax |
| + leal -1(%rcx), %edx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(first_vec_x1_test) |
| + bsrl %eax, %eax |
| + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| + ret |
| + |
| + .p2align 4 |
| +L(aligned_more): |
| + /* Save original pointer if match was in VEC 0. */ |
| + movq %rdi, %r8 |
| + andq $-VEC_SIZE, %rdi |
| + |
| + movaps VEC_SIZE(%rdi), %xmm2 |
| + pxor %xmm3, %xmm3 |
| + PCMPEQ %xmm2, %xmm3 |
| + pmovmskb %xmm3, %ecx |
| + testl %ecx, %ecx |
| + jnz L(first_vec_x1) |
| + |
| + movaps (VEC_SIZE * 2)(%rdi), %xmm3 |
| + pxor %xmm4, %xmm4 |
| + PCMPEQ %xmm3, %xmm4 |
| + pmovmskb %xmm4, %ecx |
| + testl %ecx, %ecx |
| + jnz L(first_vec_x2) |
| + |
| + addq $VEC_SIZE, %rdi |
| + /* Save pointer again before realigning. */ |
| + movq %rdi, %rsi |
| + andq $-(VEC_SIZE * 2), %rdi |
| + .p2align 4 |
| +L(first_loop): |
| + /* Do 2x VEC at a time. */ |
| + movaps (VEC_SIZE * 2)(%rdi), %xmm4 |
| + movaps (VEC_SIZE * 3)(%rdi), %xmm5 |
| + /* Since SSE2 has no pminud, wcsrchr needs separate logic for |
| + detecting zero. Note: if this is found to be a bottleneck it |
| + may be worth adding an SSE4.1 wcsrchr implementation. */ |
| +#ifdef NO_PMINU |
| + movaps %xmm5, %xmm6 |
| + pxor %xmm8, %xmm8 |
| + |
| + PCMPEQ %xmm8, %xmm5 |
| + PCMPEQ %xmm4, %xmm8 |
| + por %xmm5, %xmm8 |
| +#else |
| + movaps %xmm5, %xmm6 |
| + PMINU %xmm4, %xmm5 |
| +#endif |
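| + /* For strrchr, a byte of PMINU(%xmm4, %xmm5) is zero iff the |
| + corresponding byte of either vector is zero, so the single |
| + PCMPEQ against zero below tests both vectors at once. */ |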
| + |
| + movaps %xmm4, %xmm9 |
| + PCMPEQ %xmm0, %xmm4 |
| + PCMPEQ %xmm0, %xmm6 |
| + movaps %xmm6, %xmm7 |
| + por %xmm4, %xmm6 |
| +#ifndef NO_PMINU |
| + pxor %xmm8, %xmm8 |
| + PCMPEQ %xmm5, %xmm8 |
| +#endif |
| + pmovmskb %xmm8, %ecx |
| + pmovmskb %xmm6, %eax |
| + |
| + addq $(VEC_SIZE * 2), %rdi |
| + /* Use `addl` so that 1) we can undo it with `subl` and 2) it |
| + can macro-fuse with `jz`. The sum is zero iff both 16-bit |
| + masks are zero, so a single flag test covers the null and |
| + CHAR checks. */ |
| + addl %ecx, %eax |
| + jz L(first_loop) |
| + |
| + /* Check if there is a zero match. */ |
| + testl %ecx, %ecx |
| + jz L(second_loop_match) |
| + |
| + /* Check if there was a match in the last iteration. */ |
| + subl %ecx, %eax |
| + jnz L(new_match) |
| + |
| +L(first_loop_old_match): |
| + PCMPEQ %xmm0, %xmm2 |
| + PCMPEQ %xmm0, %xmm3 |
| + pmovmskb %xmm2, %ecx |
| + pmovmskb %xmm3, %eax |
| + addl %eax, %ecx |
| + jz L(first_vec_x0_test) |
| + /* NB: We could move this shift to before the branch and save a |
| + bit of code size / performance on the fall through. The |
| + branch leads to the null case, which generally seems hotter |
| + than a CHAR match in the first 3x VEC. */ |
| + sall $16, %eax |
| + orl %ecx, %eax |
| + |
| + bsrl %eax, %eax |
| + addq %rsi, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| + ret |
| + |
| + .p2align 4 |
| +L(new_match): |
| + pxor %xmm6, %xmm6 |
| + PCMPEQ %xmm9, %xmm6 |
| + pmovmskb %xmm6, %eax |
| + sall $16, %ecx |
| + orl %eax, %ecx |
| + |
| + /* We can't reuse either of the old comparisons: since we mask |
| + off zeros after the first zero (instead of using the full |
| + comparison) we can't guarantee no interference between a |
| + match after the end of the string and a valid match. */ |
| + pmovmskb %xmm4, %eax |
| + pmovmskb %xmm7, %edx |
| + sall $16, %edx |
| + orl %edx, %eax |
| + |
| + leal -1(%ecx), %edx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(first_loop_old_match) |
| + bsrl %eax, %eax |
| + addq %rdi, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| ret |
| |
| + /* Save minimum state for getting most recent match. We can |
| + throw out all previous work. */ |
| .p2align 4 |
| -L(no_c_found): |
| - movl $1, %esi |
| - xorl %ecx, %ecx |
| - jmp L(loop_header) |
| +L(second_loop_match): |
| + movq %rdi, %rsi |
| + movaps %xmm4, %xmm2 |
| + movaps %xmm7, %xmm3 |
| |
| .p2align 4 |
| -L(exit): |
| - xorl %eax, %eax |
| +L(second_loop): |
| + movaps (VEC_SIZE * 2)(%rdi), %xmm4 |
| + movaps (VEC_SIZE * 3)(%rdi), %xmm5 |
| + /* Since SSE2 has no pminud, wcsrchr needs separate logic for |
| + detecting zero. Note: if this is found to be a bottleneck it |
| + may be worth adding an SSE4.1 wcsrchr implementation. */ |
| +#ifdef NO_PMINU |
| + movaps %xmm5, %xmm6 |
| + pxor %xmm8, %xmm8 |
| + |
| + PCMPEQ %xmm8, %xmm5 |
| + PCMPEQ %xmm4, %xmm8 |
| + por %xmm5, %xmm8 |
| +#else |
| + movaps %xmm5, %xmm6 |
| + PMINU %xmm4, %xmm5 |
| +#endif |
| + |
| + movaps %xmm4, %xmm9 |
| + PCMPEQ %xmm0, %xmm4 |
| + PCMPEQ %xmm0, %xmm6 |
| + movaps %xmm6, %xmm7 |
| + por %xmm4, %xmm6 |
| +#ifndef NO_PMINU |
| + pxor %xmm8, %xmm8 |
| + PCMPEQ %xmm5, %xmm8 |
| +#endif |
| + |
| + pmovmskb %xmm8, %ecx |
| + pmovmskb %xmm6, %eax |
| + |
| + addq $(VEC_SIZE * 2), %rdi |
| + /* Either null term or new occurrence of CHAR. */ |
| + addl %ecx, %eax |
| + jz L(second_loop) |
| + |
| + /* No null term, so it must be a new occurrence of CHAR. */ |
| + testl %ecx, %ecx |
| + jz L(second_loop_match) |
| + |
| + |
| + subl %ecx, %eax |
| + jnz L(second_loop_new_match) |
| + |
| +L(second_loop_old_match): |
| + pmovmskb %xmm2, %ecx |
| + pmovmskb %xmm3, %eax |
| + sall $16, %eax |
| + orl %ecx, %eax |
| + bsrl %eax, %eax |
| + addq %rsi, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| ret |
| |
| .p2align 4 |
| +L(second_loop_new_match): |
| + pxor %xmm6, %xmm6 |
| + PCMPEQ %xmm9, %xmm6 |
| + pmovmskb %xmm6, %eax |
| + sall $16, %ecx |
| + orl %eax, %ecx |
| + |
| + /* We can't reuse either of the old comparisons: since we mask |
| + off zeros after the first zero (instead of using the full |
| + comparison) we can't guarantee no interference between a |
| + match after the end of the string and a valid match. */ |
| + pmovmskb %xmm4, %eax |
| + pmovmskb %xmm7, %edx |
| + sall $16, %edx |
| + orl %edx, %eax |
| + |
| + leal -1(%ecx), %edx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(second_loop_old_match) |
| + bsrl %eax, %eax |
| + addq %rdi, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| + ret |
| + |
| + .p2align 4,, 4 |
| L(cross_page): |
| - movq %rdi, %rax |
| - pxor %xmm0, %xmm0 |
| - andq $-64, %rax |
| - movdqu (%rax), %xmm5 |
| - movdqa %xmm5, %xmm6 |
| - movdqu 16(%rax), %xmm4 |
| - pcmpeqb %xmm1, %xmm5 |
| - pcmpeqb %xmm0, %xmm6 |
| - movdqu 32(%rax), %xmm3 |
| - pmovmskb %xmm6, %esi |
| - movdqa %xmm4, %xmm6 |
| - movdqu 48(%rax), %xmm2 |
| - pcmpeqb %xmm1, %xmm4 |
| - pcmpeqb %xmm0, %xmm6 |
| - pmovmskb %xmm6, %edx |
| - movdqa %xmm3, %xmm6 |
| - pcmpeqb %xmm1, %xmm3 |
| - pcmpeqb %xmm0, %xmm6 |
| - pcmpeqb %xmm2, %xmm0 |
| - salq $16, %rdx |
| - pmovmskb %xmm3, %r9d |
| - pmovmskb %xmm6, %r8d |
| - pmovmskb %xmm0, %ecx |
| - salq $32, %r9 |
| - salq $32, %r8 |
| - pcmpeqb %xmm1, %xmm2 |
| - orq %r8, %rdx |
| - salq $48, %rcx |
| - pmovmskb %xmm5, %r8d |
| - orq %rsi, %rdx |
| - pmovmskb %xmm4, %esi |
| - orq %rcx, %rdx |
| - pmovmskb %xmm2, %ecx |
| - salq $16, %rsi |
| - salq $48, %rcx |
| - orq %r9, %rsi |
| - orq %r8, %rsi |
| - orq %rcx, %rsi |
| + movq %rdi, %rsi |
| + andq $-VEC_SIZE, %rsi |
| + movaps (%rsi), %xmm1 |
| + pxor %xmm2, %xmm2 |
| + PCMPEQ %xmm1, %xmm2 |
| + pmovmskb %xmm2, %edx |
| movl %edi, %ecx |
| - subl %eax, %ecx |
| - shrq %cl, %rdx |
| - shrq %cl, %rsi |
| - testq %rdx, %rdx |
| - je L(loop_header2) |
| - leaq -1(%rdx), %rax |
| - xorq %rdx, %rax |
| - andq %rax, %rsi |
| - je L(exit) |
| - bsrq %rsi, %rax |
| + andl $(VEC_SIZE - 1), %ecx |
| + sarl %cl, %edx |
| + jz L(cross_page_continue) |
| + PCMPEQ %xmm0, %xmm1 |
| + pmovmskb %xmm1, %eax |
| + sarl %cl, %eax |
| + leal -1(%rdx), %ecx |
| + xorl %edx, %ecx |
| + andl %ecx, %eax |
| + jz L(ret1) |
| + bsrl %eax, %eax |
| addq %rdi, %rax |
| +#ifdef USE_AS_WCSRCHR |
| + andq $-CHAR_SIZE, %rax |
| +#endif |
| +L(ret1): |
| ret |
| -END (strrchr) |
| +END(STRRCHR) |
| |
| -weak_alias (strrchr, rindex) |
| -libc_hidden_builtin_def (strrchr) |
| +#ifndef USE_AS_WCSRCHR |
| + weak_alias (STRRCHR, rindex) |
| + libc_hidden_builtin_def (STRRCHR) |
| +#endif |
| diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S |
| index 6b318d3f29de9a9e..9006f2220963d76c 100644 |
| |
| |
| @@ -17,266 +17,12 @@ |
| License along with the GNU C Library; if not, see |
| <https://www.gnu.org/licenses/>. */ |
| |
| -#include <sysdep.h> |
| |
| - .text |
| -ENTRY (wcsrchr) |
| +#define USE_AS_WCSRCHR 1 |
| +#define NO_PMINU 1 |
| |
| - movd %rsi, %xmm1 |
| - mov %rdi, %rcx |
| - punpckldq %xmm1, %xmm1 |
| - pxor %xmm2, %xmm2 |
| - punpckldq %xmm1, %xmm1 |
| - and $63, %rcx |
| - cmp $48, %rcx |
| - ja L(crosscache) |
| +#ifndef STRRCHR |
| +# define STRRCHR wcsrchr |
| +#endif |
| |
| - movdqu (%rdi), %xmm0 |
| - pcmpeqd %xmm0, %xmm2 |
| - pcmpeqd %xmm1, %xmm0 |
| - pmovmskb %xmm2, %rcx |
| - pmovmskb %xmm0, %rax |
| - add $16, %rdi |
| - |
| - test %rax, %rax |
| - jnz L(unaligned_match1) |
| - |
| - test %rcx, %rcx |
| - jnz L(return_null) |
| - |
| - and $-16, %rdi |
| - xor %r8, %r8 |
| - jmp L(loop) |
| - |
| - .p2align 4 |
| -L(unaligned_match1): |
| - test %rcx, %rcx |
| - jnz L(prolog_find_zero_1) |
| - |
| - mov %rax, %r8 |
| - mov %rdi, %rsi |
| - and $-16, %rdi |
| - jmp L(loop) |
| - |
| - .p2align 4 |
| -L(crosscache): |
| - and $15, %rcx |
| - and $-16, %rdi |
| - pxor %xmm3, %xmm3 |
| - movdqa (%rdi), %xmm0 |
| - pcmpeqd %xmm0, %xmm3 |
| - pcmpeqd %xmm1, %xmm0 |
| - pmovmskb %xmm3, %rdx |
| - pmovmskb %xmm0, %rax |
| - shr %cl, %rdx |
| - shr %cl, %rax |
| - add $16, %rdi |
| - |
| - test %rax, %rax |
| - jnz L(unaligned_match) |
| - |
| - test %rdx, %rdx |
| - jnz L(return_null) |
| - |
| - xor %r8, %r8 |
| - jmp L(loop) |
| - |
| - .p2align 4 |
| -L(unaligned_match): |
| - test %rdx, %rdx |
| - jnz L(prolog_find_zero) |
| - |
| - mov %rax, %r8 |
| - lea (%rdi, %rcx), %rsi |
| - |
| -/* Loop start on aligned string. */ |
| - .p2align 4 |
| -L(loop): |
| - movdqa (%rdi), %xmm0 |
| - pcmpeqd %xmm0, %xmm2 |
| - add $16, %rdi |
| - pcmpeqd %xmm1, %xmm0 |
| - pmovmskb %xmm2, %rcx |
| - pmovmskb %xmm0, %rax |
| - or %rax, %rcx |
| - jnz L(matches) |
| - |
| - movdqa (%rdi), %xmm3 |
| - pcmpeqd %xmm3, %xmm2 |
| - add $16, %rdi |
| - pcmpeqd %xmm1, %xmm3 |
| - pmovmskb %xmm2, %rcx |
| - pmovmskb %xmm3, %rax |
| - or %rax, %rcx |
| - jnz L(matches) |
| - |
| - movdqa (%rdi), %xmm4 |
| - pcmpeqd %xmm4, %xmm2 |
| - add $16, %rdi |
| - pcmpeqd %xmm1, %xmm4 |
| - pmovmskb %xmm2, %rcx |
| - pmovmskb %xmm4, %rax |
| - or %rax, %rcx |
| - jnz L(matches) |
| - |
| - movdqa (%rdi), %xmm5 |
| - pcmpeqd %xmm5, %xmm2 |
| - add $16, %rdi |
| - pcmpeqd %xmm1, %xmm5 |
| - pmovmskb %xmm2, %rcx |
| - pmovmskb %xmm5, %rax |
| - or %rax, %rcx |
| - jz L(loop) |
| - |
| - .p2align 4 |
| -L(matches): |
| - test %rax, %rax |
| - jnz L(match) |
| -L(return_value): |
| - test %r8, %r8 |
| - jz L(return_null) |
| - mov %r8, %rax |
| - mov %rsi, %rdi |
| - |
| - test $15 << 4, %ah |
| - jnz L(match_fourth_wchar) |
| - test %ah, %ah |
| - jnz L(match_third_wchar) |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(match): |
| - pmovmskb %xmm2, %rcx |
| - test %rcx, %rcx |
| - jnz L(find_zero) |
| - mov %rax, %r8 |
| - mov %rdi, %rsi |
| - jmp L(loop) |
| - |
| - .p2align 4 |
| -L(find_zero): |
| - test $15, %cl |
| - jnz L(find_zero_in_first_wchar) |
| - test %cl, %cl |
| - jnz L(find_zero_in_second_wchar) |
| - test $15, %ch |
| - jnz L(find_zero_in_third_wchar) |
| - |
| - and $1 << 13 - 1, %rax |
| - jz L(return_value) |
| - |
| - test $15 << 4, %ah |
| - jnz L(match_fourth_wchar) |
| - test %ah, %ah |
| - jnz L(match_third_wchar) |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(find_zero_in_first_wchar): |
| - test $1, %rax |
| - jz L(return_value) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(find_zero_in_second_wchar): |
| - and $1 << 5 - 1, %rax |
| - jz L(return_value) |
| - |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(find_zero_in_third_wchar): |
| - and $1 << 9 - 1, %rax |
| - jz L(return_value) |
| - |
| - test %ah, %ah |
| - jnz L(match_third_wchar) |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(prolog_find_zero): |
| - add %rcx, %rdi |
| - mov %rdx, %rcx |
| -L(prolog_find_zero_1): |
| - test $15, %cl |
| - jnz L(prolog_find_zero_in_first_wchar) |
| - test %cl, %cl |
| - jnz L(prolog_find_zero_in_second_wchar) |
| - test $15, %ch |
| - jnz L(prolog_find_zero_in_third_wchar) |
| - |
| - and $1 << 13 - 1, %rax |
| - jz L(return_null) |
| - |
| - test $15 << 4, %ah |
| - jnz L(match_fourth_wchar) |
| - test %ah, %ah |
| - jnz L(match_third_wchar) |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(prolog_find_zero_in_first_wchar): |
| - test $1, %rax |
| - jz L(return_null) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(prolog_find_zero_in_second_wchar): |
| - and $1 << 5 - 1, %rax |
| - jz L(return_null) |
| - |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(prolog_find_zero_in_third_wchar): |
| - and $1 << 9 - 1, %rax |
| - jz L(return_null) |
| - |
| - test %ah, %ah |
| - jnz L(match_third_wchar) |
| - test $15 << 4, %al |
| - jnz L(match_second_wchar) |
| - lea -16(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(match_second_wchar): |
| - lea -12(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(match_third_wchar): |
| - lea -8(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(match_fourth_wchar): |
| - lea -4(%rdi), %rax |
| - ret |
| - |
| - .p2align 4 |
| -L(return_null): |
| - xor %rax, %rax |
| - ret |
| - |
| -END (wcsrchr) |
| +#include "../strrchr.S" |