commit 4901009dad8b3ab141ac6e0caebe99e03a67f5eb
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:30 2022 -0700

    x86: Optimize memrchr-sse2.S

    The new code:
        1. prioritizes smaller lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic.

    The total code size saving is 394 bytes.
    Geometric Mean of all benchmarks New / Old: 0.874

    Regressions:
        1. The page cross case is now colder, especially re-entry from the
           page cross case if a match is not found in the first VEC
           (roughly 50%). My general opinion is that this is acceptable
           given the "coldness" of this case (less than 4%) and the general
           performance improvement in the other, far more common cases.

        2. There are regressions of 5-15% for medium/large user-arg lengths
           that have a match in the first VEC. This is because the logic
           was rewritten to optimize finds in the first VEC when the
           user-arg length is shorter (where we see roughly 20-50%
           performance improvements). It is not always the case that this
           is a regression; my intuition is that some frontend quirk
           partially explains the data, although I haven't been able to
           find the root cause.

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit 731feee3869550e93177e604604c1765d81de571)
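For orientation, the scan strategy described above boils down to "check the end of the buffer first, then walk backwards". A rough C sketch is shown below; it is only an illustration of that ordering (the routine itself works on 16-byte vectors with pcmpeqb/pmovmskb/bsr and handles alignment and page crossing separately), and the function name is made up for this note.

    #include <stddef.h>

    /* Illustrative only: the assembly checks the final VEC_SIZE (16) bytes
       first -- the hot path for short lengths -- then steps backwards one
       vector at a time, unrolling 4x for long remaining lengths.  Byte-wise
       the result is equivalent to this backwards scan.  */
    static void *
    memrchr_sketch (const void *s, int c, size_t n)
    {
      const unsigned char *base = s;
      const unsigned char *p = base + n;   /* end pointer, kept in %rcx */
      while (p > base)
        if (*--p == (unsigned char) c)
          return (void *) p;
      return NULL;
    }
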
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index cc2001167d77c83c..c2a5902bf9385c67 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -19,362 +19,333 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#define VEC_SIZE			16
+#define PAGE_SIZE			4096
 
 	.text
-ENTRY (__memrchr)
-	movd	%esi, %xmm1
-
-	sub	$16, %RDX_LP
-	jbe	L(length_less16)
-
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	%RDX_LP, %RDI_LP
-	pshufd	$0, %xmm1, %xmm1
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-
-/* Check if there is a match.  */
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(loop_prolog)
-
-	add	$16, %rdi
-	add	$16, %rdx
-	and	$-16, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(loop_prolog):
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	mov	%edi, %ecx
-	and	$63, %ecx
-	jz	L(align64_loop)
-
-	add	$64, %rdi
-	add	$64, %rdx
-	and	$-64, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(align64_loop):
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm1, %xmm4
-
-	pmaxub	%xmm3, %xmm0
-	pmaxub	%xmm4, %xmm2
-	pmaxub	%xmm0, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm2
-
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	(%rdi), %xmm1
-
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	pmovmskb	%xmm1, %eax
-	bsr	%eax, %eax
-
-	add	%rdi, %rax
+ENTRY_P2ALIGN(__memrchr, 6)
+#ifdef __ILP32__
+	/* Clear upper bits.  */
+	mov	%RDX_LP, %RDX_LP
+#endif
+	movd	%esi, %xmm0
+
+	/* Get end pointer.  */
+	leaq	(%rdx, %rdi), %rcx
+
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	jz	L(page_cross)
+
+	/* NB: This load happens regardless of whether rdx (len) is zero. Since
+	   it doesn't cross a page and the standard guarantees any pointer has
+	   at least one valid byte, this load must be safe. For the entire
+	   history of the x86 memrchr implementation this has been possible, so
+	   no code "should" be relying on a zero-length check before this load.
+	   The zero-length check is moved to the page cross case because it is
+	   pretty cold and including it would push the hot case (len <= VEC_SIZE)
+	   across 2 cache lines.  */
+	movups	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+	   zero.  */
+	bsrl	%eax, %eax
+	jz	L(ret_0)
+	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+	   if out of bounds.  */
+	addl	%edx, %eax
+	jl	L(zero_0)
+	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+	   ptr.  */
+	addq	%rdi, %rax
+L(ret_0):
 	ret
 
-	.p2align 4
-L(exit_loop):
-	add	$64, %edx
-	cmp	$32, %edx
-	jbe	L(exit_loop_32)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16_1)
-	cmp	$48, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches0_1)
-	xor	%eax, %eax
+	.p2align 4,, 5
+L(ret_vec_x0):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(exit_loop_32):
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48_1)
-	cmp	$16, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	32(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches32_1)
-	xor	%eax, %eax
+	.p2align 4,, 2
+L(zero_0):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches0):
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
 
-	.p2align 4
-L(matches32):
-	bsr	%eax, %eax
-	lea	32(%rax, %rdi), %rax
+	.p2align 4,, 8
+L(more_1x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	/* Align rcx (pointer to string).  */
+	decq	%rcx
+	andq	$-VEC_SIZE, %rcx
+
+	movq	%rcx, %rdx
+	/* NB: We could consistently save 1 byte in this pattern with `movaps
+	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+	   that it adds more frontend uops (even if the moves can be eliminated)
+	   and, some percentage of the time, actual backend uops.  */
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	subq	%rdi, %rdx
+	pmovmskb %xmm1, %eax
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	subl	$VEC_SIZE, %edx
+	jbe	L(ret_vec_x0_test)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_1)
+	addl	%edx, %eax
+	jl	L(zero_0)
+	addq	%rdi, %rax
+L(ret_1):
 	ret
 
-	.p2align 4
-L(matches48):
-	bsr	%eax, %eax
-	lea	48(%rax, %rdi), %rax
+	/* Don't align. Otherwise losing the 2-byte encoding of the jump to
+	   L(page_cross) causes the hot path (length <= VEC_SIZE) to span
+	   multiple cache lines.  Naturally aligned % 16 to 8 bytes.  */
+L(page_cross):
+	/* Zero length check.  */
+	testq	%rdx, %rdx
+	jz	L(zero_0)
+
+	leaq	-1(%rcx), %r8
+	andq	$-(VEC_SIZE), %r8
+
+	movaps	(%r8), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	negl	%ecx
+	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+	   explicitly.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	shl	%cl, %esi
+	movzwl	%si, %eax
+	leaq	(%rdi, %rdx), %rcx
+	cmpq	%rdi, %r8
+	ja	L(more_1x_vec)
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_2)
+	addl	%edx, %eax
+	jl	L(zero_1)
+	addq	%rdi, %rax
+L(ret_2):
 	ret
 
-	.p2align 4
-L(matches0_1):
-	bsr	%eax, %eax
-	sub	$64, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	add	%rdi, %rax
+	/* Fits in the aligning bytes.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches16_1):
-	bsr	%eax, %eax
-	sub	$48, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	16(%rdi, %rax), %rax
+	.p2align 4,, 5
+L(ret_vec_x1):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(matches32_1):
-	bsr	%eax, %eax
-	sub	$32, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	32(%rdi, %rax), %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(matches48_1):
-	bsr	%eax, %eax
-	sub	$16, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	48(%rdi, %rax), %rax
-	ret
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(return_null):
-	xor	%eax, %eax
-	ret
 
-	.p2align 4
-L(length_less16_offset0):
-	test	%edx, %edx
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	mov	%dl, %cl
-	pcmpeqb	(%rdi), %xmm1
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
 
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
+	addl	$(VEC_SIZE), %edx
+	jle	L(ret_vec_x2_test)
 
-	pmovmskb	%xmm1, %eax
+L(last_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
 
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
+	subl	$(VEC_SIZE), %edx
+	bsrl	%eax, %eax
+	jz	L(ret_3)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+L(ret_3):
 	ret
 
-	.p2align 4
-L(length_less16):
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	$16, %edx
-
-	pshufd	$0, %xmm1, %xmm1
-
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(length_less16_offset0)
-
-	mov	%cl, %dh
-	mov	%ecx, %esi
-	add	%dl, %dh
-	and	$-16, %rdi
-
-	sub	$16, %dh
-	ja	L(length_less16_part2)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-
-	sar	%cl, %eax
-	mov	%dl, %cl
-
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
-
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 6
+L(ret_vec_x2_test):
+	bsrl	%eax, %eax
+	jz	L(zero_2)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4
-L(length_less16_part2):
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	mov	%dh, %cl
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
+L(zero_2):
+	xorl	%eax, %eax
+	ret
 
-	test	%eax, %eax
-	jnz	L(length_less16_part2_return)
 
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
+	.p2align 4,, 5
+L(ret_vec_x2):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	mov	%esi, %ecx
-	sar	%cl, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	.p2align 4,, 5
+L(ret_vec_x3):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x3)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+	   keeping the code from spilling to the next cache line.  */
+	addq	$(VEC_SIZE * 4 - 1), %rcx
+	andq	$-(VEC_SIZE * 4), %rcx
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
+	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
+	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
+	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm0, %xmm4
+
+	por	%xmm1, %xmm2
+	por	%xmm3, %xmm4
+	por	%xmm2, %xmm4
+
+	pmovmskb %xmm4, %esi
+	testl	%esi, %esi
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	%rdx, %rcx
+	jne	L(loop_4x_vec)
+
+	subl	%edi, %edx
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 2
+L(last_4x_vec):
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+	bsrl	%eax, %eax
+	jz	L(ret_4)
+	addl	%edx, %eax
+	jl	L(zero_3)
+	addq	%rdi, %rax
+L(ret_4):
 	ret
 
-	.p2align 4
-L(length_less16_part2_return):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 3
+L(loop_end):
+	pmovmskb %xmm1, %eax
+	sall	$16, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm3, %eax
+	/* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If eax is non-zero
+	   then there is a CHAR in VEC3 and bsrl will use that position.  */
+	sall	$16, %eax
+	orl	%esi, %eax
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
 	ret
 
-END (__memrchr)
+L(ret_vec_end):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
+	ret
+	/* Used in L(last_4x_vec). In the same cache line. This is just spare
+	   aligning bytes.  */
+L(zero_3):
+	xorl	%eax, %eax
+	ret
+	/* 2 bytes from the next cache line.  */
+END(__memrchr)
 weak_alias (__memrchr, memrchr)
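
The masking done at the L(page_cross) entry above (negl %ecx; andl $(VEC_SIZE - 1), %ecx; shl %cl, %esi; movzwl %si, %eax) can be read as the C sketch below. It is a reading aid only; `mask16`, `end` and the function name are invented here to stand for the values the assembly keeps in %esi and %rcx.

    #include <stdint.h>

    /* mask16: pmovmskb result for the 16 aligned bytes at
       base = (end - 1) & ~(uintptr_t) 15, bit i corresponding to base[i].
       end:    the exclusive end pointer of the buffer.  */
    static unsigned int
    page_cross_mask (unsigned int mask16, uintptr_t end)
    {
      unsigned int shift = (unsigned int) -end & 15;   /* negl; andl */
      /* Shifting left by `shift` moves the bits for base[16 - shift .. 15]
         (the bytes at or past `end`) above bit 15; truncating to 16 bits
         (movzwl) drops them.  Bit 15 of the result then corresponds to
         end[-1], the last in-bounds byte.  */
      return (unsigned short) (mask16 << shift);       /* shl %cl; movzwl */
    }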