commit 4901009dad8b3ab141ac6e0caebe99e03a67f5eb
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:30 2022 -0700

    x86: Optimize memrchr-sse2.S

    The new code:
        1. prioritizes smaller lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic.

    The total code size saving is: 394 bytes
    Geometric Mean of all benchmarks New / Old: 0.874

    Regressions:
        1. The page cross case is now colder, especially re-entry from the
           page cross case if a match is not found in the first VEC
           (roughly 50%). My general opinion with this patch is this is
           acceptable given the "coldness" of this case (less than 4%) and
           the general performance improvement in the other far more common
           cases.

        2. There are some regressions of 5-15% for medium/large user-arg
           lengths that have a match in the first VEC. This is because the
           logic was rewritten to optimize finds in the first VEC if the
           user-arg length is shorter (where we see roughly 20-50%
           performance improvements). It is not always the case that this
           is a regression. My intuition is that some frontend quirk is
           partially explaining the data, although I haven't been able to
           find the root cause.

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit 731feee3869550e93177e604604c1765d81de571)

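For readers following the patch, the overall strategy of the rewritten routine
can be sketched in C with SSE2 intrinsics roughly as follows. This is an
illustrative sketch only, not the glibc implementation; the names
memrchr_sse2_sketch and highest_set_bit are invented for the example,
__builtin_clz assumes GCC/Clang, and the alignment and page-cross handling of
the real assembly are deliberately omitted.

/* Sketch: broadcast the search byte, scan 16-byte chunks backwards from
   the end pointer, and use the compare mask plus a bsr-style scan to
   find the highest matching byte.  */
#include <emmintrin.h>
#include <stddef.h>

static inline int
highest_set_bit (unsigned int mask)
{
  /* Equivalent of the `bsr` instruction for a non-zero mask.  */
  return 31 - __builtin_clz (mask);
}

void *
memrchr_sse2_sketch (const void *s, int c, size_t n)
{
  const unsigned char *end = (const unsigned char *) s + n;
  const __m128i vc = _mm_set1_epi8 ((char) c);	/* broadcast CHAR */

  /* Whole 16-byte chunks, walking backwards from the end pointer.  */
  while (n >= 16)
    {
      end -= 16;
      n -= 16;
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) end);
      unsigned int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, vc));
      if (mask != 0)
	return (void *) (end + highest_set_bit (mask));
    }

  /* Scalar tail for the remaining bytes (the real code instead bounds-checks
     vector compare masks and handles page boundaries carefully).  */
  while (n--)
    if (((const unsigned char *) s)[n] == (unsigned char) c)
      return (void *) ((const unsigned char *) s + n);
  return NULL;
}

The assembly below implements the same idea, but works from an end pointer
kept in rcx, keeps the hot len <= VEC_SIZE case on a single cache line, and
handles the partial first/last vector by bounds-checking the bsr result
against the remaining length instead of using a scalar loop.
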
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index cc2001167d77c83c..c2a5902bf9385c67 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -19,362 +19,333 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#define VEC_SIZE			16
+#define PAGE_SIZE			4096
 
 	.text
-ENTRY (__memrchr)
-	movd	%esi, %xmm1
-
-	sub	$16, %RDX_LP
-	jbe	L(length_less16)
-
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	%RDX_LP, %RDI_LP
-	pshufd	$0, %xmm1, %xmm1
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-
-/* Check if there is a match.  */
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(loop_prolog)
-
-	add	$16, %rdi
-	add	$16, %rdx
-	and	$-16, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(loop_prolog):
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	mov	%edi, %ecx
-	and	$63, %ecx
-	jz	L(align64_loop)
-
-	add	$64, %rdi
-	add	$64, %rdx
-	and	$-64, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(align64_loop):
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm1, %xmm4
-
-	pmaxub	%xmm3, %xmm0
-	pmaxub	%xmm4, %xmm2
-	pmaxub	%xmm0, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm2
-
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	(%rdi), %xmm1
-
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	pmovmskb	%xmm1, %eax
-	bsr	%eax, %eax
-
-	add	%rdi, %rax
+ENTRY_P2ALIGN(__memrchr, 6)
+#ifdef __ILP32__
+	/* Clear upper bits.  */
+	mov	%RDX_LP, %RDX_LP
+#endif
+	movd	%esi, %xmm0
+
+	/* Get end pointer.  */
+	leaq	(%rdx, %rdi), %rcx
+
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	jz	L(page_cross)
+
+	/* NB: This load happens regardless of whether rdx (len) is zero. Since
+	   it doesn't cross a page and the standard guarantees that any pointer
+	   has at least one valid byte, this load must be safe. For the entire
+	   history of the x86 memrchr implementation this has been possible, so
+	   no code "should" be relying on a zero-length check before this load.
+	   The zero-length check is moved to the page cross case because it is
+	   pretty cold, and including it here would push the hot case
+	   (len <= VEC_SIZE) into 2 cache lines.  */
+	movups	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+	   zero.  */
+	bsrl	%eax, %eax
+	jz	L(ret_0)
+	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+	   if out of bounds.  */
+	addl	%edx, %eax
+	jl	L(zero_0)
+	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+	   ptr.  */
+	addq	%rdi, %rax
+L(ret_0):
 	ret
 
-	.p2align 4
-L(exit_loop):
-	add	$64, %edx
-	cmp	$32, %edx
-	jbe	L(exit_loop_32)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16_1)
-	cmp	$48, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches0_1)
-	xor	%eax, %eax
+	.p2align 4,, 5
+L(ret_vec_x0):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(exit_loop_32):
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48_1)
-	cmp	$16, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	32(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches32_1)
-	xor	%eax, %eax
+	.p2align 4,, 2
+L(zero_0):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches0):
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
 
-	.p2align 4
-L(matches32):
-	bsr	%eax, %eax
-	lea	32(%rax, %rdi), %rax
+	.p2align 4,, 8
+L(more_1x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	/* Align rcx (pointer to string).  */
+	decq	%rcx
+	andq	$-VEC_SIZE, %rcx
+
+	movq	%rcx, %rdx
+	/* NB: We could consistently save 1 byte in this pattern with `movaps
+	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+	   that it adds more frontend uops (even if the moves can be eliminated)
+	   and some percentage of the time actual backend uops.  */
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	subq	%rdi, %rdx
+	pmovmskb %xmm1, %eax
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	subl	$VEC_SIZE, %edx
+	jbe	L(ret_vec_x0_test)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_1)
+	addl	%edx, %eax
+	jl	L(zero_0)
+	addq	%rdi, %rax
+L(ret_1):
 	ret
 
-	.p2align 4
-L(matches48):
-	bsr	%eax, %eax
-	lea	48(%rax, %rdi), %rax
+	/* Don't align. Otherwise we lose the 2-byte encoding of the jump to
+	   L(page_cross), which causes the hot path (length <= VEC_SIZE) to span
+	   multiple cache lines.  Naturally aligned % 16 to 8 bytes.  */
+L(page_cross):
+	/* Zero length check.  */
+	testq	%rdx, %rdx
+	jz	L(zero_0)
+
+	leaq	-1(%rcx), %r8
+	andq	$-(VEC_SIZE), %r8
+
+	movaps	(%r8), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	negl	%ecx
+	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+	   explicitly.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	shl	%cl, %esi
+	movzwl	%si, %eax
+	leaq	(%rdi, %rdx), %rcx
+	cmpq	%rdi, %r8
+	ja	L(more_1x_vec)
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_2)
+	addl	%edx, %eax
+	jl	L(zero_1)
+	addq	%rdi, %rax
+L(ret_2):
 	ret
 
-	.p2align 4
-L(matches0_1):
-	bsr	%eax, %eax
-	sub	$64, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	add	%rdi, %rax
+	/* Fits in aligning bytes.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches16_1):
-	bsr	%eax, %eax
-	sub	$48, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	16(%rdi, %rax), %rax
+	.p2align 4,, 5
+L(ret_vec_x1):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(matches32_1):
-	bsr	%eax, %eax
-	sub	$32, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	32(%rdi, %rax), %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(matches48_1):
-	bsr	%eax, %eax
-	sub	$16, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	48(%rdi, %rax), %rax
-	ret
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(return_null):
-	xor	%eax, %eax
-	ret
 
-	.p2align 4
-L(length_less16_offset0):
-	test	%edx, %edx
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	mov	%dl, %cl
-	pcmpeqb	(%rdi), %xmm1
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
 
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
+	addl	$(VEC_SIZE), %edx
+	jle	L(ret_vec_x2_test)
 
-	pmovmskb	%xmm1, %eax
+L(last_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
 
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
+	subl	$(VEC_SIZE), %edx
+	bsrl	%eax, %eax
+	jz	L(ret_3)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+L(ret_3):
 	ret
 
-	.p2align 4
-L(length_less16):
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	$16, %edx
-
-	pshufd	$0, %xmm1, %xmm1
-
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(length_less16_offset0)
-
-	mov	%cl, %dh
-	mov	%ecx, %esi
-	add	%dl, %dh
-	and	$-16, %rdi
-
-	sub	$16, %dh
-	ja	L(length_less16_part2)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-
-	sar	%cl, %eax
-	mov	%dl, %cl
-
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
-
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 6
+L(ret_vec_x2_test):
+	bsrl	%eax, %eax
+	jz	L(zero_2)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4
-L(length_less16_part2):
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	mov	%dh, %cl
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
+L(zero_2):
+	xorl	%eax, %eax
+	ret
 
-	test	%eax, %eax
-	jnz	L(length_less16_part2_return)
 
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
+	.p2align 4,, 5
+L(ret_vec_x2):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	mov	%esi, %ecx
-	sar	%cl, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	.p2align 4,, 5
+L(ret_vec_x3):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x3)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end,
+	   keeping the code from spilling to the next cache line.  */
+	addq	$(VEC_SIZE * 4 - 1), %rcx
+	andq	$-(VEC_SIZE * 4), %rcx
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
+	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
+	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
+	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm0, %xmm4
+
+	por	%xmm1, %xmm2
+	por	%xmm3, %xmm4
+	por	%xmm2, %xmm4
+
+	pmovmskb %xmm4, %esi
+	testl	%esi, %esi
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	%rdx, %rcx
+	jne	L(loop_4x_vec)
+
+	subl	%edi, %edx
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 2
+L(last_4x_vec):
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+	bsrl	%eax, %eax
+	jz	L(ret_4)
+	addl	%edx, %eax
+	jl	L(zero_3)
+	addq	%rdi, %rax
+L(ret_4):
 	ret
 
-	.p2align 4
-L(length_less16_part2_return):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 3
+L(loop_end):
+	pmovmskb %xmm1, %eax
+	sall	$16, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm3, %eax
+	/* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If eax is non-zero
+	   then there is a CHAR in VEC3 and bsrl will use that position.  */
+	sall	$16, %eax
+	orl	%esi, %eax
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
 	ret
 
-END (__memrchr)
+L(ret_vec_end):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
+	ret
+	/* Used in L(last_4x_vec). In the same cache line. These are just spare
+	   aligning bytes.  */
+L(zero_3):
+	xorl	%eax, %eax
+	ret
+	/* 2-bytes from next cache line.  */
+END(__memrchr)
 weak_alias (__memrchr, memrchr)
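
For reference, memrchr itself is a GNU extension declared in <string.h> when
_GNU_SOURCE is defined: it returns a pointer to the last occurrence of the
byte c within the first n bytes of s, or NULL if there is none. A minimal
usage example of the interface the routine above implements:

/* Build with e.g.: gcc -O2 example.c */
#define _GNU_SOURCE
#include <string.h>
#include <stdio.h>

int
main (void)
{
  const char buf[] = "abcabcabc";
  /* Search the 9 data bytes of buf; the last 'b' is at offset 7.  */
  const char *p = memrchr (buf, 'b', sizeof (buf) - 1);
  printf ("last 'b' at offset %td\n", p ? p - buf : -1);
  return 0;
}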