commit b05bd59823bcedee281d3fd5bd4928698ea9d69d
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:32 2022 -0700

    x86: Optimize memrchr-avx2.S

    The new code:
        1. prioritizes smaller user-arg lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic. The biggest
           win here is the `lzcnt` logic for checking returns, which
           saves either a branch or multiple instructions (see the
           sketch after this message).

    The total code size saving is: 306 bytes
    Geometric Mean of all benchmarks New / Old: 0.760

    Regressions:
    There are some regressions, particularly where the length (user-arg
    length) is large but the position of the match char is near the
    beginning of the string (in the first VEC). This case has roughly a
    10-20% regression.

    This is because the new logic gives the hot path for immediate matches
    to shorter lengths (the more common input); that case has roughly
    a 15-45% speedup.

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit af5306a735eb0966fdc2f8ccdafa8888e2df0c87)
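
For illustration, a minimal C sketch of the `lzcnt` return trick from item 4
above (hypothetical helper, not code from the patch; it assumes this path only
handles len <= 32, as the VEC_SIZE fast path at L(ret_vec_x0_test) does):

    #include <stddef.h>
    #include <stdint.h>

    /* 'end' points at the last byte of the range, 'mask' is a 32-bit match
       mask with bit 31 = byte at 'end', bit 30 = 'end - 1', and so on;
       'len' is the user length (<= 32 on this path).  */
    static void *
    ret_from_mask (unsigned char *end, uint32_t mask, size_t len)
    {
      /* The leading-zero count of the mask is the distance from 'end' back
         to the last match.  A zero mask gives 32, which is >= any len
         handled here, so a single compare rejects both "no match" and
         "match before the start of the buffer" without a separate
         test/branch.  */
      unsigned int dist = mask ? (unsigned int) __builtin_clz (mask) : 32;
      if (len <= dist)
        return NULL;
      return end - dist;
    }

The hardware lzcnt instruction itself returns 32 for a zero input, so the
assembly needs neither the ternary nor a separate zero test.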

diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
index cea2d2a72db7406a..5e9beeeef2677c9f 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -2,6 +2,7 @@
 # define MEMRCHR __memrchr_avx2_rtm
 #endif
 
+#define COND_VZEROUPPER	COND_VZEROUPPER_XTEST
 #define ZERO_UPPER_VEC_REGISTERS_RETURN \
   ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index ac7370cb06e9a0fd..5f8e0be18cfe4fad 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -21,340 +21,318 @@
 # include <sysdep.h>
 
 # ifndef MEMRCHR
-#  define MEMRCHR	__memrchr_avx2
+#  define MEMRCHR				__memrchr_avx2
 # endif
 
 # ifndef VZEROUPPER
-#  define VZEROUPPER	vzeroupper
+#  define VZEROUPPER			vzeroupper
 # endif
 
 # ifndef SECTION
 #  define SECTION(p)	p##.avx
 # endif
 
-# define VEC_SIZE 32
+# define VEC_SIZE			32
+# define PAGE_SIZE			4096
+	.section SECTION(.text), "ax", @progbits
+ENTRY(MEMRCHR)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMRCHR)
-	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
-	vpbroadcastb %xmm0, %ymm0
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
-
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up the end ptr so
+	   it can be subtracted by the lzcnt result.  */
+	leaq	-1(%rdx, %rdi), %rax
 
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
+	vpbroadcastb %xmm0, %ymm0
 
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+
+L(ret_vec_x0_test):
+	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
+	   which will guarantee edx (len) is less than or equal to it.  */
+	lzcntl	%ecx, %ecx
+
+	/* Hoist vzeroupper (not great for RTM) to save code size. This allows
+	   all logic for edx (len) <= VEC_SIZE to fit in first cache line.  */
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vmovdqa	(%rdi), %ymm1
-	vmovdqa	VEC_SIZE(%rdi), %ymm2
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
-
-	vpcmpeqb %ymm1, %ymm0, %ymm1
-	vpcmpeqb %ymm2, %ymm0, %ymm2
-	vpcmpeqb %ymm3, %ymm0, %ymm3
-	vpcmpeqb %ymm4, %ymm0, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpmovmskb %ymm1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
-
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
-
-	vpcmpeqb (%rdi), %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Align rax (string pointer).  */
+	andq	$-VEC_SIZE, %rax
+
+	/* Recompute remaining length after aligning.  */
+	movq	%rax, %rdx
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
+	subq	%rdi, %rdx
+	decq	%rax
+	vpmovmskb %ymm1, %ecx
+	/* Fall through for short (hotter than length).  */
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpeqb (%rsi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %r8d
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%r8d
+	shlxl	%r8d, %ecx, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	COND_VZEROUPPER
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
+	.p2align 4,, 11
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
+	.p2align 4,, 10
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+	/* Needed no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	.p2align 4
-L(null):
+	/* First in aligning bytes.  */
+L(zero_2):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
+	.p2align 4,, 4
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_2)
+	ret
 
-	vpcmpeqb (%rdi), %ymm0, %ymm1
 
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
+	.p2align 4,, 11
+L(ret_vec_x2):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4,, 14
+L(ret_vec_x3):
+	/* ecx must be non-zero.  */
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(null)
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	/* Align rax to (VEC_SIZE - 1).  */
+	orq	$(VEC_SIZE * 4 - 1), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	orq	$(VEC_SIZE * 4 - 1), %rdx
 
-	/* Check the last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	.p2align 4
+L(loop_4x_vec):
+	/* Need this comparison next no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
+	vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	vpor	%ymm1, %ymm2, %ymm2
+	vpor	%ymm3, %ymm4, %ymm4
+	vpor	%ymm2, %ymm4, %ymm4
+	vpmovmskb %ymm4, %esi
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	testl	%esi, %esi
+	jnz	L(loop_end)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	VZEROUPPER_RETURN
+	addq	$(VEC_SIZE * -4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
 
-	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
+	subl	%edi, %edx
+	incl	%edx
 
-	/* Check the last VEC.  */
-	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+L(last_4x_vec):
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	vpmovmskb %ymm1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
+	vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	/* Used no matter what.  */
+	vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+	vpmovmskb %ymm1, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpeqb (%rdi), %ymm0, %ymm1
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2), %rax
+	COND_VZEROUPPER
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret0)
+	xorl	%eax, %eax
+L(ret0):
+	ret
 
-	movl	%r8d, %ecx
 
-	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(loop_end):
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vpmovmskb %ymm2, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	vpmovmskb %ymm3, %ecx
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	leaq	(VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+	VZEROUPPER_RETURN
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	.p2align 4,, 4
+L(ret_vec_x1_end):
+	/* 64-bit version will automatically add 32 (VEC_SIZE).  */
+	lzcntq	%rcx, %rcx
+	subq	%rcx, %rax
+	VZEROUPPER_RETURN
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	VZEROUPPER_RETURN
-END (MEMRCHR)
+
+	/* 2 bytes until next cache line.  */
+END(MEMRCHR)
 #endif
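
For illustration, a C sketch of the masking step in the L(page_cross) path of
the new memrchr-avx2.S above (hypothetical helper, not code from the patch).
The aligned load there may pick up bytes above end - 1; shifting the match
mask left by the end pointer's negative alignment discards those bits, after
which the usual lzcnt/compare sequence applies:

    #include <stdint.h>

    /* 'mask' is the 32-bit match mask of the aligned 32-byte block holding
       the last byte (bit i = byte base + i, with base = endm1 & -32);
       'endm1' is end - 1, which the patch keeps in rax.  */
    static uint32_t
    drop_bytes_past_end (uint32_t mask, uintptr_t endm1)
    {
      /* ~endm1 == -(endm1 + 1) == -end.  Shift counts act mod 32, so this
         shifts left by 31 - (endm1 % 32): the bit for the byte at endm1
         lands in bit 31 and bits for bytes past it fall off, mirroring the
         notl + shlxl pair in L(page_cross).  */
      unsigned int shift = (unsigned int) ~endm1 & 31;
      return mask << shift;
    }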