commit 83a986e9fbc301e6056dbc9d9ec6888621b60f67
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:31 2022 -0700

    x86: Optimize memrchr-evex.S

    The new code:
        1. prioritizes smaller user-arg lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic. The biggest
           case here is the `lzcnt` logic for checking returns, which
           saves either a branch or multiple instructions (see the
           sketch below).
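
    As a quick illustration of point 4, the lzcnt-based return path
    (condensed from L(ret_vec_x0_test) in the patch below; match mask in
    ecx, length in edx, end pointer minus one in rax) handles the
    no-match case and computes the return value with a single lzcnt,
    where the old code needed a bsrl plus separate zero/bounds checks:

        lzcntl  %ecx, %ecx      /* 32 if there is no match in the VEC.  */
        cmpl    %ecx, %edx      /* len <= lzcnt means no match in range.  */
        jle     L(zero_0)
        subq    %rcx, %rax      /* rax holds endptr - 1.  */
        ret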

    The total code size saving is 263 bytes.
    Geometric mean of all benchmarks, New / Old: 0.755

    Regressions:
    There are some regressions, particularly where the length (user-arg
    length) is large but the position of the match char is near the
    beginning of the string (in the first VEC). This case has roughly a
    20% regression.

    This is because the new logic gives the hot path for immediate matches
    to shorter lengths (the more common input); that shorter-length case
    has roughly a 35% speedup.
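
    Condensed from the start of the patch below, the new entry does the
    first-VEC compare unconditionally and then falls through for
    len <= VEC_SIZE, so short lengths pay no taken branch, while longer
    lengths branch to L(more_1x_vec) and only then test the same mask:

        vpcmpb  $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
        kmovd   %k0, %ecx
        cmpq    $VEC_SIZE, %rdx
        ja      L(more_1x_vec)  /* len > VEC_SIZE takes this branch.  */
        /* len <= VEC_SIZE: fall through to the lzcnt return.  */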

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit b4209615a06b01c974f47b4998b00e4c7b1aa5d9)

diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 16bf8e02b1e80c84..bddc89c3754894ed 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -19,319 +19,316 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
+# ifndef MEMRCHR
+#  define MEMRCHR				__memrchr_evex
+# endif
+
+# define PAGE_SIZE			4096
+# define VECMATCH			VEC(0)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(MEMRCHR, 6)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
+
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up end ptr to be
+	   subtract by lzcnt aligned.  */
+	leaq	-1(%rdi, %rdx), %rax
+	vpbroadcastb %esi, %VECMATCH
+
+	/* Check if we can load 1x VEC without cross a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+
+	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+
+	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
+	   will guarantee edx (len) is less than it.  */
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-# define VMOVA		vmovdqa64
-
-# define YMMMATCH	ymm16
-
-# define VEC_SIZE 32
-
-	.section .text.evex,"ax",@progbits
-ENTRY (__memrchr_evex)
-	/* Broadcast CHAR to YMMMATCH.  */
-	vpbroadcastb %esi, %YMMMATCH
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
-
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
-
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	kmovd	%k1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0_dec):
+	decq	%rax
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
+	/* Align rax (pointer to string).  */
+	andq	$-VEC_SIZE, %rax
 
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
+	/* Recompute length after aligning.  */
+	movq	%rax, %rdx
 
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	ret
+	subq	%rdi, %rdx
 
-	.p2align 4
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
+
+	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
+	decq	%rax
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	kmovd	%k0, %r8d
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %ecx
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%ecx
+	shlxl	%ecx, %r8d, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_1)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
+	/* Continue creating zero labels that fit in aligning bytes and get
+	   2-byte encoding / are in the same cache line as condition.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
-
-	kmovd	%k1, %eax
-
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
-
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(zero)
-
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
-
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	/* Check the last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	ret
+	decq	%rax
+	andq	$-(VEC_SIZE * 4), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	andq	$-(VEC_SIZE * 4), %rdx
 
 	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
-
-	/* Check the last VEC.  */
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+L(loop_4x_vec):
+	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
+	   on).  */
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+
+	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
+	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+
+	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
+	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
+	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
+	vptestnmb %VEC(3), %VEC(3), %k2
+
+	/* Any 1s and we found CHAR.  */
+	kortestd %k2, %k4
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
+
+	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
+	subq	$-(VEC_SIZE * 4), %rdx
+	movq	%rdx, %rax
+	subl	%edi, %edx
+L(last_4x_vec):
+
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	kmovd	%k1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	movl	%r8d, %ecx
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	kmovd	%k1, %eax
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret_1)
+	xorl	%eax, %eax
+L(ret_1):
+	ret
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 6
+L(loop_end):
+	kmovd	%k1, %ecx
+	notl	%ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vptestnmb %VEC(2), %VEC(2), %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	kmovd	%k2, %ecx
+	kmovd	%k4, %esi
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	addq	%rcx, %rax
+	ret
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	addq	$(VEC_SIZE), %rax
+L(ret_vec_x1_end):
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
 	ret
-END (__memrchr_evex)
+
+END(MEMRCHR)
 #endif