commit 83a986e9fbc301e6056dbc9d9ec6888621b60f67
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jun 6 21:11:31 2022 -0700

    x86: Optimize memrchr-evex.S

    The new code:
        1. prioritizes smaller user-arg lengths more.
        2. optimizes target placement more carefully.
        3. reuses logic more.
        4. fixes up various inefficiencies in the logic. The biggest
           case here is the `lzcnt` logic for checking returns, which
           saves either a branch or multiple instructions (sketched
           below).
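
    As an illustrative sketch of that `lzcnt` idea (not the committed
    code verbatim; register roles assumed: %rax = end pointer,
    %eax/%ecx = compare mask, %edx = length), the old code had to
    branch before `bsr` because `bsr` is undefined for a zero mask:

        testl   %eax, %eax
        jz      L(zero)           /* separate no-match branch */
        bsrl    %eax, %eax        /* index of highest match bit */
        addq    %rdi, %rax        /* pointer to the match */

    whereas `lzcnt` of a zero 32-bit mask is architecturally defined to
    be 32 (VEC_SIZE), so a single length compare rejects both "no
    match" and "match before the start of the buffer" at once:

        lzcntl  %ecx, %ecx        /* 32 if ecx == 0 */
        cmpl    %ecx, %edx
        jle     L(zero_0)         /* no match, or out of range */
        subq    %rcx, %rax        /* end ptr minus leading-zero count */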

    The total code size saving is 263 bytes.
    Geometric Mean of all benchmarks New / Old: 0.755

    Regressions:
    There are some regressions, particularly where the length (user-arg
    length) is large but the position of the match char is near the
    beginning of the string (in the first VEC). This case has roughly a
    20% regression.

    This is because the new logic gives the hot path for immediate
    matches to shorter lengths (the more common input). That
    short-length case sees roughly a 35% speedup.

    Full xcheck passes on x86_64.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit b4209615a06b01c974f47b4998b00e4c7b1aa5d9)
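
    Note on the end-pointer setup the new code relies on throughout (a
    worked sketch of the arithmetic, assuming VEC_SIZE == 32; the
    instructions are taken from the first hunk below). With bit i of
    the mask corresponding to the byte at buf + len - 32 + i:

        leaq    -1(%rdi, %rdx), %rax    /* rax = &buf[len - 1] */
        vpcmpb  $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
        kmovd   %k0, %ecx
        lzcntl  %ecx, %ecx              /* 31 - i for highest set bit i */
        subq    %rcx, %rax              /* len - 1 - (31 - i) = len - 32 + i */

    i.e. the minus-one in the lea is exactly what makes the lzcnt
    result directly subtractable.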
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 16bf8e02b1e80c84..bddc89c3754894ed 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -19,319 +19,316 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
+# ifndef MEMRCHR
+#  define MEMRCHR				__memrchr_evex
+# endif
+
+# define PAGE_SIZE			4096
+# define VECMATCH			VEC(0)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(MEMRCHR, 6)
+# ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+# else
+	test	%RDX_LP, %RDX_LP
+# endif
+	jz	L(zero_0)
+
+	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+	   correct page cross check and 2) it correctly sets up the end ptr so
+	   that the lzcnt result can be subtracted from it directly.  */
+	leaq	-1(%rdi, %rdx), %rax
+	vpbroadcastb %esi, %VECMATCH
+
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jz	L(page_cross)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+
+	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
+	cmpq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+
+	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
+	   which guarantees edx (len) is less than or equal to it.  */
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
+	ret
 
-# define VMOVA		vmovdqa64
-
-# define YMMMATCH	ymm16
-
-# define VEC_SIZE 32
-
-	.section .text.evex,"ax",@progbits
-ENTRY (__memrchr_evex)
-	/* Broadcast CHAR to YMMMATCH.  */
-	vpbroadcastb %esi, %YMMMATCH
-
-	sub	$VEC_SIZE, %RDX_LP
-	jbe	L(last_vec_or_less)
-
-	add	%RDX_LP, %RDI_LP
-
-	/* Check the last VEC_SIZE bytes.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	subq	$(VEC_SIZE * 4), %rdi
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(aligned_more)
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rdx
-	andq	$-VEC_SIZE, %rdi
-	subq	%rcx, %rdx
-
-	.p2align 4
-L(aligned_more):
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x0)
-
-	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
-	   There are some overlaps with above if data isn't aligned
-	   to 4 * VEC_SIZE.  */
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-	jz	L(loop_4x_vec)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rdi
-	subq	%rcx, %rdx
+	/* Fits in aligning bytes of first cache line.  */
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	subq	$(VEC_SIZE * 4), %rdi
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jz	L(loop_4x_vec)
-
-	/* There is a match.  */
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	kmovd	%k1, %eax
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 9
+L(ret_vec_x0_dec):
+	decq	%rax
+L(ret_vec_x0):
+	lzcntl	%ecx, %ecx
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_4x_vec_or_less):
-	addl	$(VEC_SIZE * 4), %edx
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
+	.p2align 4,, 10
+L(more_1x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
 
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
+	/* Align rax (pointer to string).  */
+	andq	$-VEC_SIZE, %rax
 
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
+	/* Recompute length after aligning.  */
+	movq	%rax, %rdx
 
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
-	cmpl	$(VEC_SIZE * 3), %edx
-	jbe	L(zero)
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 4), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addq	%rdi, %rax
-	ret
+	subq	%rdi, %rdx
 
-	.p2align 4
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
 L(last_2x_vec):
-	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_check)
+
+	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
+	decq	%rax
 	cmpl	$VEC_SIZE, %edx
-	jbe	L(zero)
-
-	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 2), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	jbe	L(ret_vec_x0_test)
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0)
+
+	/* Don't use rax for pointer here because EVEX has better encoding with
+	   offset % VEC_SIZE == 0.  */
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
+	lzcntq	%rcx, %rcx
+	cmpl	%ecx, %edx
+	jle	L(zero_0)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x0):
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	/* Inexpensive place to put this regarding code size / target alignments
+	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+	   in first cache line.  */
+L(page_cross):
+	movq	%rax, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vpcmpb	$0, (%rsi), %VECMATCH, %k0
+	kmovd	%k0, %r8d
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	movl	%eax, %ecx
+	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
+	notl	%ecx
+	shlxl	%ecx, %r8d, %ecx
+	cmpq	%rdi, %rsi
+	ja	L(more_1x_vec)
+	lzcntl	%ecx, %ecx
+	cmpl	%ecx, %edx
+	jle	L(zero_1)
+	subq	%rcx, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x1):
-	bsrl	%eax, %eax
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
+	/* Continue creating zero labels that fit in aligning bytes and get
+	   2-byte encoding / are in the same cache line as condition.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(last_vec_x2):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 2), %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x1):
+	/* This will naturally add 32 to position.  */
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x3):
-	bsrl	%eax, %eax
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	.p2align 4
-L(last_vec_x1_check):
-	bsrl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$VEC_SIZE, %eax
-	addq	%rdi, %rax
-	ret
+	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(last_vec_x3_check):
-	bsrl	%eax, %eax
-	subq	$VEC_SIZE, %rdx
-	addq	%rax, %rdx
-	jl	L(zero)
-	addl	$(VEC_SIZE * 3), %eax
-	addq	%rdi, %rax
-	ret
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	cmpl	$(VEC_SIZE * -1), %edx
+	jle	L(ret_vec_x2_test)
+L(last_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+
+
+	/* Need no matter what.  */
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 3 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less_aligned):
-	movl	%edx, %ecx
-
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-
-	movl	$1, %edx
-	/* Support rdx << 32.  */
-	salq	%cl, %rdx
-	subq	$1, %rdx
-
-	kmovd	%k1, %eax
-
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
-
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
+	.p2align 4,, 8
+L(ret_vec_x2_test):
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	ja	L(zero_1)
 	ret
 
-	.p2align 4
-L(last_vec_or_less):
-	addl	$VEC_SIZE, %edx
-
-	/* Check for zero length.  */
-	testl	%edx, %edx
-	jz	L(zero)
-
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	jz	L(last_vec_or_less_aligned)
-
-	movl	%ecx, %esi
-	movl	%ecx, %r8d
-	addl	%edx, %esi
-	andq	$-VEC_SIZE, %rdi
+	.p2align 4,, 8
+L(ret_vec_x2):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	subl	$VEC_SIZE, %esi
-	ja	L(last_vec_2x_aligned)
+	.p2align 4,, 8
+L(ret_vec_x3):
+	bsrl	%ecx, %ecx
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	/* Check the last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
 
-	/* Remove the leading and trailing bytes.  */
-	sarl	%cl, %eax
-	movl	%edx, %ecx
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
 
-	andl	%edx, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	/* Check if near end before re-aligning (otherwise might do an
+	   unnecessary loop iteration).  */
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
-	ret
+	decq	%rax
+	andq	$-(VEC_SIZE * 4), %rax
+	movq	%rdi, %rdx
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+	   lengths that overflow can be valid and break the comparison.  */
+	andq	$-(VEC_SIZE * 4), %rdx
 
 	.p2align 4
-L(last_vec_2x_aligned):
-	movl	%esi, %ecx
-
-	/* Check the last VEC.  */
-	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+L(loop_4x_vec):
+	/* Store 1 where not-equals and 0 where equals in k1 (used to mask later
+	   on).  */
+	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+
+	/* VEC(2/3) will have a zero byte where we found a CHAR.  */
+	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
+	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
+	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+
+	/* Combine VEC(2/3) with min and maskz with k1 (k1 has a zero bit where
+	   CHAR is found and VEC(2/3) have a zero byte where CHAR is found).  */
+	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
+	vptestnmb %VEC(3), %VEC(3), %k2
+
+	/* Any 1s and we found CHAR.  */
+	kortestd %k2, %k4
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rax
+	cmpq	%rdx, %rax
+	jne	L(loop_4x_vec)
+
+	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
+	subq	$-(VEC_SIZE * 4), %rdx
+	movq	%rdx, %rax
+	subl	%edi, %edx
+L(last_4x_vec):
+
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	movl	$1, %edx
-	sall	%cl, %edx
-	subl	$1, %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
 
-	kmovd	%k1, %eax
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_dec)
 
-	/* Remove the trailing bytes.  */
-	andl	%edx, %eax
 
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	/* Check the second last VEC.  */
-	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
 
-	movl	%r8d, %ecx
+	/* Used no matter what.  */
+	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+	kmovd	%k0, %ecx
 
-	kmovd	%k1, %eax
+	cmpl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
 
-	/* Remove the leading bytes.  Must use unsigned right shift for
-	   bsrl below.  */
-	shrl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(zero)
+	lzcntl	%ecx, %ecx
+	subq	$(VEC_SIZE * 2 + 1), %rax
+	subq	%rcx, %rax
+	cmpq	%rax, %rdi
+	jbe	L(ret_1)
+	xorl	%eax, %eax
+L(ret_1):
+	ret
 
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	addq	%r8, %rax
+	.p2align 4,, 6
+L(loop_end):
+	kmovd	%k1, %ecx
+	notl	%ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x0_end)
+
+	vptestnmb %VEC(2), %VEC(2), %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x1_end)
+
+	kmovd	%k2, %ecx
+	kmovd	%k4, %esi
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR is in VEC3 and bsrq will use that position.  */
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+	bsrq	%rcx, %rcx
+	addq	%rcx, %rax
+	ret
+	.p2align 4,, 4
+L(ret_vec_x0_end):
+	addq	$(VEC_SIZE), %rax
+L(ret_vec_x1_end):
+	bsrl	%ecx, %ecx
+	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
 	ret
-END (__memrchr_evex)
+
+END(MEMRCHR)
 #endif
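
A note on the L(loop_end) tail above (a sketch, not additional committed
code; assumed roles: %ecx holds the match mask of the higher-addressed
VEC, %esi the mask of the lower-addressed VEC, and %rax the base of that
2-VEC window):

	salq	$32, %rcx	/* higher VEC's mask into bits 32..63 */
	orq	%rsi, %rcx	/* lower VEC's mask into bits 0..31 */
	bsrq	%rcx, %rcx	/* bit index 0..63 of the last match */
	addq	%rcx, %rax	/* window base + byte offset of match */

Stacking the two 32-bit masks into one 64-bit word lets a single bsrq
pick the highest match across both vectors. The earlier kortestd (plus
the two returns taken before this point) guarantees at least one bit is
set in the combined word, so bsrq never sees a zero input.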