SOURCES/ia-opt-memcmp-evex-movbe-1.patch

From 2d8eaea7ad74328d806d3f1a377f1168eaa2f348 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 17 May 2021 13:57:24 -0400
Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S

No bug. This commit optimizes memcmp-evex.S. The optimizations include
adding a new vec compare path for small sizes, reorganizing the entry
control flow, removing some unnecessary ALU instructions from the main
loop, and most importantly replacing the heavy use of vpcmp + kand
logic with vpxor + vptern. test-memcmp and test-wmemcmp are both
passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 4ad473e97acdc5f6d811755b67c09f2128a644ce)
---
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
 1 file changed, 408 insertions(+), 302 deletions(-)

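Note on the main technique, for readers of this patch: the headline change
replaces the old reduction of four VPCMPEQ results via kandd mask logic
with a vpxorq + vpternlogd reduction. Each pair of source vectors is
XORed (all-zero lanes mean the bytes match), the four XOR results are
folded together with vpternlogd (imm 0xfe is a three-input OR; imm 0xde
XORs the last pair while ORing in the accumulated result), and a single
VPCMP against zero then yields one mask to test. A minimal stand-alone
sketch of that reduction follows; the function name and the fixed
128-byte size are illustrative only and not taken from the patch:

	.text
	.globl	diff_mask_4x_vec
	.type	diff_mask_4x_vec, @function
	/* Return nonzero in %eax iff the first 128 bytes at s1 (%rdi) and
	   s2 (%rsi) differ.  Requires AVX512BW + AVX512VL.  */
diff_mask_4x_vec:
	vpxorq	%xmm16, %xmm16, %xmm16	/* zero vector for the final compare */
	vmovdqu64	(%rsi), %ymm17
	vpxorq	(%rdi), %ymm17, %ymm17	/* nonzero lanes mark differences */
	vmovdqu64	32(%rsi), %ymm18
	vpxorq	32(%rdi), %ymm18, %ymm18
	vmovdqu64	64(%rsi), %ymm19
	vpxorq	64(%rdi), %ymm19, %ymm19
	/* 0xfe = A|B|C: ymm19 = ymm17 | ymm18 | ymm19.  */
	vpternlogd	$0xfe, %ymm17, %ymm18, %ymm19
	vmovdqu64	96(%rsi), %ymm20
	/* 0xde = (A^C)|B: XOR the last pair and OR in the accumulated
	   differences in one instruction.  */
	vpternlogd	$0xde, 96(%rdi), %ymm19, %ymm20
	vpcmpub	$4, %ymm20, %ymm16, %k1	/* 4 = not-equal, compared against zero */
	kmovd	%k1, %eax
	ret
	.size	diff_mask_4x_vec, .-diff_mask_4x_vec

Using only ymm16-ymm20 mirrors the patch: EVEX-only registers above
ymm15 leave the legacy SSE state untouched, so no vzeroupper is needed.
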
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 9c093972..654dc7ac 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -19,17 +19,22 @@
 #if IS_IN (libc)
 
 /* memcmp/wmemcmp is implemented as:
-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
-      to avoid branches.
-   2. Use overlapping compare to avoid branch.
-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
-      bytes for wmemcmp.
-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+   1. Use ymm vector compares when possible. The only case where
+      vector compares is not possible for when size < CHAR_PER_VEC
+      and loading from either s1 or s2 would cause a page cross.
+   2. For size from 2 to 7 bytes on page cross, load as big endian
+      with movbe and bswap to avoid branches.
+   3. Use xmm vector compare when size >= 4 bytes for memcmp or
+      size >= 8 bytes for wmemcmp.
+   4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
+      to check for early mismatches. Only do this if its guranteed the
+      work is not wasted.
+   5. If size is 8 * VEC_SIZE or less, unroll the loop.
+   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
       area.
-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
+   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
 
 # include <sysdep.h>
 
@@ -40,11 +45,21 @@
 # define VMOVU		vmovdqu64
 
 # ifdef USE_AS_WMEMCMP
-#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+#  define VPCMP	vpcmpd
 # else
-#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+#  define VPCMP	vpcmpub
 # endif
 
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define XMM2		xmm18
+# define YMM0		ymm16
 # define XMM1		xmm17
 # define XMM2		xmm18
 # define YMM1		ymm17
@@ -54,15 +69,6 @@
 # define YMM5		ymm21
 # define YMM6		ymm22
 
-# define VEC_SIZE 32
-# ifdef USE_AS_WMEMCMP
-#  define VEC_MASK 0xff
-#  define XMM_MASK 0xf
-# else
-#  define VEC_MASK 0xffffffff
-#  define XMM_MASK 0xffff
-# endif
-
 /* Warning!
            wmemcmp has to use SIGNED comparison for elements.
           memcmp has to use UNSIGNED comparison for elemnts.
@@ -70,145 +76,370 @@
 
 	.section .text.evex,"ax",@progbits
 ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
-	shl	$2, %RDX_LP
-# elif defined __ILP32__
+# ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
 # endif
-	cmp	$VEC_SIZE, %RDX_LP
+	cmp	$CHAR_PER_VEC, %RDX_LP
 	jb	L(less_vec)
 
 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k1
+	VMOVU	(%rsi), %YMM1
+	/* Use compare not equals to directly check for mismatch.  */
+	VPCMP	$4, (%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	cmpq	$(VEC_SIZE * 2), %rdx
-	jbe	L(last_vec)
-
-	/* More than 2 * VEC.  */
-	cmpq	$(VEC_SIZE * 8), %rdx
-	ja	L(more_8x_vec)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jb	L(last_4x_vec)
+	/* NB: eax must be destination register if going to
+	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   must be ecx.  */
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 
-	/* From 4 * VEC to 8 * VEC, inclusively. */
-	VMOVU	(%rsi), %YMM1
-	VPCMPEQ (%rdi), %YMM1, %k1
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(last_1x_vec)
 
+	/* Check second VEC no matter what.  */
 	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_2x_vec)
 
+	/* Check third and fourth VEC no matter what.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_3)
 
-	kandd	%k1, %k2, %k5
-	kandd	%k3, %k4, %k6
-	kandd	%k5, %k6, %k6
+	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
+	   compare with zero to get a mask is needed.  */
+	vpxorq	%XMM0, %XMM0, %XMM0
 
-	kmovd	%k6, %eax
-	cmpl	$VEC_MASK, %eax
-	jne	L(4x_vec_end)
+	/* Go to 4x VEC loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rdx
+	ja	L(more_8x_vec)
 
-	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
-	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %YMM1
-	VPCMPEQ (%rdi), %YMM1, %k1
+	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
+	   branches.  */
 
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
-	kandd	%k1, %k2, %k5
+	/* Load first two VEC from s2 before adjusting addresses.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+
+	/* Wait to load from s1 until addressed adjust due to
+	   unlamination of microfusion with complex address mode.  */
+
+	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
+	   will have some 1s.  */
+	vpxorq	(%rdi), %YMM1, %YMM1
+	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2
 
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
-	kandd	%k3, %k5, %k5
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
-	kandd	%k4, %k5, %k5
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+	   oring with YMM3. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret
 
-	kmovd	%k5, %eax
-	cmpl	$VEC_MASK, %eax
-	jne	L(4x_vec_end)
-	xorl	%eax, %eax
+	/* NB: aligning 32 here allows for the rest of the jump targets
+	   to be tuned for 32 byte alignment. Most important this ensures
+	   the L(more_8x_vec) loop is 32 byte aligned.  */
+	.p2align 5
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size = 0 but
+	   is also faster for size = CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Create mask in ecx for potentially in bound matches.  */
+	bzhil	%edx, %eax, %eax
+	jnz	L(return_vec_0)
 	ret
 
 	.p2align 4
-L(last_2x_vec):
-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
 
-L(last_vec):
-	/* Use overlapping loads to avoid branches.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
+	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
+	   which is good enough for a target not in a loop.  */
+L(return_vec_1):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
 	ret
 
-	.p2align 4
-L(first_vec):
-	/* A byte or int32 is different within 16 or 32 bytes.  */
-	tzcntl	%eax, %ecx
+	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
+	   which is good enough for a target not in a loop.  */
+L(return_vec_2):
+	tzcntl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(%rdi, %rcx, 4), %edx
-	cmpl	(%rsi, %rcx, 4), %edx
-L(wmemcmp_return):
-	setl	%al
-	negl	%eax
-	orl	$1, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
 # else
-	movzbl	(%rdi, %rcx), %eax
-	movzbl	(%rsi, %rcx), %edx
-	sub	%edx, %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
 # endif
 	ret
 
+	.p2align 4
+L(8x_return_vec_0_1_2_3):
+	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPCMP	$4, %YMM1, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+
+	VPCMP	$4, %YMM2, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	VPCMP	$4, %YMM3, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
 # ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
+	ret
+
 	.p2align 4
-L(4):
-	xorl	%eax, %eax
-	movl	(%rdi), %edx
-	cmpl	(%rsi), %edx
-	jne	L(wmemcmp_return)
+L(more_8x_vec):
+	/* Set end of s1 in rdx.  */
+	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
+	/* rsi stores s2 - s1. This allows loop to only update one
+	   pointer.  */
+	subq	%rdi, %rsi
+	/* Align s1 pointer.  */
+	andq	$-VEC_SIZE, %rdi
+	/* Adjust because first 4x vec where check already.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(loop_4x_vec):
+	VMOVU	(%rsi, %rdi), %YMM1
+	vpxorq	(%rdi), %YMM1, %YMM1
+
+	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
+	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
+
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdx, %rdi
+	jb	L(loop_4x_vec)
+
+	subq	%rdx, %rdi
+	/* rdi has 4 * VEC_SIZE - remaining length.  */
+	cmpl	$(VEC_SIZE * 3), %edi
+	jae	L(8x_last_1x_vec)
+	/* Load regardless of branch.  */
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
+	cmpl	$(VEC_SIZE * 2), %edi
+	jae	L(8x_last_2x_vec)
+
+	VMOVU	(%rsi, %rdx), %YMM1
+	vpxorq	(%rdx), %YMM1, %YMM1
+
+	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	/* Restore s1 pointer to rdi.  */
+	movq	%rdx, %rdi
+	testl	%ecx, %ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret
+
+	/* Only entry is from L(more_8x_vec).  */
+	.p2align 4
+L(8x_last_2x_vec):
+	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_2)
+	/* Naturally aligned to 16 bytes.  */
+L(8x_last_1x_vec):
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
+	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_3)
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	/* Check second to last VEC.  */
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1_end)
+
+	/* Check last VEC.  */
+	.p2align 4
+L(last_1x_vec):
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0_end)
 	ret
+
+	.p2align 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
 # else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
 	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.  */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	je	L(exit)
-	sbbl	%eax, %eax
-	orl	$1, %eax
+L(return_vec_0_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	-VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
 	ret
 
 	.p2align 4
-L(exit):
+L(return_vec_1_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
 	ret
 
+
 	.p2align 4
+L(page_cross_less_vec):
+	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+	   bytes.  */
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(between_16_31)
+# ifndef USE_AS_WMEMCMP
+	cmpl	$8, %edx
+	jae	L(between_8_15)
+	cmpl	$4, %edx
+	jae	L(between_4_7)
 L(between_2_3):
 	/* Load as big endian to avoid branches.  */
 	movzwl	(%rdi), %eax
@@ -217,224 +448,99 @@ L(between_2_3):
 	shll	$8, %ecx
 	bswap	%eax
 	bswap	%ecx
-	movb	-1(%rdi, %rdx), %al
-	movb	-1(%rsi, %rdx), %cl
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
 	/* Subtraction is okay because the upper 8 bits are zero.  */
 	subl	%ecx, %eax
 	ret
-
 	.p2align 4
-L(1):
-	movzbl	(%rdi), %eax
+L(one_or_less):
+	jb	L(zero)
 	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
 	subl	%ecx, %eax
 	ret
-# endif
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
 
 	.p2align 4
-L(less_vec):
-# ifdef USE_AS_WMEMCMP
-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
-	cmpb	$4, %dl
-	je	L(4)
-	jb	L(zero)
-# else
-	cmpb	$1, %dl
-	je	L(1)
-	jb	L(zero)
-	cmpb	$4, %dl
-	jb	L(between_2_3)
-	cmpb	$8, %dl
-	jb	L(between_4_7)
+L(between_8_15):
 # endif
-	cmpb	$16, %dl
-	jae	L(between_16_31)
-	/* It is between 8 and 15 bytes.  */
+	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 	vmovq	(%rdi), %XMM1
 	vmovq	(%rsi), %XMM2
-	VPCMPEQ %XMM1, %XMM2, %k2
-	kmovw	%k2, %eax
-	subl    $XMM_MASK, %eax
-	jnz	L(first_vec)
+	VPCMP	$4, %XMM1, %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 	/* Use overlapping loads to avoid branches.  */
-	leaq	-8(%rdi, %rdx), %rdi
-	leaq	-8(%rsi, %rdx), %rsi
+	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
 	vmovq	(%rdi), %XMM1
 	vmovq	(%rsi), %XMM2
-	VPCMPEQ %XMM1, %XMM2, %k2
-	kmovw	%k2, %eax
-	subl    $XMM_MASK, %eax
-	jnz	L(first_vec)
+	VPCMP	$4, %XMM1, %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 	ret
 
 	.p2align 4
-L(between_16_31):
-	/* From 16 to 31 bytes.  No branch when size == 16.  */
-	VMOVU	(%rsi), %XMM2
-	VPCMPEQ (%rdi), %XMM2, %k2
-	kmovw	%k2, %eax
-	subl    $XMM_MASK, %eax
-	jnz	L(first_vec)
-
-	/* Use overlapping loads to avoid branches.  */
-	leaq	-16(%rdi, %rdx), %rdi
-	leaq	-16(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %XMM2
-	VPCMPEQ (%rdi), %XMM2, %k2
-	kmovw	%k2, %eax
-	subl    $XMM_MASK, %eax
-	jnz	L(first_vec)
+L(zero):
+	xorl	%eax, %eax
 	ret
 
 	.p2align 4
-L(more_8x_vec):
-	/* More than 8 * VEC.  Check the first VEC.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	/* Align the first memory area for aligned loads in the loop.
-	   Compute how much the first memory area is misaligned.  */
-	movq	%rdi, %rcx
-	andl	$(VEC_SIZE - 1), %ecx
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %rcx
-	/* Adjust the second memory area.  */
-	subq	%rcx, %rsi
-	/* Adjust the first memory area which should be aligned now.  */
-	subq	%rcx, %rdi
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VMOVU	(%rsi), %YMM1
-	VPCMPEQ (%rdi), %YMM1, %k1
-
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
-	kandd	%k2, %k1, %k5
-
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
-	kandd	%k3, %k5, %k5
-
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
-	kandd	%k4, %k5, %k5
-
-	kmovd	%k5, %eax
-	cmpl	$VEC_MASK, %eax
-	jne	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rsi
-
-	subq	$(VEC_SIZE * 4), %rdx
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jae	L(loop_4x_vec)
-
-	/* Less than 4 * VEC.  */
-	cmpq	$VEC_SIZE, %rdx
-	jbe	L(last_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
-	jbe	L(last_2x_vec)
-
-L(last_4x_vec):
-	/* From 2 * VEC to 4 * VEC. */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	VMOVU	(%rsi), %XMM2
+	VPCMP	$4, (%rdi), %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 
 	/* Use overlapping loads to avoid branches.  */
-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
 
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-	ret
-
-	.p2align 4
-L(4x_vec_end):
+	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
+	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
+	VPCMP	$4, (%rdi), %XMM2, %k1
 	kmovd	%k1, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-	kmovd	%k2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec_x1)
-	kmovd	%k3, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec_x2)
-	kmovd	%k4, %eax
-	subl	$VEC_MASK, %eax
-	tzcntl	%eax, %ecx
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 	ret
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %ecx
 # ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
-	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
+	.p2align 4
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
 	ret
+# else
 
 	.p2align 4
-L(first_vec_x2):
-	tzcntl	%eax, %ecx
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	jz	L(zero_4_7)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+L(zero_4_7):
 	ret
+# endif
+
 END (MEMCMP)
 #endif
-- 
GitLab