SOURCES/ia-opt-memcmp-evex-movbe-1.patch

From 2d8eaea7ad74328d806d3f1a377f1168eaa2f348 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 17 May 2021 13:57:24 -0400
Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S

No bug. This commit optimizes memcmp-evex.S. The optimizations include
adding a new vec compare path for small sizes, reorganizing the entry
control flow, removing some unnecessary ALU instructions from the main
loop, and most importantly replacing the heavy use of vpcmp + kand
logic with vpxor + vptern. test-memcmp and test-wmemcmp are both
passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 4ad473e97acdc5f6d811755b67c09f2128a644ce)
---
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
 1 file changed, 408 insertions(+), 302 deletions(-)

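Note on the main technique, for readers of this patch: the headline change
replaces the old reduction of four VPCMPEQ results via kandd mask logic
with a vpxorq + vpternlogd reduction. Each pair of source vectors is
XORed (all-zero lanes mean the bytes match), the four XOR results are
folded together with vpternlogd (imm 0xfe is a three-input OR; imm 0xde
XORs the last pair while ORing in the accumulated result), and a single
VPCMP against zero then yields one mask to test. A minimal stand-alone
sketch of that reduction follows; the function name and the fixed
128-byte size are illustrative only and not taken from the patch:

	.text
	.globl	diff_mask_4x_vec
	.type	diff_mask_4x_vec, @function
	/* Return nonzero in %eax iff the first 128 bytes at s1 (%rdi) and
	   s2 (%rsi) differ.  Requires AVX512BW + AVX512VL.  */
diff_mask_4x_vec:
	vpxorq	%xmm16, %xmm16, %xmm16	/* zero vector for the final compare */
	vmovdqu64	(%rsi), %ymm17
	vpxorq	(%rdi), %ymm17, %ymm17	/* nonzero lanes mark differences */
	vmovdqu64	32(%rsi), %ymm18
	vpxorq	32(%rdi), %ymm18, %ymm18
	vmovdqu64	64(%rsi), %ymm19
	vpxorq	64(%rdi), %ymm19, %ymm19
	/* 0xfe = A|B|C: ymm19 = ymm17 | ymm18 | ymm19.  */
	vpternlogd	$0xfe, %ymm17, %ymm18, %ymm19
	vmovdqu64	96(%rsi), %ymm20
	/* 0xde = (A^C)|B: XOR the last pair and OR in the accumulated
	   differences in one instruction.  */
	vpternlogd	$0xde, 96(%rdi), %ymm19, %ymm20
	vpcmpub	$4, %ymm20, %ymm16, %k1	/* 4 = not-equal, compared against zero */
	kmovd	%k1, %eax
	ret
	.size	diff_mask_4x_vec, .-diff_mask_4x_vec

Using only ymm16-ymm20 mirrors the patch: EVEX-only registers above
ymm15 leave the legacy SSE state untouched, so no vzeroupper is needed.
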
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 9c093972..654dc7ac 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -19,17 +19,22 @@
 #if IS_IN (libc)
 
 /* memcmp/wmemcmp is implemented as:
-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
-      to avoid branches.
-   2. Use overlapping compare to avoid branch.
-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
-      bytes for wmemcmp.
-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+   1. Use ymm vector compares when possible. The only case where
+      vector compares is not possible for when size < CHAR_PER_VEC
+      and loading from either s1 or s2 would cause a page cross.
+   2. For size from 2 to 7 bytes on page cross, load as big endian
+      with movbe and bswap to avoid branches.
+   3. Use xmm vector compare when size >= 4 bytes for memcmp or
+      size >= 8 bytes for wmemcmp.
+   4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
+      to check for early mismatches. Only do this if its guranteed the
+      work is not wasted.
+   5. If size is 8 * VEC_SIZE or less, unroll the loop.
+   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
       area.
-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
+   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
 
 # include <sysdep.h>
 
@@ -40,11 +45,21 @@
 # define VMOVU		vmovdqu64
 
 # ifdef USE_AS_WMEMCMP
-#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+#  define VPCMP	vpcmpd
 # else
-#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+#  define VPCMP	vpcmpub
 # endif
 
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define XMM2		xmm18
+# define YMM0		ymm16
 # define XMM1		xmm17
 # define XMM2		xmm18
 # define YMM1		ymm17
@@ -54,15 +69,6 @@
 # define YMM5		ymm21
 # define YMM6		ymm22
 
-# define VEC_SIZE 32
-# ifdef USE_AS_WMEMCMP
-#  define VEC_MASK 0xff
-#  define XMM_MASK 0xf
-# else
-#  define VEC_MASK 0xffffffff
-#  define XMM_MASK 0xffff
-# endif
-
 /* Warning!
            wmemcmp has to use SIGNED comparison for elements.
           memcmp has to use UNSIGNED comparison for elemnts.
@@ -70,145 +76,370 @@
 
 	.section .text.evex,"ax",@progbits
 ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
-	shl	$2, %RDX_LP
-# elif defined __ILP32__
+# ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
 # endif
-	cmp	$VEC_SIZE, %RDX_LP
+	cmp	$CHAR_PER_VEC, %RDX_LP
 	jb	L(less_vec)
 
 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k1
+	VMOVU	(%rsi), %YMM1
+	/* Use compare not equals to directly check for mismatch.  */
+	VPCMP	$4, (%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	cmpq	$(VEC_SIZE * 2), %rdx
-	jbe	L(last_vec)
-
-	/* More than 2 * VEC.  */
-	cmpq	$(VEC_SIZE * 8), %rdx
-	ja	L(more_8x_vec)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jb	L(last_4x_vec)
+	/* NB: eax must be destination register if going to
+	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   must be ecx.  */
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 
-	/* From 4 * VEC to 8 * VEC, inclusively. */
-	VMOVU	(%rsi), %YMM1
-	VPCMPEQ (%rdi), %YMM1, %k1
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(last_1x_vec)
 
+	/* Check second VEC no matter what.  */
 	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_2x_vec)
 
+	/* Check third and fourth VEC no matter what.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_3)
 
-	kandd	%k1, %k2, %k5
-	kandd	%k3, %k4, %k6
-	kandd	%k5, %k6, %k6
+	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
+	   compare with zero to get a mask is needed.  */
+	vpxorq	%XMM0, %XMM0, %XMM0
 
-	kmovd	%k6, %eax
-	cmpl	$VEC_MASK, %eax
-	jne	L(4x_vec_end)
+	/* Go to 4x VEC loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rdx
+	ja	L(more_8x_vec)
 
-	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
-	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %YMM1
-	VPCMPEQ (%rdi), %YMM1, %k1
+	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
+	   branches.  */
 
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
-	kandd	%k1, %k2, %k5
+	/* Load first two VEC from s2 before adjusting addresses.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+
+	/* Wait to load from s1 until addressed adjust due to
+	   unlamination of microfusion with complex address mode.  */
+
+	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
+	   will have some 1s.  */
+	vpxorq	(%rdi), %YMM1, %YMM1
+	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2
 
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
-	kandd	%k3, %k5, %k5
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
-	kandd	%k4, %k5, %k5
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+	   oring with YMM3. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret
 
-	kmovd	%k5, %eax
-	cmpl	$VEC_MASK, %eax
-	jne	L(4x_vec_end)
-	xorl	%eax, %eax
+	/* NB: aligning 32 here allows for the rest of the jump targets
+	   to be tuned for 32 byte alignment. Most important this ensures
+	   the L(more_8x_vec) loop is 32 byte aligned.  */
+	.p2align 5
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size = 0 but
+	   is also faster for size = CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Create mask in ecx for potentially in bound matches.  */
+	bzhil	%edx, %eax, %eax
+	jnz	L(return_vec_0)
 	ret
 
 	.p2align 4
-L(last_2x_vec):
-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
 
-L(last_vec):
-	/* Use overlapping loads to avoid branches.  */
-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
+	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
+	   which is good enough for a target not in a loop.  */
+L(return_vec_1):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
 	ret
 
-	.p2align 4
-L(first_vec):
-	/* A byte or int32 is different within 16 or 32 bytes.  */
-	tzcntl	%eax, %ecx
+	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
+	   which is good enough for a target not in a loop.  */
+L(return_vec_2):
+	tzcntl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(%rdi, %rcx, 4), %edx
-	cmpl	(%rsi, %rcx, 4), %edx
-L(wmemcmp_return):
-	setl	%al
-	negl	%eax
-	orl	$1, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
 # else
-	movzbl	(%rdi, %rcx), %eax
-	movzbl	(%rsi, %rcx), %edx
-	sub	%edx, %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
 # endif
 	ret
 
+	.p2align 4
+L(8x_return_vec_0_1_2_3):
+	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPCMP	$4, %YMM1, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+
+	VPCMP	$4, %YMM2, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	VPCMP	$4, %YMM3, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
 # ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
+	ret
+
 	.p2align 4
-L(4):
-	xorl	%eax, %eax
-	movl	(%rdi), %edx
-	cmpl	(%rsi), %edx
-	jne	L(wmemcmp_return)
+L(more_8x_vec):
+	/* Set end of s1 in rdx.  */
+	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
+	/* rsi stores s2 - s1. This allows loop to only update one
+	   pointer.  */
+	subq	%rdi, %rsi
+	/* Align s1 pointer.  */
+	andq	$-VEC_SIZE, %rdi
+	/* Adjust because first 4x vec where check already.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(loop_4x_vec):
+	VMOVU	(%rsi, %rdi), %YMM1
+	vpxorq	(%rdi), %YMM1, %YMM1
+
+	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
+	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
+
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdx, %rdi
+	jb	L(loop_4x_vec)
+
+	subq	%rdx, %rdi
+	/* rdi has 4 * VEC_SIZE - remaining length.  */
+	cmpl	$(VEC_SIZE * 3), %edi
+	jae	L(8x_last_1x_vec)
+	/* Load regardless of branch.  */
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
+	cmpl	$(VEC_SIZE * 2), %edi
+	jae	L(8x_last_2x_vec)
+
+	VMOVU	(%rsi, %rdx), %YMM1
+	vpxorq	(%rdx), %YMM1, %YMM1
+
+	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	/* Restore s1 pointer to rdi.  */
+	movq	%rdx, %rdi
+	testl	%ecx, %ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret
+
+	/* Only entry is from L(more_8x_vec).  */
+	.p2align 4
+L(8x_last_2x_vec):
+	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_2)
+	/* Naturally aligned to 16 bytes.  */
+L(8x_last_1x_vec):
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
+	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_3)
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	/* Check second to last VEC.  */
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1_end)
+
+	/* Check last VEC.  */
+	.p2align 4
+L(last_1x_vec):
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0_end)
 	ret
+
+	.p2align 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
 # else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
 	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.  */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	je	L(exit)
-	sbbl	%eax, %eax
-	orl	$1, %eax
+L(return_vec_0_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	-VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
 	ret
 
 	.p2align 4
-L(exit):
+L(return_vec_1_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
 	ret
 
+
 	.p2align 4
+L(page_cross_less_vec):
+	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+	   bytes.  */
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(between_16_31)
+# ifndef USE_AS_WMEMCMP
+	cmpl	$8, %edx
+	jae	L(between_8_15)
+	cmpl	$4, %edx
+	jae	L(between_4_7)
 L(between_2_3):
 	/* Load as big endian to avoid branches.  */
 	movzwl	(%rdi), %eax
@@ -217,224 +448,99 @@ L(between_2_3):
 	shll	$8, %ecx
 	bswap	%eax
 	bswap	%ecx
-	movb	-1(%rdi, %rdx), %al
-	movb	-1(%rsi, %rdx), %cl
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
 	/* Subtraction is okay because the upper 8 bits are zero.  */
 	subl	%ecx, %eax
 	ret
-
 	.p2align 4
-L(1):
-	movzbl	(%rdi), %eax
+L(one_or_less):
+	jb	L(zero)
 	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
 	subl	%ecx, %eax
 	ret
-# endif
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
 
 	.p2align 4
-L(less_vec):
-# ifdef USE_AS_WMEMCMP
-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
-	cmpb	$4, %dl
-	je	L(4)
-	jb	L(zero)
-# else
-	cmpb	$1, %dl
-	je	L(1)
-	jb	L(zero)
-	cmpb	$4, %dl
-	jb	L(between_2_3)
-	cmpb	$8, %dl
-	jb	L(between_4_7)
+L(between_8_15):
 # endif
-	cmpb	$16, %dl
-	jae	L(between_16_31)
-	/* It is between 8 and 15 bytes.  */
+	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 	vmovq	(%rdi), %XMM1
 	vmovq	(%rsi), %XMM2
-	VPCMPEQ %XMM1, %XMM2, %k2
-	kmovw	%k2, %eax
-	subl    $XMM_MASK, %eax
-	jnz	L(first_vec)
+	VPCMP	$4, %XMM1, %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 	/* Use overlapping loads to avoid branches.  */
-	leaq	-8(%rdi, %rdx), %rdi
-	leaq	-8(%rsi, %rdx), %rsi
+	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
 	vmovq	(%rdi), %XMM1
 	vmovq	(%rsi), %XMM2
-	VPCMPEQ %XMM1, %XMM2, %k2
-	kmovw	%k2, %eax
-	subl    $XMM_MASK, %eax
-	jnz	L(first_vec)
+	VPCMP	$4, %XMM1, %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 	ret
 
 	.p2align 4
-L(between_16_31):
-	/* From 16 to 31 bytes.  No branch when size == 16.  */
-	VMOVU	(%rsi), %XMM2
-	VPCMPEQ (%rdi), %XMM2, %k2
-	kmovw	%k2, %eax
-	subl    $XMM_MASK, %eax
-	jnz	L(first_vec)
-
-	/* Use overlapping loads to avoid branches.  */
-	leaq	-16(%rdi, %rdx), %rdi
-	leaq	-16(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %XMM2
-	VPCMPEQ (%rdi), %XMM2, %k2
-	kmovw	%k2, %eax
-	subl    $XMM_MASK, %eax
-	jnz	L(first_vec)
+L(zero):
+	xorl	%eax, %eax
 	ret
 
 	.p2align 4
-L(more_8x_vec):
-	/* More than 8 * VEC.  Check the first VEC.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	/* Align the first memory area for aligned loads in the loop.
-	   Compute how much the first memory area is misaligned.  */
-	movq	%rdi, %rcx
-	andl	$(VEC_SIZE - 1), %ecx
-	/* Get the negative of offset for alignment.  */
-	subq	$VEC_SIZE, %rcx
-	/* Adjust the second memory area.  */
-	subq	%rcx, %rsi
-	/* Adjust the first memory area which should be aligned now.  */
-	subq	%rcx, %rdi
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VMOVU	(%rsi), %YMM1
-	VPCMPEQ (%rdi), %YMM1, %k1
-
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
-	kandd	%k2, %k1, %k5
-
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
-	kandd	%k3, %k5, %k5
-
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
-	kandd	%k4, %k5, %k5
-
-	kmovd	%k5, %eax
-	cmpl	$VEC_MASK, %eax
-	jne	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-	addq	$(VEC_SIZE * 4), %rsi
-
-	subq	$(VEC_SIZE * 4), %rdx
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jae	L(loop_4x_vec)
-
-	/* Less than 4 * VEC.  */
-	cmpq	$VEC_SIZE, %rdx
-	jbe	L(last_vec)
-	cmpq	$(VEC_SIZE * 2), %rdx
-	jbe	L(last_2x_vec)
-
-L(last_4x_vec):
-	/* From 2 * VEC to 4 * VEC. */
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	VMOVU	(%rsi), %XMM2
+	VPCMP	$4, (%rdi), %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 
 	/* Use overlapping loads to avoid branches.  */
-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
 
-	addq	$VEC_SIZE, %rdi
-	addq	$VEC_SIZE, %rsi
-	VMOVU	(%rsi), %YMM2
-	VPCMPEQ (%rdi), %YMM2, %k2
-	kmovd	%k2, %eax
-	subl    $VEC_MASK, %eax
-	jnz	L(first_vec)
-	ret
-
-	.p2align 4
-L(4x_vec_end):
+	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
+	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
+	VPCMP	$4, (%rdi), %XMM2, %k1
 	kmovd	%k1, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec)
-	kmovd	%k2, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec_x1)
-	kmovd	%k3, %eax
-	subl	$VEC_MASK, %eax
-	jnz	L(first_vec_x2)
-	kmovd	%k4, %eax
-	subl	$VEC_MASK, %eax
-	tzcntl	%eax, %ecx
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 	ret
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %ecx
 # ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
-	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
+	.p2align 4
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
 	ret
+# else
 
 	.p2align 4
-L(first_vec_x2):
-	tzcntl	%eax, %ecx
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
-	jmp	L(wmemcmp_return)
-# else
-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
-	sub	%edx, %eax
-# endif
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	jz	L(zero_4_7)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+L(zero_4_7):
 	ret
+# endif
+
 END (MEMCMP)
 #endif
-- 
GitLab