SOURCES/ia-opt-memcmp-evex-movbe-2.patch

From 851ab0499680a3369da724d3d6d2ba71652d530d Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 21 Sep 2021 18:45:03 -0500
Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and
 size

No bug.

The frontend optimizations are to:
1. Reorganize logically connected basic blocks so they are either in
   the same cache line or in adjacent cache lines.
2. Avoid cases where basic blocks unnecessarily cross cache lines.
3. Try to 32-byte align any basic blocks possible without sacrificing
   code size. Smaller / less hot basic blocks are used for this.

Overall code size shrunk by 168 bytes. This should make up for any
extra costs due to aligning to 64 bytes.

In general, performance previously varied a great deal depending on
whether entry alignment % 64 was 0, 16, 32, or 48. These changes
essentially make the current implementation at least equal to the
best alignment of the original for any arguments.

The only additional optimization is in the page cross case: the branch
on the equals case was removed from the size == [4, 7] case. As well,
the [4, 7] and [2, 3] cases were swapped, as [4, 7] is likely a hotter
argument size.

test-memcmp and test-wmemcmp are both passing.

(cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)
---
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++--------
 1 file changed, 242 insertions(+), 192 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 654dc7ac..2761b54f 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -34,7 +34,24 @@
       area.
    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
-   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
+
+When possible the implementation tries to optimize for frontend in the
+following ways:
+Throughput:
+    1. All code sections that fit are able to run optimally out of the
+       LSD.
+    2. All code sections that fit are able to run optimally out of the
+       DSB
+    3. Basic blocks are contained in minimum number of fetch blocks
+       necessary.
+
+Latency:
+    1. Logically connected basic blocks are put in the same
+       cache-line.
+    2. Logically connected basic blocks that do not fit in the same
+       cache-line are put in adjacent lines. This can get beneficial
+       L2 spatial prefetching and L1 next-line prefetching.  */
 
 # include <sysdep.h>
 
@@ -47,9 +64,11 @@
 # ifdef USE_AS_WMEMCMP
 #  define CHAR_SIZE	4
 #  define VPCMP	vpcmpd
+#  define VPTEST	vptestmd
 # else
 #  define CHAR_SIZE	1
 #  define VPCMP	vpcmpub
+#  define VPTEST	vptestmb
 # endif
 
 # define VEC_SIZE	32
@@ -75,7 +94,9 @@
 */
 
 	.section .text.evex,"ax",@progbits
-ENTRY (MEMCMP)
+/* Cache align memcmp entry. This allows for much more thorough
+   frontend optimization.  */
+ENTRY_P2ALIGN (MEMCMP, 6)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
 	VPCMP	$4, (%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 	/* NB: eax must be destination register if going to
-	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   L(return_vec_[0,2]). For L(return_vec_3) destination register
 	   must be ecx.  */
 	testl	%eax, %eax
 	jnz	L(return_vec_0)
@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
 	testl	%ecx, %ecx
 	jnz	L(return_vec_3)
 
-	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
-	   compare with zero to get a mask is needed.  */
-	vpxorq	%XMM0, %XMM0, %XMM0
-
 	/* Go to 4x VEC loop.  */
 	cmpq	$(CHAR_PER_VEC * 8), %rdx
 	ja	L(more_8x_vec)
@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
 
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
-	   oring with YMM3. Result is stored in YMM4.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	   oring with YMM1. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+
+	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+
+	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
+	 */
+	VPTEST	%YMM4, %YMM4, %k1
+	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
-	/* NB: aligning 32 here allows for the rest of the jump targets
-	   to be tuned for 32 byte alignment. Most important this ensures
-	   the L(more_8x_vec) loop is 32 byte aligned.  */
-	.p2align 5
-L(less_vec):
-	/* Check if one or less CHAR. This is necessary for size = 0 but
-	   is also faster for size = CHAR_SIZE.  */
-	cmpl	$1, %edx
-	jbe	L(one_or_less)
+	.p2align 4
+L(8x_end_return_vec_0_1_2_3):
+	movq	%rdx, %rdi
+L(8x_return_vec_0_1_2_3):
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPTEST	%YMM1, %YMM1, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 
-	/* Check if loading one VEC from either s1 or s2 could cause a
-	   page cross. This can have false positives but is by far the
-	   fastest method.  */
-	movl	%edi, %eax
-	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(page_cross_less_vec)
+	VPTEST	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
 
-	/* No page cross possible.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMP	$4, (%rdi), %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Create mask in ecx for potentially in bound matches.  */
-	bzhil	%edx, %eax, %eax
-	jnz	L(return_vec_0)
+	VPTEST	%YMM3, %YMM3, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	/* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
+	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
+	   line.  */
+	bsfl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
 	ret
 
 	.p2align 4
@@ -209,10 +240,11 @@ L(return_vec_0):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
-	   which is good enough for a target not in a loop.  */
+	.p2align 4
 L(return_vec_1):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -226,10 +258,11 @@ L(return_vec_1):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
-	   which is good enough for a target not in a loop.  */
+	.p2align 4,, 10
 L(return_vec_2):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -243,40 +276,6 @@ L(return_vec_2):
 # endif
 	ret
 
-	.p2align 4
-L(8x_return_vec_0_1_2_3):
-	/* Returning from L(more_8x_vec) requires restoring rsi.  */
-	addq	%rdi, %rsi
-L(return_vec_0_1_2_3):
-	VPCMP	$4, %YMM1, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
-	VPCMP	$4, %YMM2, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_1)
-
-	VPCMP	$4, %YMM3, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_2)
-L(return_vec_3):
-	tzcntl	%ecx, %ecx
-# ifdef USE_AS_WMEMCMP
-	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
-	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
-	subl	%ecx, %eax
-# endif
-	ret
-
 	.p2align 4
 L(more_8x_vec):
 	/* Set end of s1 in rdx.  */
@@ -288,21 +287,19 @@ L(more_8x_vec):
 	andq	$-VEC_SIZE, %rdi
 	/* Adjust because first 4x vec where check already.  */
 	subq	$-(VEC_SIZE * 4), %rdi
+
 	.p2align 4
 L(loop_4x_vec):
 	VMOVU	(%rsi, %rdi), %YMM1
 	vpxorq	(%rdi), %YMM1, %YMM1
-
 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
 	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
-
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(8x_return_vec_0_1_2_3)
@@ -319,28 +316,25 @@ L(loop_4x_vec):
 	cmpl	$(VEC_SIZE * 2), %edi
 	jae	L(8x_last_2x_vec)
 
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+
 	VMOVU	(%rsi, %rdx), %YMM1
 	vpxorq	(%rdx), %YMM1, %YMM1
 
 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
-
-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
-	/* Restore s1 pointer to rdi.  */
-	movq	%rdx, %rdi
 	testl	%ecx, %ecx
-	jnz	L(8x_return_vec_0_1_2_3)
+	jnz	L(8x_end_return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
 	/* Only entry is from L(more_8x_vec).  */
-	.p2align 4
+	.p2align 4,, 10
 L(8x_last_2x_vec):
 	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
 	kmovd	%k1, %eax
@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
 	jnz	L(8x_return_vec_3)
 	ret
 
-	.p2align 4
+	/* Not ideally aligned (at offset +9 bytes in fetch block) but
+	   not aligning keeps it in the same cache line as
+	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
+	   size.  */
+	.p2align 4,, 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 10
 L(last_2x_vec):
 	/* Check second to last VEC.  */
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
@@ -374,26 +392,49 @@ L(last_1x_vec):
 	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
-L(8x_return_vec_2):
-	subq	$VEC_SIZE, %rdx
-L(8x_return_vec_3):
-	tzcntl	%eax, %eax
+	.p2align 4,, 10
+L(return_vec_1_end):
+	/* Use bsf to save code size. This is necessary to have
+	   L(one_or_less) fit in aligning bytes between.  */
+	bsfl	%eax, %eax
+	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
-	movl	(VEC_SIZE * 3)(%rax), %ecx
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	addq	%rdx, %rax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
-	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
+	/* NB: L(one_or_less) fits in alignment padding between
+	   L(return_vec_1_end) and L(return_vec_0_end).  */
+# ifdef USE_AS_WMEMCMP
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	ret
+# else
+L(one_or_less):
+	jb	L(zero)
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+	subl	%ecx, %eax
+	ret
+# endif
+L(zero):
+	xorl	%eax, %eax
+	ret
+
 	.p2align 4
 L(return_vec_0_end):
 	tzcntl	%eax, %eax
@@ -412,23 +453,56 @@ L(return_vec_0_end):
 	ret
 
 	.p2align 4
-L(return_vec_1_end):
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size == 0
+	   but is also faster for size == CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check if any matches where in bounds. Intentionally not
+	   storing result in eax to limit dependency chain if it goes to
+	   L(return_vec_0_lv).  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0_lv)
+	xorl	%eax, %eax
+	ret
+
+	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+	   the jump and ends up fitting in aligning bytes. As well fits on
+	   same cache line as L(less_vec) so also saves a line from having
+	   to be fetched on cold calls to memcmp.  */
+	.p2align 4,, 4
+L(return_vec_0_lv):
 	tzcntl	%eax, %eax
-	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
-	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
-
 	.p2align 4
 L(page_cross_less_vec):
 	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
@@ -439,108 +513,84 @@ L(page_cross_less_vec):
 	cmpl	$8, %edx
 	jae	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
-L(between_2_3):
-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero.  */
-	subl	%ecx, %eax
-	ret
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
+	jb	L(between_2_3)
+
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* edx is guranteed to be positive int32 in range [4, 7].  */
+	cmovne	%edx, %eax
+	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+	sbbl	%ecx, %ecx
+	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
+	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
+	   eax doesn't matter.  */
+	orl	%ecx, %eax
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(between_8_15):
 # endif
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
-	VMOVU	(%rsi), %XMM2
-	VPCMP	$4, (%rdi), %XMM2, %k1
+
+	/* Use movups to save code size.  */
+	movups	(%rsi), %xmm2
+	VPCMP	$4, (%rdi), %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-
-	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
-	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
-	VPCMP	$4, (%rdi), %XMM2, %k1
+	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-# ifdef USE_AS_WMEMCMP
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movl	(%rdi), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi), %ecx
-	je	L(zero)
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
+	jnz	L(return_vec_0_end)
 	ret
-# else
 
-	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	jz	L(zero_4_7)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-L(zero_4_7):
+# ifndef USE_AS_WMEMCMP
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
 	ret
 # endif
-
 END (MEMCMP)
 #endif
-- 
GitLab
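
For illustration only (not part of the patch above): the reordered 4x VEC reduction in L(loop_4x_vec) -- three vpxorq results folded by two vpternlogd instructions and checked with vptestm -- corresponds roughly to the following AVX-512VL/BW intrinsics sketch. The helper name and the flat s1/s2 addressing are illustrative, not taken from glibc.

/* Rough sketch of the 4x VEC (4 * 32 byte) mismatch check.
   Build with e.g. gcc -mavx512vl -mavx512bw.  */
#include <immintrin.h>

static __mmask32
any_mismatch_4x_vec (const char *s1, const char *s2)
{
  __m256i x0 = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) s1),
                                 _mm256_loadu_si256 ((const __m256i *) s2));
  __m256i x1 = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) (s1 + 32)),
                                 _mm256_loadu_si256 ((const __m256i *) (s2 + 32)));
  __m256i x2 = _mm256_xor_si256 (_mm256_loadu_si256 ((const __m256i *) (s1 + 64)),
                                 _mm256_loadu_si256 ((const __m256i *) (s2 + 64)));
  /* vpternlogd $0xde computes (a ^ c) | b, folding the last xor and the
     first or into one instruction.  */
  __m256i r = _mm256_ternarylogic_epi32 (_mm256_loadu_si256 ((const __m256i *) (s2 + 96)),
                                         x0,
                                         _mm256_loadu_si256 ((const __m256i *) (s1 + 96)),
                                         0xde);
  /* vpternlogd $0xfe computes a | b | c, i.e. r |= x1 | x2.  */
  r = _mm256_ternarylogic_epi32 (r, x1, x2, 0xfe);
  /* vptestmb: one mask bit per non-zero byte, i.e. per mismatching byte
     anywhere in the 128-byte block.  */
  return _mm256_test_epi8_mask (r, r);
}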
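
Likewise, the branchless page-cross compare for size == [4, 7] built from overlapping movbe loads can be sketched in C. This mirrors only the big-endian ordering idea; the patch derives the return value with cmovne/sbbl/orl rather than an explicit three-way compare, and the names below are illustrative.

/* Rough C sketch of the overlapping big-endian compare for 4 <= n <= 7.  */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t
load32_be (const unsigned char *p)
{
  uint32_t v;
  memcpy (&v, p, 4);
  /* movbe performs the load and the byte swap in a single instruction.  */
  return __builtin_bswap32 (v);
}

static int
cmp_4_to_7 (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  /* Head bytes in the high half, (overlapping) tail bytes in the low half:
     the first differing byte decides the ordering with no length branch.  */
  uint64_t a = ((uint64_t) load32_be (s1) << 32) | load32_be (s1 + n - 4);
  uint64_t b = ((uint64_t) load32_be (s2) << 32) | load32_be (s2 + n - 4);
  return (a > b) - (a < b);
}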