commit 5ec3416853c4150c4d13312e05f93a053586d528
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Tue Sep 21 18:45:03 2021 -0500

    x86: Optimize memcmp-evex-movbe.S for frontend behavior and size

    No bug.

    The frontend optimizations are to:
    1. Reorganize logically connected basic blocks so they are either in
       the same cache line or in adjacent cache lines.
    2. Avoid cases where basic blocks unnecessarily cross cache lines.
    3. Try to 32-byte align any basic blocks possible without sacrificing
       code size. Smaller / less hot basic blocks are used for this (see
       the sketch after this list).
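
    As an illustration of point 3, a minimal sketch of the alignment
    directives this patch leans on (the labels below are made up; only
    the ENTRY_P2ALIGN / .p2align usage mirrors the patch):

	.text
	/* Align the function entry to a 64-byte cache line; in the
	   patch this is done with ENTRY_P2ALIGN (MEMCMP, 6).  */
	.p2align 6
	example_entry:
		xorl	%eax, %eax	/* Placeholder for the hot path.  */
		ret

	/* Align a colder block to 16 bytes only if that costs at most
	   10 bytes of padding; otherwise leave it packed so it stays
	   in the same cache line as its neighbors.  */
	.p2align 4,, 10
	example_cold_block:
		movl	$1, %eax	/* Placeholder.  */
		ret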

    Overall code size shrank by 168 bytes. This should make up for any
    extra costs due to aligning to 64 bytes.

    In general, performance before this patch deviated a great deal
    depending on whether entry alignment % 64 was 0, 16, 32, or 48.
    These changes essentially make it so that the current implementation
    is at least equal to the best alignment of the original for any
    arguments.

    The only additional optimization is in the page cross case: the
    branch on the equals case was removed from the size == [4, 7] case.
    As well, the [4, 7] and [2, 3] cases were swapped, as [4, 7] is
    likely the hotter argument size (see the sketch after this
    paragraph).
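
    Roughly, the new [4, 7] byte path looks like the sketch below (a
    condensed rendition of the code added further down in this patch;
    rdi = s1, rsi = s2, rdx = len as in memcmp, and the label name is
    made up):

	example_between_4_7:
		/* Build a big-endian 64-bit value for each buffer from
		   two overlapping 4-byte movbe loads: bytes [0, 3] in the
		   high half, bytes [len - 4, len - 1] in the low half.  */
		movbe	(%rdi), %eax
		movbe	(%rsi), %ecx
		shlq	$32, %rax
		shlq	$32, %rcx
		movbe	-4(%rdi, %rdx), %edi
		movbe	-4(%rsi, %rdx), %esi
		orq	%rdi, %rax
		orq	%rsi, %rcx
		/* rax - rcx is zero iff the buffers match; CF is set iff
		   rcx > rax (i.e. s1 < s2).  */
		subq	%rcx, %rax
		/* On a mismatch overwrite eax with a known-positive value
		   (edx is in [4, 7]); no branch on the equals case.  */
		cmovne	%edx, %eax
		/* ecx becomes -1 if s1 < s2, else 0.  */
		sbbl	%ecx, %ecx
		/* Fold in the sign: the result is negative, zero, or
		   positive as memcmp requires.  */
		orl	%ecx, %eax
		ret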

    test-memcmp and test-wmemcmp are both passing.

    (cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)

diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -34,7 +34,24 @@
       area.
    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
-   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
+
+When possible the implementation tries to optimize for frontend in the
+following ways:
+Throughput:
+    1. All code sections that fit are able to run optimally out of the
+       LSD.
+    2. All code sections that fit are able to run optimally out of the
+       DSB
+    3. Basic blocks are contained in minimum number of fetch blocks
+       necessary.
+
+Latency:
+    1. Logically connected basic blocks are put in the same
+       cache-line.
+    2. Logically connected basic blocks that do not fit in the same
+       cache-line are put in adjacent lines. This can get beneficial
+       L2 spatial prefetching and L1 next-line prefetching.  */
 
 # include <sysdep.h>
 
@@ -47,9 +64,11 @@
 # ifdef USE_AS_WMEMCMP
 #  define CHAR_SIZE	4
 #  define VPCMP	vpcmpd
+#  define VPTEST	vptestmd
 # else
 #  define CHAR_SIZE	1
 #  define VPCMP	vpcmpub
+#  define VPTEST	vptestmb
 # endif
 
 # define VEC_SIZE	32
@@ -75,7 +94,9 @@
 */
 	.section .text.evex,"ax",@progbits
-ENTRY (MEMCMP)
+/* Cache align memcmp entry. This allows for much more thorough
+   frontend optimization.  */
+ENTRY_P2ALIGN (MEMCMP, 6)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
 	VPCMP	$4, (%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 	/* NB: eax must be destination register if going to
-	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   L(return_vec_[0,2]). For L(return_vec_3) destination register
 	   must be ecx.  */
 	testl	%eax, %eax
 	jnz	L(return_vec_0)
@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
 	testl	%ecx, %ecx
 	jnz	L(return_vec_3)
 
-	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
-	   compare with zero to get a mask is needed.  */
-	vpxorq	%XMM0, %XMM0, %XMM0
-
 	/* Go to 4x VEC loop.  */
 	cmpq	$(CHAR_PER_VEC * 8), %rdx
 	ja	L(more_8x_vec)
@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
 
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
-	   oring with YMM3. Result is stored in YMM4.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	   oring with YMM1. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+
+	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+
+	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
+	 */
+	VPTEST	%YMM4, %YMM4, %k1
+	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
-	/* NB: aligning 32 here allows for the rest of the jump targets
-	   to be tuned for 32 byte alignment. Most important this ensures
-	   the L(more_8x_vec) loop is 32 byte aligned.  */
-	.p2align 5
-L(less_vec):
-	/* Check if one or less CHAR. This is necessary for size = 0 but
-	   is also faster for size = CHAR_SIZE.  */
-	cmpl	$1, %edx
-	jbe	L(one_or_less)
+	.p2align 4
+L(8x_end_return_vec_0_1_2_3):
+	movq	%rdx, %rdi
+L(8x_return_vec_0_1_2_3):
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPTEST	%YMM1, %YMM1, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 
-	/* Check if loading one VEC from either s1 or s2 could cause a
-	   page cross. This can have false positives but is by far the
-	   fastest method.  */
-	movl	%edi, %eax
-	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(page_cross_less_vec)
+	VPTEST	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
 
-	/* No page cross possible.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMP	$4, (%rdi), %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Create mask in ecx for potentially in bound matches.  */
-	bzhil	%edx, %eax, %eax
-	jnz	L(return_vec_0)
+	VPTEST	%YMM3, %YMM3, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	/* bsf saves 1 byte from tzcnt. This keeps L(return_vec_3) in one
+	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
+	   line.  */
+	bsfl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
 	ret
 
 	.p2align 4
@@ -209,10 +240,11 @@ L(return_vec_0):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
-	   which is good enough for a target not in a loop.  */
+	.p2align 4
 L(return_vec_1):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -226,10 +258,11 @@ L(return_vec_1):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
-	   which is good enough for a target not in a loop.  */
+	.p2align 4,, 10
 L(return_vec_2):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -243,40 +276,6 @@ L(return_vec_2):
 # endif
 	ret
 
-	.p2align 4
-L(8x_return_vec_0_1_2_3):
-	/* Returning from L(more_8x_vec) requires restoring rsi.  */
-	addq	%rdi, %rsi
-L(return_vec_0_1_2_3):
-	VPCMP	$4, %YMM1, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
-	VPCMP	$4, %YMM2, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_1)
-
-	VPCMP	$4, %YMM3, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_2)
-L(return_vec_3):
-	tzcntl	%ecx, %ecx
-# ifdef USE_AS_WMEMCMP
-	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
-	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
-	subl	%ecx, %eax
-# endif
-	ret
-
 	.p2align 4
 L(more_8x_vec):
 	/* Set end of s1 in rdx.  */
@@ -288,21 +287,19 @@ L(more_8x_vec):
 	andq	$-VEC_SIZE, %rdi
 	/* Adjust because first 4x vec where check already.  */
 	subq	$-(VEC_SIZE * 4), %rdi
+
 	.p2align 4
L(loop_4x_vec):
 	VMOVU	(%rsi, %rdi), %YMM1
 	vpxorq	(%rdi), %YMM1, %YMM1
-
 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
 	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
-
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(8x_return_vec_0_1_2_3)
@@ -319,28 +316,25 @@ L(loop_4x_vec):
 	cmpl	$(VEC_SIZE * 2), %edi
 	jae	L(8x_last_2x_vec)
 
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+
 	VMOVU	(%rsi, %rdx), %YMM1
 	vpxorq	(%rdx), %YMM1, %YMM1
 
 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
-
-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
-	/* Restore s1 pointer to rdi.  */
-	movq	%rdx, %rdi
 	testl	%ecx, %ecx
-	jnz	L(8x_return_vec_0_1_2_3)
+	jnz	L(8x_end_return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
 	/* Only entry is from L(more_8x_vec).  */
-	.p2align 4
+	.p2align 4,, 10
 L(8x_last_2x_vec):
 	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
 	kmovd	%k1, %eax
@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
 	jnz	L(8x_return_vec_3)
 	ret
 
-	.p2align 4
+	/* Not ideally aligned (at offset +9 bytes in fetch block) but
+	   not aligning keeps it in the same cache line as
+	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
+	   size.  */
+	.p2align 4,, 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 10
 L(last_2x_vec):
 	/* Check second to last VEC.  */
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
@@ -374,26 +392,49 @@ L(last_1x_vec):
 	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
-L(8x_return_vec_2):
-	subq	$VEC_SIZE, %rdx
-L(8x_return_vec_3):
-	tzcntl	%eax, %eax
+	.p2align 4,, 10
+L(return_vec_1_end):
+	/* Use bsf to save code size. This is necessary to have
+	   L(one_or_less) fit in aligning bytes between.  */
+	bsfl	%eax, %eax
+	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
-	movl	(VEC_SIZE * 3)(%rax), %ecx
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	addq	%rdx, %rax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
-	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
+	/* NB: L(one_or_less) fits in alignment padding between
+	   L(return_vec_1_end) and L(return_vec_0_end).  */
+# ifdef USE_AS_WMEMCMP
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	ret
+# else
+L(one_or_less):
+	jb	L(zero)
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+	subl	%ecx, %eax
+	ret
+# endif
+L(zero):
+	xorl	%eax, %eax
+	ret
+
 	.p2align 4
 L(return_vec_0_end):
 	tzcntl	%eax, %eax
@@ -412,23 +453,56 @@ L(return_vec_0_end):
 	ret
 
 	.p2align 4
-L(return_vec_1_end):
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size == 0
+	   but is also faster for size == CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check if any matches were in bounds. Intentionally not
+	   storing result in eax to limit dependency chain if it goes to
+	   L(return_vec_0_lv).  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0_lv)
+	xorl	%eax, %eax
+	ret
+
+	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+	   the jump and ends up fitting in aligning bytes. As well fits on
+	   same cache line as L(less_vec) so also saves a line from having
+	   to be fetched on cold calls to memcmp.  */
+	.p2align 4,, 4
+L(return_vec_0_lv):
 	tzcntl	%eax, %eax
-	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
-	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
-
 	.p2align 4
L(page_cross_less_vec):
 	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
@@ -439,108 +513,84 @@ L(page_cross_less_vec):
 	cmpl	$8, %edx
 	jae	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
-L(between_2_3):
-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero.  */
-	subl	%ecx, %eax
-	ret
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
+	jb	L(between_2_3)
+
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* edx is guaranteed to be positive int32 in range [4, 7].  */
+	cmovne	%edx, %eax
+	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+	sbbl	%ecx, %ecx
+	/* If rax > rcx, then ecx is 0 and eax is positive. If rcx ==
+	   rax then eax and ecx are zero. If rax < rcx then ecx is -1 so
+	   eax doesn't matter.  */
+	orl	%ecx, %eax
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
L(between_8_15):
 # endif
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
-	VMOVU	(%rsi), %XMM2
-	VPCMP	$4, (%rdi), %XMM2, %k1
+
+	/* Use movups to save code size.  */
+	movups	(%rsi), %xmm2
+	VPCMP	$4, (%rdi), %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-
-	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
-	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
-	VPCMP	$4, (%rdi), %XMM2, %k1
+	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-# ifdef USE_AS_WMEMCMP
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movl	(%rdi), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi), %ecx
-	je	L(zero)
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
+	jnz	L(return_vec_0_end)
 	ret
-# else
 
-	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	jz	L(zero_4_7)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-L(zero_4_7):
+# ifndef USE_AS_WMEMCMP
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
 	ret
 # endif
-
 END (MEMCMP)
 #endif