commit 5ec3416853c4150c4d13312e05f93a053586d528
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Tue Sep 21 18:45:03 2021 -0500

    x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
    
    No bug.
    
    The frontend optimizations are to:
    1. Reorganize logically connected basic blocks so they are either in
       the same cache line or in adjacent cache lines.
    2. Avoid cases where basic blocks unnecessarily cross cache lines
       (illustrated by the sketch after this list).
    3. Try to 32-byte align any basic blocks possible without sacrificing
       code size. Smaller / less hot basic blocks are used for this.
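    As a point of reference (illustrative only, not part of the patch),
    whether a basic block starting at address addr with size bytes
    crosses a 64-byte cache line comes down to simple arithmetic; a
    minimal C sketch, using the hypothetical helper name
    block_crosses_cache_line:

        #include <stdbool.h>
        #include <stddef.h>
        #include <stdint.h>

        #define CACHE_LINE_SIZE 64

        /* True if [addr, addr + size) spans more than one 64-byte
           cache line and therefore needs an extra fetch.  */
        static bool
        block_crosses_cache_line (uintptr_t addr, size_t size)
        {
          return (addr % CACHE_LINE_SIZE) + size > CACHE_LINE_SIZE;
        }

    For example, a 20-byte block at offset 48 in its line crosses
    (48 + 20 > 64), while the same block at offset 32 does not.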
    
    Overall code size shrunk by 168 bytes. This should make up for any
    extra costs due to aligning to 64 bytes.
    
    In general, performance before deviated a great deal depending on
    whether entry alignment % 64 was 0, 16, 32, or 48. These changes
    essentially make it so that the current implementation is at least
    equal to the best alignment of the original for any arguments.
    
    The only additional optimization is in the page cross case. The
    branch on the equals case was removed from the size == [4, 7] case.
    As well, the [4, 7] and [2, 3] cases were swapped, as [4, 7] is
    likely a hotter argument size.
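    For illustration (not part of the patch), the [4, 7] case is handled
    with two overlapping 4-byte loads per input, converted to big endian
    so that a single unsigned comparison orders the buffers the same way
    a byte-wise memcmp would; a minimal C sketch for a little-endian
    host using GCC/Clang builtins, with the hypothetical helper name
    memcmp_len_4_7:

        #include <stdint.h>
        #include <string.h>

        /* n must be in [4, 7].  The overlapping loads cover all n
           bytes without branching on n.  */
        static int
        memcmp_len_4_7 (const unsigned char *s1, const unsigned char *s2,
                        size_t n)
        {
          uint32_t a_hi, a_lo, b_hi, b_lo;
          memcpy (&a_hi, s1, 4);
          memcpy (&b_hi, s2, 4);
          memcpy (&a_lo, s1 + n - 4, 4);
          memcpy (&b_lo, s2 + n - 4, 4);

          /* movbe loads big endian directly; model it with bswap.  */
          uint64_t a = ((uint64_t) __builtin_bswap32 (a_hi) << 32)
                       | __builtin_bswap32 (a_lo);
          uint64_t b = ((uint64_t) __builtin_bswap32 (b_hi) << 32)
                       | __builtin_bswap32 (b_lo);

          /* The assembly computes this result with sub/cmovne/sbb/or,
             i.e. without a branch on the equal case.  */
          return (a > b) - (a < b);
        }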
    
    test-memcmp and test-wmemcmp are both passing.
    
    (cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)
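    For reference (illustrative only, not part of the patch), the
    reordered 4x VEC reduction in the diff below (vpxorq plus two
    vpternlogd plus vptestm, replacing the old compare against a zeroed
    YMM0) corresponds roughly to the following AVX-512 (VL + BW)
    intrinsics sketch, using the hypothetical helper name
    chunk_4x32_differs:

        #include <immintrin.h>

        /* Check 4 * 32 = 128 bytes for any byte mismatch.  Compile
           with -mavx512vl -mavx512bw.  */
        static int
        chunk_4x32_differs (const unsigned char *s1,
                            const unsigned char *s2)
        {
          __m256i v1 = _mm256_xor_si256 (
              _mm256_loadu_si256 ((const __m256i *) (s2 + 0)),
              _mm256_loadu_si256 ((const __m256i *) (s1 + 0)));
          __m256i v2 = _mm256_xor_si256 (
              _mm256_loadu_si256 ((const __m256i *) (s2 + 32)),
              _mm256_loadu_si256 ((const __m256i *) (s1 + 32)));
          __m256i v3 = _mm256_xor_si256 (
              _mm256_loadu_si256 ((const __m256i *) (s2 + 64)),
              _mm256_loadu_si256 ((const __m256i *) (s1 + 64)));
          __m256i v4 = _mm256_loadu_si256 ((const __m256i *) (s2 + 96));

          /* vpternlogd $0xde: v4 = (v4 ^ s1[96..127]) | v1.  */
          v4 = _mm256_ternarylogic_epi32 (
              v4, v1, _mm256_loadu_si256 ((const __m256i *) (s1 + 96)),
              0xde);
          /* vpternlogd $0xfe: v4 = v4 | v2 | v3.  */
          v4 = _mm256_ternarylogic_epi32 (v4, v2, v3, 0xfe);

          /* vptestmb: one mask bit per nonzero byte.  */
          return _mm256_test_epi8_mask (v4, v4) != 0;
        }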
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -34,7 +34,24 @@
       area.
    7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
    8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
-   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
+
+When possible the implementation tries to optimize for frontend in the
+following ways:
+Throughput:
+    1. All code sections that fit are able to run optimally out of the
+       LSD.
+    2. All code sections that fit are able to run optimally out of the
+       DSB
+    3. Basic blocks are contained in minimum number of fetch blocks
+       necessary.
+
+Latency:
+    1. Logically connected basic blocks are put in the same
+       cache-line.
+    2. Logically connected basic blocks that do not fit in the same
+       cache-line are put in adjacent lines. This can get beneficial
+       L2 spatial prefetching and L1 next-line prefetching.  */
 
 # include <sysdep.h>
 
@@ -47,9 +64,11 @@
 # ifdef USE_AS_WMEMCMP
 #  define CHAR_SIZE	4
 #  define VPCMP	vpcmpd
+#  define VPTEST	vptestmd
 # else
 #  define CHAR_SIZE	1
 #  define VPCMP	vpcmpub
+#  define VPTEST	vptestmb
 # endif
 
 # define VEC_SIZE	32
@@ -75,7 +94,9 @@
 */
 
 	.section .text.evex,"ax",@progbits
-ENTRY (MEMCMP)
+/* Cache align memcmp entry. This allows for much more thorough
+   frontend optimization.  */
+ENTRY_P2ALIGN (MEMCMP, 6)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
 	VPCMP	$4, (%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 	/* NB: eax must be destination register if going to
-	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   L(return_vec_[0,2]). For L(return_vec_3) destination register
 	   must be ecx.  */
 	testl	%eax, %eax
 	jnz	L(return_vec_0)
@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
 	testl	%ecx, %ecx
 	jnz	L(return_vec_3)
 
-	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
-	   compare with zero to get a mask is needed.  */
-	vpxorq	%XMM0, %XMM0, %XMM0
-
 	/* Go to 4x VEC loop.  */
 	cmpq	$(CHAR_PER_VEC * 8), %rdx
 	ja	L(more_8x_vec)
@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
 
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
-	   oring with YMM3. Result is stored in YMM4.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	   oring with YMM1. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+
+	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+
+	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
+	 */
+	VPTEST	%YMM4, %YMM4, %k1
+	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
-	/* NB: aligning 32 here allows for the rest of the jump targets
-	   to be tuned for 32 byte alignment. Most important this ensures
-	   the L(more_8x_vec) loop is 32 byte aligned.  */
-	.p2align 5
-L(less_vec):
-	/* Check if one or less CHAR. This is necessary for size = 0 but
-	   is also faster for size = CHAR_SIZE.  */
-	cmpl	$1, %edx
-	jbe	L(one_or_less)
+	.p2align 4
+L(8x_end_return_vec_0_1_2_3):
+	movq	%rdx, %rdi
+L(8x_return_vec_0_1_2_3):
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPTEST	%YMM1, %YMM1, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
 
-	/* Check if loading one VEC from either s1 or s2 could cause a
-	   page cross. This can have false positives but is by far the
-	   fastest method.  */
-	movl	%edi, %eax
-	orl	%esi, %eax
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(page_cross_less_vec)
+	VPTEST	%YMM2, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
 
-	/* No page cross possible.  */
-	VMOVU	(%rsi), %YMM2
-	VPCMP	$4, (%rdi), %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Create mask in ecx for potentially in bound matches.  */
-	bzhil	%edx, %eax, %eax
-	jnz	L(return_vec_0)
+	VPTEST	%YMM3, %YMM3, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	/* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
+	   fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
+	   line.  */
+	bsfl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
 	ret
 
 	.p2align 4
@@ -209,10 +240,11 @@ L(return_vec_0):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
-	   which is good enough for a target not in a loop.  */
+	.p2align 4
 L(return_vec_1):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -226,10 +258,11 @@ L(return_vec_1):
 # endif
 	ret
 
-	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
-	   which is good enough for a target not in a loop.  */
+	.p2align 4,, 10
 L(return_vec_2):
-	tzcntl	%eax, %eax
+	/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
+	   fetch block.  */
+	bsfl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
@@ -243,40 +276,6 @@ L(return_vec_2):
 # endif
 	ret
 
-	.p2align 4
-L(8x_return_vec_0_1_2_3):
-	/* Returning from L(more_8x_vec) requires restoring rsi.  */
-	addq	%rdi, %rsi
-L(return_vec_0_1_2_3):
-	VPCMP	$4, %YMM1, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
-	VPCMP	$4, %YMM2, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_1)
-
-	VPCMP	$4, %YMM3, %YMM0, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(return_vec_2)
-L(return_vec_3):
-	tzcntl	%ecx, %ecx
-# ifdef USE_AS_WMEMCMP
-	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
-	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
-	subl	%ecx, %eax
-# endif
-	ret
-
 	.p2align 4
 L(more_8x_vec):
 	/* Set end of s1 in rdx.  */
@@ -288,21 +287,19 @@ L(more_8x_vec):
 	andq	$-VEC_SIZE, %rdi
 	/* Adjust because first 4x vec where check already.  */
 	subq	$-(VEC_SIZE * 4), %rdi
+
 	.p2align 4
 L(loop_4x_vec):
 	VMOVU	(%rsi, %rdi), %YMM1
 	vpxorq	(%rdi), %YMM1, %YMM1
-
 	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
 	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
-
 	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(8x_return_vec_0_1_2_3)
@@ -319,28 +316,25 @@ L(loop_4x_vec):
 	cmpl	$(VEC_SIZE * 2), %edi
 	jae	L(8x_last_2x_vec)
 
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+
 	VMOVU	(%rsi, %rdx), %YMM1
 	vpxorq	(%rdx), %YMM1, %YMM1
 
 	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
 	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
-
-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
 	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
-	VPCMP	$4, %YMM4, %YMM0, %k1
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	VPTEST	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
-	/* Restore s1 pointer to rdi.  */
-	movq	%rdx, %rdi
 	testl	%ecx, %ecx
-	jnz	L(8x_return_vec_0_1_2_3)
+	jnz	L(8x_end_return_vec_0_1_2_3)
 	/* NB: eax must be zero to reach here.  */
 	ret
 
 	/* Only entry is from L(more_8x_vec).  */
-	.p2align 4
+	.p2align 4,, 10
 L(8x_last_2x_vec):
 	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
 	kmovd	%k1, %eax
@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
 	jnz	L(8x_return_vec_3)
 	ret
 
-	.p2align 4
+	/* Not ideally aligned (at offset +9 bytes in fetch block) but
+	   not aligning keeps it in the same cache line as
+	   L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
+	   size.  */
+	.p2align 4,, 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 10
 L(last_2x_vec):
 	/* Check second to last VEC.  */
 	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
@@ -374,26 +392,49 @@ L(last_1x_vec):
 	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
-L(8x_return_vec_2):
-	subq	$VEC_SIZE, %rdx
-L(8x_return_vec_3):
-	tzcntl	%eax, %eax
+	.p2align 4,, 10
+L(return_vec_1_end):
+	/* Use bsf to save code size. This is necessary to have
+	   L(one_or_less) fit in aligning bytes between.  */
+	bsfl	%eax, %eax
+	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
-	movl	(VEC_SIZE * 3)(%rax), %ecx
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	addq	%rdx, %rax
-	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
-	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
+	/* NB: L(one_or_less) fits in alignment padding between
+	   L(return_vec_1_end) and L(return_vec_0_end).  */
+# ifdef USE_AS_WMEMCMP
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	ret
+# else
+L(one_or_less):
+	jb	L(zero)
+	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+	subl	%ecx, %eax
+	ret
+# endif
+L(zero):
+	xorl	%eax, %eax
+	ret
+
 	.p2align 4
 L(return_vec_0_end):
 	tzcntl	%eax, %eax
@@ -412,23 +453,56 @@ L(return_vec_0_end):
 	ret
 
 	.p2align 4
-L(return_vec_1_end):
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size == 0
+	   but is also faster for size == CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Check if any matches where in bounds. Intentionally not
+	   storing result in eax to limit dependency chain if it goes to
+	   L(return_vec_0_lv).  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0_lv)
+	xorl	%eax, %eax
+	ret
+
+	/* Essentially duplicate of L(return_vec_0). Ends up not costing
+	   any code as shrinks L(less_vec) by allowing 2-byte encoding of
+	   the jump and ends up fitting in aligning bytes. As well fits on
+	   same cache line as L(less_vec) so also saves a line from having
+	   to be fetched on cold calls to memcmp.  */
+	.p2align 4,, 4
+L(return_vec_0_lv):
 	tzcntl	%eax, %eax
-	addl	%edx, %eax
 # ifdef USE_AS_WMEMCMP
-	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 	xorl	%edx, %edx
-	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
 	setg	%dl
 	leal	-1(%rdx, %rdx), %eax
 # else
-	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
-	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
 	subl	%ecx, %eax
 # endif
 	ret
 
-
 	.p2align 4
 L(page_cross_less_vec):
 	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
@@ -439,108 +513,84 @@ L(page_cross_less_vec):
 	cmpl	$8, %edx
 	jae	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
-L(between_2_3):
-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero.  */
-	subl	%ecx, %eax
-	ret
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movzbl	(%rsi), %ecx
-	movzbl	(%rdi), %eax
-	subl	%ecx, %eax
+	jb	L(between_2_3)
+
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* edx is guranteed to be positive int32 in range [4, 7].  */
+	cmovne	%edx, %eax
+	/* ecx is -1 if rcx > rax. Otherwise 0.  */
+	sbbl	%ecx, %ecx
+	/* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
+	   rax then eax and ecx are zero. If rax < rax then ecx is -1 so
+	   eax doesn't matter.  */
+	orl	%ecx, %eax
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(between_8_15):
 # endif
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
-	vmovq	(%rdi), %XMM1
-	vmovq	(%rsi), %XMM2
-	VPCMP	$4, %XMM1, %XMM2, %k1
+	vmovq	-8(%rdi, %rdx, CHAR_SIZE), %xmm1
+	vmovq	-8(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, %xmm1, %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
+	jnz	L(return_vec_0_end)
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
-	VMOVU	(%rsi), %XMM2
-	VPCMP	$4, (%rdi), %XMM2, %k1
+
+	/* Use movups to save code size.  */
+	movups	(%rsi), %xmm2
+	VPCMP	$4, (%rdi), %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-
+	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-
-	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
-	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
-	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
-	VPCMP	$4, (%rdi), %XMM2, %k1
+	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 	testl	%eax, %eax
-	jnz	L(return_vec_0)
-	ret
-
-# ifdef USE_AS_WMEMCMP
-	.p2align 4
-L(one_or_less):
-	jb	L(zero)
-	movl	(%rdi), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi), %ecx
-	je	L(zero)
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
+	jnz	L(return_vec_0_end)
 	ret
-# else
 
-	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	jz	L(zero_4_7)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-L(zero_4_7):
+# ifndef USE_AS_WMEMCMP
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax
+	bswap	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
 	ret
 # endif
-
 END (MEMCMP)
 #endif