commit 0d5b36c8cc15f064e302d29692853f8a760e1547
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jan 10 15:35:38 2022 -0600

    x86: Optimize strcmp-avx2.S

    Optimizations are primarily to the loop logic and to how the page
    cross logic interacts with the loop.

    The page cross logic is at times more expensive for short strings near
    the end of a page but not crossing the page. This is done to retest
    the page cross conditions with a non-faulting check and to improve the
    logic for entering the loop afterwards. This affects only particular
    cases, however, and is generally made up for by more than 10x
    improvements on the transition from the page cross -> loop case.

    The non-page cross cases are improved most for smaller sizes [0, 128]
    and are about even for (128, 4096]. The loop page cross logic is
    improved so a more significant speedup is seen there as well.

    test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.

    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
    (cherry picked from commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45)

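The core of the new non-page-cross path is one fused check per 32-byte
vector: compare s1 against s2, compare s1 against zero, and combine the
two results with vpandn so a single vpmovmskb mask answers "any
mismatch?" and "any null?" at once.  The following rough C intrinsics
sketch (illustrative only, not taken from the commit; the helper name
check_one_vec is made up) shows that per-vector step for the byte/strcmp
flavor.  It ignores the page-cross safety the assembly establishes
before issuing 32-byte loads.

#include <immintrin.h>
#include <stdint.h>

/* One 32-byte step of the comparison kernel (byte flavor).  Returns -1
   when the two vectors are equal and contain no null byte (caller keeps
   going), otherwise the index of the first mismatch or null CHAR.  */
static inline int
check_one_vec (const char *s1, const char *s2)
{
  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
  __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
  /* 0xff where s1 and s2 are equal (VPCMPEQ).  */
  __m256i eq = _mm256_cmpeq_epi8 (v2, v1);
  /* 0xff at null CHARs of s1.  */
  __m256i nz = _mm256_cmpeq_epi8 (v1, _mm256_setzero_si256 ());
  /* 0xff where equal AND not null (vpandn).  */
  __m256i ok = _mm256_andnot_si256 (nz, eq);
  uint32_t m = (uint32_t) _mm256_movemask_epi8 (ok);
  /* All-ones overflows to zero on +1 (the `incl %ecx; jz` trick);
     otherwise the carry stops at the first 0 bit, i.e. the first
     mismatch or null, which the trailing-zero count then recovers.  */
  m += 1;
  if (m == 0)
    return -1;
  return (int) __builtin_ctz (m);
}

The same pattern appears in L(no_page_cross), L(more_3x_vec), and the
narrower xmm/vmovq/vmovd probes used near page boundaries.
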
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
076f82
index fa70c994fc25dfd8..a0d1c65db11028bc 100644
076f82
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
076f82
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
076f82
@@ -26,35 +26,57 @@
076f82
 
076f82
 # define PAGE_SIZE	4096
076f82
 
076f82
-/* VEC_SIZE = Number of bytes in a ymm register */
076f82
+	/* VEC_SIZE = Number of bytes in a ymm register.  */
076f82
 # define VEC_SIZE	32
076f82
 
076f82
-/* Shift for dividing by (VEC_SIZE * 4).  */
076f82
-# define DIVIDE_BY_VEC_4_SHIFT	7
076f82
-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
076f82
-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
076f82
-# endif
076f82
+# define VMOVU	vmovdqu
076f82
+# define VMOVA	vmovdqa
076f82
 
076f82
 # ifdef USE_AS_WCSCMP
076f82
-/* Compare packed dwords.  */
076f82
+	/* Compare packed dwords.  */
076f82
 #  define VPCMPEQ	vpcmpeqd
076f82
-/* Compare packed dwords and store minimum.  */
076f82
+	/* Compare packed dwords and store minimum.  */
076f82
 #  define VPMINU	vpminud
076f82
-/* 1 dword char == 4 bytes.  */
076f82
+	/* 1 dword char == 4 bytes.  */
076f82
 #  define SIZE_OF_CHAR	4
076f82
 # else
076f82
-/* Compare packed bytes.  */
076f82
+	/* Compare packed bytes.  */
076f82
 #  define VPCMPEQ	vpcmpeqb
076f82
-/* Compare packed bytes and store minimum.  */
076f82
+	/* Compare packed bytes and store minimum.  */
076f82
 #  define VPMINU	vpminub
076f82
-/* 1 byte char == 1 byte.  */
076f82
+	/* 1 byte char == 1 byte.  */
076f82
 #  define SIZE_OF_CHAR	1
076f82
 # endif
076f82
 
076f82
+# ifdef USE_AS_STRNCMP
076f82
+#  define LOOP_REG	r9d
076f82
+#  define LOOP_REG64	r9
076f82
+
076f82
+#  define OFFSET_REG8	r9b
076f82
+#  define OFFSET_REG	r9d
076f82
+#  define OFFSET_REG64	r9
076f82
+# else
076f82
+#  define LOOP_REG	edx
076f82
+#  define LOOP_REG64	rdx
076f82
+
076f82
+#  define OFFSET_REG8	dl
076f82
+#  define OFFSET_REG	edx
076f82
+#  define OFFSET_REG64	rdx
076f82
+# endif
076f82
+
076f82
 # ifndef VZEROUPPER
076f82
 #  define VZEROUPPER	vzeroupper
076f82
 # endif
076f82
 
076f82
+# if defined USE_AS_STRNCMP
076f82
+#  define VEC_OFFSET	0
076f82
+# else
076f82
+#  define VEC_OFFSET	(-VEC_SIZE)
076f82
+# endif
076f82
+
076f82
+# define xmmZERO	xmm15
076f82
+# define ymmZERO	ymm15
076f82
+
076f82
 # ifndef SECTION
076f82
 #  define SECTION(p)	p##.avx
076f82
 # endif
076f82
@@ -79,783 +101,1049 @@
076f82
    the maximum offset is reached before a difference is found, zero is
076f82
    returned.  */
076f82
 
076f82
-	.section SECTION(.text),"ax",@progbits
076f82
-ENTRY (STRCMP)
076f82
+	.section SECTION(.text), "ax", @progbits
076f82
+ENTRY(STRCMP)
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Check for simple cases (0 or 1) in offset.  */
076f82
+#  ifdef __ILP32__
076f82
+	/* Clear the upper 32 bits.  */
076f82
+	movl	%edx, %rdx
076f82
+#  endif
076f82
 	cmp	$1, %RDX_LP
076f82
-	je	L(char0)
076f82
-	jb	L(zero)
076f82
+	/* Signed comparison intentional. We use this branch to also
076f82
+	   test cases where length >= 2^63. These very large sizes can be
076f82
+	   handled with strcmp as there is no way for that length to
076f82
+	   actually bound the buffer.  */
076f82
+	jle	L(one_or_less)
076f82
 #  ifdef USE_AS_WCSCMP
076f82
-#  ifndef __ILP32__
076f82
 	movq	%rdx, %rcx
076f82
-	/* Check if length could overflow when multiplied by
076f82
-	   sizeof(wchar_t). Checking top 8 bits will cover all potential
076f82
-	   overflow cases as well as redirect cases where its impossible to
076f82
-	   length to bound a valid memory region. In these cases just use
076f82
-	   'wcscmp'.  */
076f82
+
076f82
+	/* Multiplying length by sizeof(wchar_t) can result in overflow.
076f82
+	   Check if that is possible. All cases where overflow is possible
076f82
+	   are cases where length is large enough that it can never be a
076f82
+	   bound on valid memory so just use wcscmp.  */
076f82
 	shrq	$56, %rcx
076f82
-	jnz	OVERFLOW_STRCMP
076f82
-#  endif
076f82
-	/* Convert units: from wide to byte char.  */
076f82
-	shl	$2, %RDX_LP
076f82
+	jnz	__wcscmp_avx2
076f82
+
076f82
+	leaq	(, %rdx, 4), %rdx
076f82
 #  endif
076f82
-	/* Register %r11 tracks the maximum offset.  */
076f82
-	mov	%RDX_LP, %R11_LP
076f82
 # endif
076f82
+	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
076f82
 	movl	%edi, %eax
076f82
-	xorl	%edx, %edx
076f82
-	/* Make %xmm7 (%ymm7) all zeros in this function.  */
076f82
-	vpxor	%xmm7, %xmm7, %xmm7
076f82
 	orl	%esi, %eax
076f82
-	andl	$(PAGE_SIZE - 1), %eax
076f82
-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
076f82
-	jg	L(cross_page)
076f82
-	/* Start comparing 4 vectors.  */
076f82
-	vmovdqu	(%rdi), %ymm1
076f82
-	VPCMPEQ	(%rsi), %ymm1, %ymm0
076f82
-	VPMINU	%ymm1, %ymm0, %ymm0
076f82
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
076f82
-	vpmovmskb %ymm0, %ecx
076f82
-	testl	%ecx, %ecx
076f82
-	je	L(next_3_vectors)
076f82
-	tzcntl	%ecx, %edx
076f82
+	sall	$20, %eax
076f82
+	/* Check if s1 or s2 may cross a page in the next 4x VEC loads.  */
076f82
+	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
076f82
+	ja	L(page_cross)
076f82
+
076f82
+L(no_page_cross):
076f82
+	/* Safe to compare 4x vectors.  */
076f82
+	VMOVU	(%rdi), %ymm0
076f82
+	/* 1s where s1 and s2 equal.  */
076f82
+	VPCMPEQ	(%rsi), %ymm0, %ymm1
076f82
+	/* 1s at null CHAR.  */
076f82
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
076f82
+	/* 1s where s1 and s2 equal AND not null CHAR.  */
076f82
+	vpandn	%ymm1, %ymm2, %ymm1
076f82
+
076f82
+	/* All 1s -> keep going, any 0s -> return.  */
076f82
+	vpmovmskb %ymm1, %ecx
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the mismatched index (%rdx) is after the maximum
076f82
-	   offset (%r11).   */
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
+	cmpq	$VEC_SIZE, %rdx
076f82
+	jbe	L(vec_0_test_len)
076f82
 # endif
076f82
+
076f82
+	/* All 1s represents all equals. incl will overflow to zero in
076f82
+	   all equals case. Otherwise 1s will carry until position of first
076f82
+	   mismatch.  */
076f82
+	incl	%ecx
076f82
+	jz	L(more_3x_vec)
076f82
+
076f82
+	.p2align 4,, 4
076f82
+L(return_vec_0):
076f82
+	tzcntl	%ecx, %ecx
076f82
 # ifdef USE_AS_WCSCMP
076f82
+	movl	(%rdi, %rcx), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rdi, %rdx), %ecx
076f82
-	cmpl	(%rsi, %rdx), %ecx
076f82
-	je	L(return)
076f82
-L(wcscmp_return):
076f82
+	cmpl	(%rsi, %rcx), %edx
076f82
+	je	L(ret0)
076f82
 	setl	%al
076f82
 	negl	%eax
076f82
 	orl	$1, %eax
076f82
-L(return):
076f82
 # else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
+	movzbl	(%rdi, %rcx), %eax
076f82
+	movzbl	(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
 # endif
076f82
+L(ret0):
076f82
 L(return_vzeroupper):
076f82
 	ZERO_UPPER_VEC_REGISTERS_RETURN
076f82
 
076f82
-	.p2align 4
076f82
-L(return_vec_size):
076f82
-	tzcntl	%ecx, %edx
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
076f82
-	   the maximum offset (%r11).  */
076f82
-	addq	$VEC_SIZE, %rdx
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
+	.p2align 4,, 8
076f82
+L(vec_0_test_len):
076f82
+	notl	%ecx
076f82
+	bzhil	%edx, %ecx, %eax
076f82
+	jnz	L(return_vec_0)
076f82
+	/* Align if will cross fetch block.  */
076f82
+	.p2align 4,, 2
076f82
+L(ret_zero):
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rdi, %rdx), %ecx
076f82
-	cmpl	(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
-# else
076f82
+	VZEROUPPER_RETURN
076f82
+
076f82
+	.p2align 4,, 5
076f82
+L(one_or_less):
076f82
+	jb	L(ret_zero)
076f82
 #  ifdef USE_AS_WCSCMP
076f82
+	/* 'nbe' covers the case where length is negative (large
076f82
+	   unsigned).  */
076f82
+	jnbe	__wcscmp_avx2
076f82
+	movl	(%rdi), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	VEC_SIZE(%rdi, %rdx), %ecx
076f82
-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
+	cmpl	(%rsi), %edx
076f82
+	je	L(ret1)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	orl	$1, %eax
076f82
 #  else
076f82
-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
076f82
-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
+	/* 'nbe' covers the case where length is negative (large
076f82
+	   unsigned).  */
076f82
+
076f82
+	jnbe	__strcmp_avx2
076f82
+	movzbl	(%rdi), %eax
076f82
+	movzbl	(%rsi), %ecx
076f82
+	subl	%ecx, %eax
076f82
 #  endif
076f82
+L(ret1):
076f82
+	ret
076f82
 # endif
076f82
-	VZEROUPPER_RETURN
076f82
 
076f82
-	.p2align 4
076f82
-L(return_2_vec_size):
076f82
-	tzcntl	%ecx, %edx
076f82
+	.p2align 4,, 10
076f82
+L(return_vec_1):
076f82
+	tzcntl	%ecx, %ecx
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
076f82
-	   after the maximum offset (%r11).  */
076f82
-	addq	$(VEC_SIZE * 2), %rdx
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
+	/* rdx must be > CHAR_PER_VEC so safe to subtract without fear of
076f82
+	   overflow.  */
076f82
+	addq	$-VEC_SIZE, %rdx
076f82
+	cmpq	%rcx, %rdx
076f82
+	jbe	L(ret_zero)
076f82
+# endif
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	movl	VEC_SIZE(%rdi, %rcx), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rdi, %rdx), %ecx
076f82
-	cmpl	(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
076f82
+	je	L(ret2)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	orl	$1, %eax
076f82
 # else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	xorl	%eax, %eax
076f82
-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
076f82
-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
076f82
-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
076f82
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
 # endif
076f82
+L(ret2):
076f82
 	VZEROUPPER_RETURN
076f82
 
076f82
-	.p2align 4
076f82
-L(return_3_vec_size):
076f82
-	tzcntl	%ecx, %edx
076f82
+	.p2align 4,, 10
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
076f82
-	   after the maximum offset (%r11).  */
076f82
-	addq	$(VEC_SIZE * 3), %rdx
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
+L(return_vec_3):
076f82
+	salq	$32, %rcx
076f82
+# endif
076f82
+
076f82
+L(return_vec_2):
076f82
+# ifndef USE_AS_STRNCMP
076f82
+	tzcntl	%ecx, %ecx
076f82
+# else
076f82
+	tzcntq	%rcx, %rcx
076f82
+	cmpq	%rcx, %rdx
076f82
+	jbe	L(ret_zero)
076f82
+# endif
076f82
+
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rdi, %rdx), %ecx
076f82
-	cmpl	(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
076f82
+	je	L(ret3)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	orl	$1, %eax
076f82
 # else
076f82
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
076f82
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
+# endif
076f82
+L(ret3):
076f82
+	VZEROUPPER_RETURN
076f82
+
076f82
+# ifndef USE_AS_STRNCMP
076f82
+	.p2align 4,, 10
076f82
+L(return_vec_3):
076f82
+	tzcntl	%ecx, %ecx
076f82
 #  ifdef USE_AS_WCSCMP
076f82
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
076f82
-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
076f82
+	je	L(ret4)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	orl	$1, %eax
076f82
 #  else
076f82
-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
076f82
-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
076f82
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
 #  endif
076f82
-# endif
076f82
+L(ret4):
076f82
 	VZEROUPPER_RETURN
076f82
+# endif
076f82
+
076f82
+	.p2align 4,, 10
076f82
+L(more_3x_vec):
076f82
+	/* Safe to compare 4x vectors.  */
076f82
+	VMOVU	VEC_SIZE(%rdi), %ymm0
076f82
+	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
076f82
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
076f82
+	vpandn	%ymm1, %ymm2, %ymm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incl	%ecx
076f82
+	jnz	L(return_vec_1)
076f82
+
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	subq	$(VEC_SIZE * 2), %rdx
076f82
+	jbe	L(ret_zero)
076f82
+# endif
076f82
+
076f82
+	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
076f82
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
076f82
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
076f82
+	vpandn	%ymm1, %ymm2, %ymm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incl	%ecx
076f82
+	jnz	L(return_vec_2)
076f82
+
076f82
+	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
076f82
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
076f82
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
076f82
+	vpandn	%ymm1, %ymm2, %ymm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incl	%ecx
076f82
+	jnz	L(return_vec_3)
076f82
 
076f82
-	.p2align 4
076f82
-L(next_3_vectors):
076f82
-	vmovdqu	VEC_SIZE(%rdi), %ymm6
076f82
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
076f82
-	VPMINU	%ymm6, %ymm3, %ymm3
076f82
-	VPCMPEQ	%ymm7, %ymm3, %ymm3
076f82
-	vpmovmskb %ymm3, %ecx
076f82
-	testl	%ecx, %ecx
076f82
-	jne	L(return_vec_size)
076f82
-	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
076f82
-	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
076f82
-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
076f82
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
076f82
-	VPMINU	%ymm5, %ymm2, %ymm2
076f82
-	VPCMPEQ	%ymm4, %ymm0, %ymm0
076f82
-	VPCMPEQ	%ymm7, %ymm2, %ymm2
076f82
-	vpmovmskb %ymm2, %ecx
076f82
-	testl	%ecx, %ecx
076f82
-	jne	L(return_2_vec_size)
076f82
-	VPMINU	%ymm4, %ymm0, %ymm0
076f82
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
076f82
-	vpmovmskb %ymm0, %ecx
076f82
-	testl	%ecx, %ecx
076f82
-	jne	L(return_3_vec_size)
076f82
-L(main_loop_header):
076f82
-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
076f82
-	movl	$PAGE_SIZE, %ecx
076f82
-	/* Align load via RAX.  */
076f82
-	andq	$-(VEC_SIZE * 4), %rdx
076f82
-	subq	%rdi, %rdx
076f82
-	leaq	(%rdi, %rdx), %rax
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Starting from this point, the maximum offset, or simply the
076f82
-	   'offset', DECREASES by the same amount when base pointers are
076f82
-	   moved forward.  Return 0 when:
076f82
-	     1) On match: offset <= the matched vector index.
076f82
-	     2) On mistmach, offset is before the mistmatched index.
076f82
+	cmpq	$(VEC_SIZE * 2), %rdx
076f82
+	jbe	L(ret_zero)
076f82
+# endif
076f82
+
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	/* Any non-zero positive value that doesn't interfere with 0x1.
076f82
 	 */
076f82
-	subq	%rdx, %r11
076f82
-	jbe	L(zero)
076f82
-# endif
076f82
-	addq	%rsi, %rdx
076f82
-	movq	%rdx, %rsi
076f82
-	andl	$(PAGE_SIZE - 1), %esi
076f82
-	/* Number of bytes before page crossing.  */
076f82
-	subq	%rsi, %rcx
076f82
-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
076f82
-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
076f82
-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
076f82
-	movl	%ecx, %esi
076f82
-	jmp	L(loop_start)
076f82
+	movl	$2, %r8d
076f82
 
076f82
+# else
076f82
+	xorl	%r8d, %r8d
076f82
+# endif
076f82
+
076f82
+	/* The prepare labels are various entry points from the page
076f82
+	   cross logic.  */
076f82
+L(prepare_loop):
076f82
+
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
076f82
+	   the loop.  */
076f82
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
076f82
+# endif
076f82
+L(prepare_loop_no_len):
076f82
+
076f82
+	/* Align s1 and adjust s2 accordingly.  */
076f82
+	subq	%rdi, %rsi
076f82
+	andq	$-(VEC_SIZE * 4), %rdi
076f82
+	addq	%rdi, %rsi
076f82
+
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	subq	%rdi, %rdx
076f82
+# endif
076f82
+
076f82
+L(prepare_loop_aligned):
076f82
+	/* eax stores distance from rsi to next page cross. These cases
076f82
+	   need to be handled specially as the 4x loop could potentially
076f82
+	   read memory past the length of s1 or s2 and across a page
076f82
+	   boundary.  */
076f82
+	movl	$-(VEC_SIZE * 4), %eax
076f82
+	subl	%esi, %eax
076f82
+	andl	$(PAGE_SIZE - 1), %eax
076f82
+
076f82
+	/* Loop 4x comparisons at a time.  */
076f82
 	.p2align 4
076f82
 L(loop):
076f82
+
076f82
+	/* End condition for strncmp.  */
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
076f82
-	   the maximum offset (%r11) by the same amount.  */
076f82
-	subq	$(VEC_SIZE * 4), %r11
076f82
-	jbe	L(zero)
076f82
-# endif
076f82
-	addq	$(VEC_SIZE * 4), %rax
076f82
-	addq	$(VEC_SIZE * 4), %rdx
076f82
-L(loop_start):
076f82
-	testl	%esi, %esi
076f82
-	leal	-1(%esi), %esi
076f82
-	je	L(loop_cross_page)
076f82
-L(back_to_loop):
076f82
-	/* Main loop, comparing 4 vectors are a time.  */
076f82
-	vmovdqa	(%rax), %ymm0
076f82
-	vmovdqa	VEC_SIZE(%rax), %ymm3
076f82
-	VPCMPEQ	(%rdx), %ymm0, %ymm4
076f82
-	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
076f82
-	VPMINU	%ymm0, %ymm4, %ymm4
076f82
-	VPMINU	%ymm3, %ymm1, %ymm1
076f82
-	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
076f82
-	VPMINU	%ymm1, %ymm4, %ymm0
076f82
-	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
076f82
-	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
076f82
-	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
076f82
-	VPMINU	%ymm2, %ymm5, %ymm5
076f82
-	VPMINU	%ymm3, %ymm6, %ymm6
076f82
-	VPMINU	%ymm5, %ymm0, %ymm0
076f82
-	VPMINU	%ymm6, %ymm0, %ymm0
076f82
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
076f82
-
076f82
-	/* Test each mask (32 bits) individually because for VEC_SIZE
076f82
-	   == 32 is not possible to OR the four masks and keep all bits
076f82
-	   in a 64-bit integer register, differing from SSE2 strcmp
076f82
-	   where ORing is possible.  */
076f82
-	vpmovmskb %ymm0, %ecx
076f82
+	subq	$(VEC_SIZE * 4), %rdx
076f82
+	jbe	L(ret_zero)
076f82
+# endif
076f82
+
076f82
+	subq	$-(VEC_SIZE * 4), %rdi
076f82
+	subq	$-(VEC_SIZE * 4), %rsi
076f82
+
076f82
+	/* Check if rsi loads will cross a page boundary.  */
076f82
+	addl	$-(VEC_SIZE * 4), %eax
076f82
+	jnb	L(page_cross_during_loop)
076f82
+
076f82
+	/* Loop entry after handling page cross during loop.  */
076f82
+L(loop_skip_page_cross_check):
076f82
+	VMOVA	(VEC_SIZE * 0)(%rdi), %ymm0
076f82
+	VMOVA	(VEC_SIZE * 1)(%rdi), %ymm2
076f82
+	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
076f82
+	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
076f82
+
076f82
+	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
076f82
+	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
076f82
+
076f82
+	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
076f82
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
076f82
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
076f82
+
076f82
+
076f82
+	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
076f82
+	   zero.  */
076f82
+	vpand	%ymm0, %ymm1, %ymm1
076f82
+
076f82
+
076f82
+	vpand	%ymm2, %ymm3, %ymm3
076f82
+	vpand	%ymm4, %ymm5, %ymm5
076f82
+	vpand	%ymm6, %ymm7, %ymm7
076f82
+
076f82
+	VPMINU	%ymm1, %ymm3, %ymm3
076f82
+	VPMINU	%ymm5, %ymm7, %ymm7
076f82
+
076f82
+	/* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
076f82
+	VPMINU	%ymm3, %ymm7, %ymm7
076f82
+
076f82
+	/* If any 0 CHAR then done.  */
076f82
+	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
076f82
+	vpmovmskb %ymm7, %LOOP_REG
076f82
+	testl	%LOOP_REG, %LOOP_REG
076f82
+	jz	L(loop)
076f82
+
076f82
+	/* Find which VEC has the mismatch or end of string.  */
076f82
+	VPCMPEQ	%ymm1, %ymmZERO, %ymm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
 	testl	%ecx, %ecx
076f82
-	je	L(loop)
076f82
-	VPCMPEQ	%ymm7, %ymm4, %ymm0
076f82
-	vpmovmskb %ymm0, %edi
076f82
-	testl	%edi, %edi
076f82
-	je	L(test_vec)
076f82
-	tzcntl	%edi, %ecx
076f82
+	jnz	L(return_vec_0_end)
076f82
+
076f82
+
076f82
+	VPCMPEQ	%ymm3, %ymmZERO, %ymm3
076f82
+	vpmovmskb %ymm3, %ecx
076f82
+	testl	%ecx, %ecx
076f82
+	jnz	L(return_vec_1_end)
076f82
+
076f82
+L(return_vec_2_3_end):
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	cmpq	%rcx, %r11
076f82
-	jbe	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
+	subq	$(VEC_SIZE * 2), %rdx
076f82
+	jbe	L(ret_zero_end)
076f82
+# endif
076f82
+
076f82
+	VPCMPEQ	%ymm5, %ymmZERO, %ymm5
076f82
+	vpmovmskb %ymm5, %ecx
076f82
+	testl	%ecx, %ecx
076f82
+	jnz	L(return_vec_2_end)
076f82
+
076f82
+	/* LOOP_REG contains matches for null/mismatch from the loop. If
076f82
+	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
076f82
+	   must entirely be from VEC 3 which is fully represented by
076f82
+	   LOOP_REG.  */
076f82
+	tzcntl	%LOOP_REG, %LOOP_REG
076f82
+
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	subl	$-(VEC_SIZE), %LOOP_REG
076f82
+	cmpq	%LOOP_REG64, %rdx
076f82
+	jbe	L(ret_zero_end)
076f82
+# endif
076f82
+
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	movl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %edi
076f82
-	cmpl	(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	cmpl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
076f82
+	je	L(ret5)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	xorl	%r8d, %eax
076f82
 # else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %edi
076f82
-	cmpl	(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
076f82
+	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 # endif
076f82
+L(ret5):
076f82
 	VZEROUPPER_RETURN
076f82
 
076f82
-	.p2align 4
076f82
-L(test_vec):
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* The first vector matched.  Return 0 if the maximum offset
076f82
-	   (%r11) <= VEC_SIZE.  */
076f82
-	cmpq	$VEC_SIZE, %r11
076f82
-	jbe	L(zero)
076f82
+	.p2align 4,, 2
076f82
+L(ret_zero_end):
076f82
+	xorl	%eax, %eax
076f82
+	VZEROUPPER_RETURN
076f82
 # endif
076f82
-	VPCMPEQ	%ymm7, %ymm1, %ymm1
076f82
-	vpmovmskb %ymm1, %ecx
076f82
-	testl	%ecx, %ecx
076f82
-	je	L(test_2_vec)
076f82
-	tzcntl	%ecx, %edi
076f82
+
076f82
+
076f82
+	/* The L(return_vec_N_end) labels differ from L(return_vec_N) in that
076f82
+	   they use the value of `r8` to negate the return value. This is
076f82
+	   because the page cross logic can swap `rdi` and `rsi`.  */
076f82
+	.p2align 4,, 10
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	addq	$VEC_SIZE, %rdi
076f82
-	cmpq	%rdi, %r11
076f82
-	jbe	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
+L(return_vec_1_end):
076f82
+	salq	$32, %rcx
076f82
+# endif
076f82
+L(return_vec_0_end):
076f82
+# ifndef USE_AS_STRNCMP
076f82
+	tzcntl	%ecx, %ecx
076f82
+# else
076f82
+	tzcntq	%rcx, %rcx
076f82
+	cmpq	%rcx, %rdx
076f82
+	jbe	L(ret_zero_end)
076f82
+# endif
076f82
+
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	movl	(%rdi, %rcx), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rdi), %ecx
076f82
-	cmpl	(%rdx, %rdi), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rdi), %eax
076f82
-	movzbl	(%rdx, %rdi), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	cmpl	(%rsi, %rcx), %edx
076f82
+	je	L(ret6)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	xorl	%r8d, %eax
076f82
 # else
076f82
+	movzbl	(%rdi, %rcx), %eax
076f82
+	movzbl	(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
+# endif
076f82
+L(ret6):
076f82
+	VZEROUPPER_RETURN
076f82
+
076f82
+# ifndef USE_AS_STRNCMP
076f82
+	.p2align 4,, 10
076f82
+L(return_vec_1_end):
076f82
+	tzcntl	%ecx, %ecx
076f82
 #  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
+	movl	VEC_SIZE(%rdi, %rcx), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	VEC_SIZE(%rsi, %rdi), %ecx
076f82
-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
076f82
+	je	L(ret7)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	xorl	%r8d, %eax
076f82
 #  else
076f82
-	movzbl	VEC_SIZE(%rax, %rdi), %eax
076f82
-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
076f82
-	subl	%edx, %eax
076f82
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
076f82
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 #  endif
076f82
-# endif
076f82
+L(ret7):
076f82
 	VZEROUPPER_RETURN
076f82
+# endif
076f82
 
076f82
-	.p2align 4
076f82
-L(test_2_vec):
076f82
+	.p2align 4,, 10
076f82
+L(return_vec_2_end):
076f82
+	tzcntl	%ecx, %ecx
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* The first 2 vectors matched.  Return 0 if the maximum offset
076f82
-	   (%r11) <= 2 * VEC_SIZE.  */
076f82
-	cmpq	$(VEC_SIZE * 2), %r11
076f82
-	jbe	L(zero)
076f82
+	cmpq	%rcx, %rdx
076f82
+	jbe	L(ret_zero_page_cross)
076f82
 # endif
076f82
-	VPCMPEQ	%ymm7, %ymm5, %ymm5
076f82
-	vpmovmskb %ymm5, %ecx
076f82
-	testl	%ecx, %ecx
076f82
-	je	L(test_3_vec)
076f82
-	tzcntl	%ecx, %edi
076f82
-# ifdef USE_AS_STRNCMP
076f82
-	addq	$(VEC_SIZE * 2), %rdi
076f82
-	cmpq	%rdi, %r11
076f82
-	jbe	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rdi), %ecx
076f82
-	cmpl	(%rdx, %rdi), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rdi), %eax
076f82
-	movzbl	(%rdx, %rdi), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
076f82
+	je	L(ret11)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	xorl	%r8d, %eax
076f82
 # else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
076f82
-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
076f82
-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
076f82
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 # endif
076f82
+L(ret11):
076f82
 	VZEROUPPER_RETURN
076f82
 
076f82
-	.p2align 4
076f82
-L(test_3_vec):
076f82
+
076f82
+	/* Page cross in rsi in next 4x VEC.  */
076f82
+
076f82
+	/* TODO: Improve logic here.  */
076f82
+	.p2align 4,, 10
076f82
+L(page_cross_during_loop):
076f82
+	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
076f82
+
076f82
+	/* Optimistically rsi and rdi are both aligned, in which case we
076f82
+	   don't need any logic here.  */
076f82
+	cmpl	$-(VEC_SIZE * 4), %eax
076f82
+	/* Don't adjust eax before jumping back to loop and we will
076f82
+	   never hit page cross case again.  */
076f82
+	je	L(loop_skip_page_cross_check)
076f82
+
076f82
+	/* Check if we can safely load a VEC.  */
076f82
+	cmpl	$-(VEC_SIZE * 3), %eax
076f82
+	jle	L(less_1x_vec_till_page_cross)
076f82
+
076f82
+	VMOVA	(%rdi), %ymm0
076f82
+	VPCMPEQ	(%rsi), %ymm0, %ymm1
076f82
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
076f82
+	vpandn	%ymm1, %ymm2, %ymm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incl	%ecx
076f82
+	jnz	L(return_vec_0_end)
076f82
+
076f82
+	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
076f82
+	cmpl	$-(VEC_SIZE * 2), %eax
076f82
+	jg	L(more_2x_vec_till_page_cross)
076f82
+
076f82
+	.p2align 4,, 4
076f82
+L(less_1x_vec_till_page_cross):
076f82
+	subl	$-(VEC_SIZE * 4), %eax
076f82
+	/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
+	   concerning case is the first iteration if incoming s1 was near
+	   the start of a page and s2 near the end. If s1 was near the start
+	   of the page we already aligned up to the nearest VEC_SIZE * 4 so
+	   it is guaranteed safe to read back -VEC_SIZE. If rdi is truly at
+	   the start of a page here, it means the previous page
+	   (rdi - VEC_SIZE) has already been loaded earlier so must be valid.  */
076f82
+	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
076f82
+	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
076f82
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
076f82
+	vpandn	%ymm1, %ymm2, %ymm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+
076f82
+	/* Mask of potentially valid bits. The lower bits can come from
+	   out-of-range comparisons (but are safe regarding page crosses).  */
076f82
+	movl	$-1, %r10d
076f82
+	shlxl	%esi, %r10d, %r10d
076f82
+	notl	%ecx
076f82
+
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* The first 3 vectors matched.  Return 0 if the maximum offset
076f82
-	   (%r11) <= 3 * VEC_SIZE.  */
076f82
-	cmpq	$(VEC_SIZE * 3), %r11
076f82
-	jbe	L(zero)
076f82
-# endif
076f82
-	VPCMPEQ	%ymm7, %ymm6, %ymm6
076f82
-	vpmovmskb %ymm6, %esi
076f82
-	tzcntl	%esi, %ecx
076f82
+	cmpq	%rax, %rdx
076f82
+	jbe	L(return_page_cross_end_check)
076f82
+# endif
076f82
+	movl	%eax, %OFFSET_REG
076f82
+	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
076f82
+
076f82
+	andl	%r10d, %ecx
076f82
+	jz	L(loop_skip_page_cross_check)
076f82
+
076f82
+	.p2align 4,, 3
076f82
+L(return_page_cross_end):
076f82
+	tzcntl	%ecx, %ecx
076f82
+
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	addq	$(VEC_SIZE * 3), %rcx
076f82
-	cmpq	%rcx, %r11
076f82
-	jbe	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %esi
076f82
-	cmpl	(%rdx, %rcx), %esi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	leal	-VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
076f82
+L(return_page_cross_cmp_mem):
076f82
 # else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
+	addl	%OFFSET_REG, %ecx
076f82
+# endif
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
076f82
-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
076f82
-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
076f82
+	je	L(ret8)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	xorl	%r8d, %eax
076f82
+# else
076f82
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
076f82
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 # endif
076f82
+L(ret8):
076f82
 	VZEROUPPER_RETURN
076f82
 
076f82
-	.p2align 4
076f82
-L(loop_cross_page):
076f82
-	xorl	%r10d, %r10d
076f82
-	movq	%rdx, %rcx
076f82
-	/* Align load via RDX.  We load the extra ECX bytes which should
076f82
-	   be ignored.  */
076f82
-	andl	$((VEC_SIZE * 4) - 1), %ecx
076f82
-	/* R10 is -RCX.  */
076f82
-	subq	%rcx, %r10
076f82
-
076f82
-	/* This works only if VEC_SIZE * 2 == 64. */
076f82
-# if (VEC_SIZE * 2) != 64
076f82
-#  error (VEC_SIZE * 2) != 64
076f82
-# endif
076f82
-
076f82
-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
076f82
-	cmpl	$(VEC_SIZE * 2), %ecx
076f82
-	jge	L(loop_cross_page_2_vec)
076f82
-
076f82
-	vmovdqu	(%rax, %r10), %ymm2
076f82
-	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
076f82
-	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
076f82
-	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
076f82
-	VPMINU	%ymm2, %ymm0, %ymm0
076f82
-	VPMINU	%ymm3, %ymm1, %ymm1
076f82
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
076f82
-	VPCMPEQ	%ymm7, %ymm1, %ymm1
076f82
-
076f82
-	vpmovmskb %ymm0, %edi
076f82
-	vpmovmskb %ymm1, %esi
076f82
-
076f82
-	salq	$32, %rsi
076f82
-	xorq	%rsi, %rdi
076f82
-
076f82
-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
076f82
-	shrq	%cl, %rdi
076f82
-
076f82
-	testq	%rdi, %rdi
076f82
-	je	L(loop_cross_page_2_vec)
076f82
-	tzcntq	%rdi, %rcx
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	cmpq	%rcx, %r11
076f82
-	jbe	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
+	.p2align 4,, 10
076f82
+L(return_page_cross_end_check):
076f82
+	tzcntl	%ecx, %ecx
076f82
+	leal	-VEC_SIZE(%rax, %rcx), %ecx
076f82
+	cmpl	%ecx, %edx
076f82
+	ja	L(return_page_cross_cmp_mem)
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %edi
076f82
-	cmpl	(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
-# else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %edi
076f82
-	cmpl	(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
-# endif
076f82
 	VZEROUPPER_RETURN
076f82
+# endif
076f82
 
076f82
-	.p2align 4
076f82
-L(loop_cross_page_2_vec):
076f82
-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
076f82
-	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
076f82
-	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
076f82
-	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
076f82
-	VPMINU	%ymm2, %ymm5, %ymm5
076f82
-	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
076f82
-	VPCMPEQ	%ymm7, %ymm5, %ymm5
076f82
-	VPMINU	%ymm3, %ymm6, %ymm6
076f82
-	VPCMPEQ	%ymm7, %ymm6, %ymm6
076f82
-
076f82
-	vpmovmskb %ymm5, %edi
076f82
-	vpmovmskb %ymm6, %esi
076f82
-
076f82
-	salq	$32, %rsi
076f82
-	xorq	%rsi, %rdi
076f82
 
076f82
-	xorl	%r8d, %r8d
076f82
-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
076f82
-	subl	$(VEC_SIZE * 2), %ecx
076f82
-	jle	1f
076f82
-	/* Skip ECX bytes.  */
076f82
-	shrq	%cl, %rdi
076f82
-	/* R8 has number of bytes skipped.  */
076f82
-	movl	%ecx, %r8d
076f82
-1:
076f82
-	/* Before jumping back to the loop, set ESI to the number of
076f82
-	   VEC_SIZE * 4 blocks before page crossing.  */
076f82
-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
076f82
-
076f82
-	testq	%rdi, %rdi
076f82
+	.p2align 4,, 10
076f82
+L(more_2x_vec_till_page_cross):
076f82
+	/* If more than 2x vec till cross we will complete a full loop
076f82
+	   iteration here.  */
076f82
+
076f82
+	VMOVU	VEC_SIZE(%rdi), %ymm0
076f82
+	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
076f82
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
076f82
+	vpandn	%ymm1, %ymm2, %ymm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incl	%ecx
076f82
+	jnz	L(return_vec_1_end)
076f82
+
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* At this point, if %rdi value is 0, it already tested
076f82
-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
076f82
-	   checks whether strncmp maximum offset reached or not.  */
076f82
-	je	L(string_nbyte_offset_check)
076f82
-# else
076f82
-	je	L(back_to_loop)
076f82
+	cmpq	$(VEC_SIZE * 2), %rdx
076f82
+	jbe	L(ret_zero_in_loop_page_cross)
076f82
 # endif
076f82
-	tzcntq	%rdi, %rcx
076f82
-	addq	%r10, %rcx
076f82
-	/* Adjust for number of bytes skipped.  */
076f82
-	addq	%r8, %rcx
076f82
+
076f82
+	subl	$-(VEC_SIZE * 4), %eax
076f82
+
076f82
+	/* Safe to include comparisons from lower bytes.  */
076f82
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
076f82
+	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
076f82
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
076f82
+	vpandn	%ymm1, %ymm2, %ymm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incl	%ecx
076f82
+	jnz	L(return_vec_page_cross_0)
076f82
+
076f82
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
076f82
+	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
076f82
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
076f82
+	vpandn	%ymm1, %ymm2, %ymm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incl	%ecx
076f82
+	jnz	L(return_vec_page_cross_1)
076f82
+
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	addq	$(VEC_SIZE * 2), %rcx
076f82
-	subq	%rcx, %r11
076f82
-	jbe	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
+	/* Must check length here as length might preclude reading next
076f82
+	   page.  */
076f82
+	cmpq	%rax, %rdx
076f82
+	jbe	L(ret_zero_in_loop_page_cross)
076f82
+# endif
076f82
+
076f82
+	/* Finish the loop.  */
076f82
+	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
076f82
+	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
076f82
+
076f82
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
076f82
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
076f82
+	vpand	%ymm4, %ymm5, %ymm5
076f82
+	vpand	%ymm6, %ymm7, %ymm7
076f82
+	VPMINU	%ymm5, %ymm7, %ymm7
076f82
+	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
076f82
+	vpmovmskb %ymm7, %LOOP_REG
076f82
+	testl	%LOOP_REG, %LOOP_REG
076f82
+	jnz	L(return_vec_2_3_end)
076f82
+
076f82
+	/* Best for code size to include an unconditional jmp here. If this
+	   case is hot it would be faster to duplicate the
+	   L(return_vec_2_3_end) code as the fall-through and jump back to
+	   the loop on the mismatch comparison.  */
076f82
+	subq	$-(VEC_SIZE * 4), %rdi
076f82
+	subq	$-(VEC_SIZE * 4), %rsi
076f82
+	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	subq	$(VEC_SIZE * 4), %rdx
076f82
+	ja	L(loop_skip_page_cross_check)
076f82
+L(ret_zero_in_loop_page_cross):
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %edi
076f82
-	cmpl	(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	VZEROUPPER_RETURN
076f82
 # else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
076f82
-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
076f82
-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	jmp	L(loop_skip_page_cross_check)
076f82
 # endif
076f82
-	VZEROUPPER_RETURN
076f82
 
076f82
+
076f82
+	.p2align 4,, 10
076f82
+L(return_vec_page_cross_0):
076f82
+	addl	$-VEC_SIZE, %eax
076f82
+L(return_vec_page_cross_1):
076f82
+	tzcntl	%ecx, %ecx
076f82
 # ifdef USE_AS_STRNCMP
076f82
-L(string_nbyte_offset_check):
076f82
-	leaq	(VEC_SIZE * 4)(%r10), %r10
076f82
-	cmpq	%r10, %r11
076f82
-	jbe	L(zero)
076f82
-	jmp	L(back_to_loop)
076f82
+	leal	-VEC_SIZE(%rax, %rcx), %ecx
076f82
+	cmpq	%rcx, %rdx
076f82
+	jbe	L(ret_zero_in_loop_page_cross)
076f82
+# else
076f82
+	addl	%eax, %ecx
076f82
 # endif
076f82
 
076f82
-	.p2align 4
076f82
-L(cross_page_loop):
076f82
-	/* Check one byte/dword at a time.  */
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	cmpl	%ecx, %eax
076f82
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
076f82
+	xorl	%eax, %eax
076f82
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
076f82
+	je	L(ret9)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	xorl	%r8d, %eax
076f82
 # else
076f82
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
076f82
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
076f82
 	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 # endif
076f82
-	jne	L(different)
076f82
-	addl	$SIZE_OF_CHAR, %edx
076f82
-	cmpl	$(VEC_SIZE * 4), %edx
076f82
-	je	L(main_loop_header)
076f82
-# ifdef USE_AS_STRNCMP
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
+L(ret9):
076f82
+	VZEROUPPER_RETURN
076f82
+
076f82
+
076f82
+	.p2align 4,, 10
076f82
+L(page_cross):
076f82
+# ifndef USE_AS_STRNCMP
076f82
+	/* If both are VEC aligned we don't need any special logic here.
076f82
+	   Only valid for strcmp where stop condition is guaranteed to be
076f82
+	   reachable by just reading memory.  */
076f82
+	testl	$((VEC_SIZE - 1) << 20), %eax
076f82
+	jz	L(no_page_cross)
076f82
 # endif
076f82
+
076f82
+	movl	%edi, %eax
076f82
+	movl	%esi, %ecx
076f82
+	andl	$(PAGE_SIZE - 1), %eax
076f82
+	andl	$(PAGE_SIZE - 1), %ecx
076f82
+
076f82
+	xorl	%OFFSET_REG, %OFFSET_REG
076f82
+
076f82
+	/* Check which is closer to page cross, s1 or s2.  */
076f82
+	cmpl	%eax, %ecx
076f82
+	jg	L(page_cross_s2)
076f82
+
076f82
+	/* The previous page cross check has false positives. Check for
076f82
+	   true positive as page cross logic is very expensive.  */
076f82
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
076f82
+	jbe	L(no_page_cross)
076f82
+
076f82
+	/* Set r8 to not interfere with normal return value (rdi and rsi
076f82
+	   did not swap).  */
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	movl	(%rdi, %rdx), %eax
076f82
-	movl	(%rsi, %rdx), %ecx
076f82
+	/* Any non-zero positive value that doesn't interfere with 0x1.
076f82
+	 */
076f82
+	movl	$2, %r8d
076f82
 # else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %ecx
076f82
+	xorl	%r8d, %r8d
076f82
 # endif
076f82
-	/* Check null char.  */
076f82
-	testl	%eax, %eax
076f82
-	jne	L(cross_page_loop)
076f82
-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
076f82
-	   comparisons.  */
076f82
-	subl	%ecx, %eax
076f82
-# ifndef USE_AS_WCSCMP
076f82
-L(different):
076f82
+
076f82
+	/* Check if less than 1x VEC till page cross.  */
076f82
+	subl	$(VEC_SIZE * 3), %eax
076f82
+	jg	L(less_1x_vec_till_page)
076f82
+
076f82
+	/* If more than 1x VEC till page cross, loop through safely
076f82
+	   loadable memory until within 1x VEC of page cross.  */
076f82
+
076f82
+	.p2align 4,, 10
076f82
+L(page_cross_loop):
076f82
+
076f82
+	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
076f82
+	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
076f82
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
076f82
+	vpandn	%ymm1, %ymm2, %ymm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incl	%ecx
076f82
+
076f82
+	jnz	L(check_ret_vec_page_cross)
076f82
+	addl	$VEC_SIZE, %OFFSET_REG
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	cmpq	%OFFSET_REG64, %rdx
076f82
+	jbe	L(ret_zero_page_cross)
076f82
 # endif
076f82
-	VZEROUPPER_RETURN
076f82
+	addl	$VEC_SIZE, %eax
076f82
+	jl	L(page_cross_loop)
076f82
+
076f82
+	subl	%eax, %OFFSET_REG
076f82
+	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
+	   to not cross page so is safe to load. Since we have already
+	   loaded at least 1 VEC from rsi it is also guaranteed to be safe.
076f82
+	 */
076f82
+
076f82
+	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
076f82
+	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
076f82
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
076f82
+	vpandn	%ymm1, %ymm2, %ymm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	leal	VEC_SIZE(%OFFSET_REG64), %eax
076f82
+	cmpq	%rax, %rdx
076f82
+	jbe	L(check_ret_vec_page_cross2)
076f82
+	addq	%rdi, %rdx
076f82
+# endif
076f82
+	incl	%ecx
076f82
+	jz	L(prepare_loop_no_len)
076f82
 
076f82
+	.p2align 4,, 4
076f82
+L(ret_vec_page_cross):
076f82
+# ifndef USE_AS_STRNCMP
076f82
+L(check_ret_vec_page_cross):
076f82
+# endif
076f82
+	tzcntl	%ecx, %ecx
076f82
+	addl	%OFFSET_REG, %ecx
076f82
+L(ret_vec_page_cross_cont):
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	.p2align 4
076f82
-L(different):
076f82
-	/* Use movl to avoid modifying EFLAGS.  */
076f82
-	movl	$0, %eax
076f82
+	movl	(%rdi, %rcx), %edx
076f82
+	xorl	%eax, %eax
076f82
+	cmpl	(%rsi, %rcx), %edx
076f82
+	je	L(ret12)
076f82
 	setl	%al
076f82
 	negl	%eax
076f82
-	orl	$1, %eax
076f82
-	VZEROUPPER_RETURN
076f82
+	xorl	%r8d, %eax
076f82
+# else
076f82
+	movzbl	(%rdi, %rcx), %eax
076f82
+	movzbl	(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 # endif
076f82
+L(ret12):
076f82
+	VZEROUPPER_RETURN
076f82
 
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	.p2align 4
076f82
-L(zero):
076f82
+	.p2align 4,, 10
076f82
+L(check_ret_vec_page_cross2):
076f82
+	incl	%ecx
076f82
+L(check_ret_vec_page_cross):
076f82
+	tzcntl	%ecx, %ecx
076f82
+	addl	%OFFSET_REG, %ecx
076f82
+	cmpq	%rcx, %rdx
076f82
+	ja	L(ret_vec_page_cross_cont)
076f82
+	.p2align 4,, 2
076f82
+L(ret_zero_page_cross):
076f82
 	xorl	%eax, %eax
076f82
 	VZEROUPPER_RETURN
076f82
+# endif
076f82
 
076f82
-	.p2align 4
076f82
-L(char0):
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	xorl	%eax, %eax
076f82
-	movl	(%rdi), %ecx
076f82
-	cmpl	(%rsi), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rsi), %ecx
076f82
-	movzbl	(%rdi), %eax
076f82
-	subl	%ecx, %eax
076f82
-#  endif
076f82
-	VZEROUPPER_RETURN
076f82
+	.p2align 4,, 4
076f82
+L(page_cross_s2):
076f82
+	/* Ensure this is a true page cross.  */
076f82
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
076f82
+	jbe	L(no_page_cross)
076f82
+
076f82
+
076f82
+	movl	%ecx, %eax
076f82
+	movq	%rdi, %rcx
076f82
+	movq	%rsi, %rdi
076f82
+	movq	%rcx, %rsi
076f82
+
076f82
+	/* set r8 to negate return value as rdi and rsi swapped.  */
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	movl	$-4, %r8d
076f82
+# else
076f82
+	movl	$-1, %r8d
076f82
 # endif
076f82
+	xorl	%OFFSET_REG, %OFFSET_REG
076f82
 
076f82
-	.p2align 4
076f82
-L(last_vector):
076f82
-	addq	%rdx, %rdi
076f82
-	addq	%rdx, %rsi
076f82
+	/* Check if more than 1x VEC till page cross.  */
076f82
+	subl	$(VEC_SIZE * 3), %eax
076f82
+	jle	L(page_cross_loop)
076f82
+
076f82
+	.p2align 4,, 6
076f82
+L(less_1x_vec_till_page):
076f82
+	/* Find largest load size we can use.  */
076f82
+	cmpl	$16, %eax
076f82
+	ja	L(less_16_till_page)
076f82
+
076f82
+	VMOVU	(%rdi), %xmm0
076f82
+	VPCMPEQ	(%rsi), %xmm0, %xmm1
076f82
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
076f82
+	vpandn	%xmm1, %xmm2, %xmm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incw	%cx
076f82
+	jnz	L(check_ret_vec_page_cross)
076f82
+	movl	$16, %OFFSET_REG
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	subq	%rdx, %r11
076f82
+	cmpq	%OFFSET_REG64, %rdx
076f82
+	jbe	L(ret_zero_page_cross_slow_case0)
076f82
+	subl	%eax, %OFFSET_REG
076f82
+# else
076f82
+	/* Explicit check for 16 byte alignment.  */
076f82
+	subl	%eax, %OFFSET_REG
076f82
+	jz	L(prepare_loop)
076f82
 # endif
076f82
-	tzcntl	%ecx, %edx
076f82
+
076f82
+	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
076f82
+	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
076f82
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
076f82
+	vpandn	%xmm1, %xmm2, %xmm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incw	%cx
076f82
+	jnz	L(check_ret_vec_page_cross)
076f82
+
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
+	addl	$16, %OFFSET_REG
076f82
+	subq	%OFFSET_REG64, %rdx
076f82
+	jbe	L(ret_zero_page_cross_slow_case0)
076f82
+	subq	$-(VEC_SIZE * 4), %rdx
076f82
+
076f82
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
076f82
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
076f82
+# else
076f82
+	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
076f82
+	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
076f82
 # endif
076f82
-# ifdef USE_AS_WCSCMP
076f82
+	jmp	L(prepare_loop_aligned)
076f82
+
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	.p2align 4,, 2
076f82
+L(ret_zero_page_cross_slow_case0):
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rdi, %rdx), %ecx
076f82
-	cmpl	(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-# else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
+	ret
076f82
 # endif
076f82
-	VZEROUPPER_RETURN
076f82
 
076f82
-	/* Comparing on page boundary region requires special treatment:
076f82
-	   It must done one vector at the time, starting with the wider
076f82
-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
076f82
-	   (xmm) still passes the boundary, byte comparison must be done.
076f82
-	 */
076f82
-	.p2align 4
076f82
-L(cross_page):
076f82
-	/* Try one ymm vector at a time.  */
076f82
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
076f82
-	jg	L(cross_page_1_vector)
076f82
-L(loop_1_vector):
076f82
-	vmovdqu	(%rdi, %rdx), %ymm1
076f82
-	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
076f82
-	VPMINU	%ymm1, %ymm0, %ymm0
076f82
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
076f82
-	vpmovmskb %ymm0, %ecx
076f82
-	testl	%ecx, %ecx
076f82
-	jne	L(last_vector)
076f82
 
076f82
-	addl	$VEC_SIZE, %edx
076f82
+	.p2align 4,, 10
076f82
+L(less_16_till_page):
076f82
+	/* Find largest load size we can use.  */
076f82
+	cmpl	$24, %eax
076f82
+	ja	L(less_8_till_page)
076f82
 
076f82
-	addl	$VEC_SIZE, %eax
076f82
-# ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
076f82
-	   (%r11).  */
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
-# endif
076f82
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
076f82
-	jle	L(loop_1_vector)
076f82
-L(cross_page_1_vector):
076f82
-	/* Less than 32 bytes to check, try one xmm vector.  */
076f82
-	cmpl	$(PAGE_SIZE - 16), %eax
076f82
-	jg	L(cross_page_1_xmm)
076f82
-	vmovdqu	(%rdi, %rdx), %xmm1
076f82
-	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
076f82
-	VPMINU	%xmm1, %xmm0, %xmm0
076f82
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
076f82
-	vpmovmskb %xmm0, %ecx
076f82
-	testl	%ecx, %ecx
076f82
-	jne	L(last_vector)
076f82
+	vmovq	(%rdi), %xmm0
076f82
+	vmovq	(%rsi), %xmm1
076f82
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
076f82
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
076f82
+	vpandn	%xmm1, %xmm2, %xmm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incb	%cl
076f82
+	jnz	L(check_ret_vec_page_cross)
076f82
 
076f82
-	addl	$16, %edx
076f82
-# ifndef USE_AS_WCSCMP
076f82
-	addl	$16, %eax
076f82
+
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	cmpq	$8, %rdx
076f82
+	jbe	L(ret_zero_page_cross_slow_case0)
076f82
 # endif
076f82
+	movl	$24, %OFFSET_REG
076f82
+	/* Explicit check for 16 byte alignment.  */
076f82
+	subl	%eax, %OFFSET_REG
076f82
+
076f82
+
076f82
+
076f82
+	vmovq	(%rdi, %OFFSET_REG64), %xmm0
076f82
+	vmovq	(%rsi, %OFFSET_REG64), %xmm1
076f82
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
076f82
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
076f82
+	vpandn	%xmm1, %xmm2, %xmm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	incb	%cl
076f82
+	jnz	L(check_ret_vec_page_cross)
076f82
+
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
076f82
-	   (%r11).  */
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
-# endif
076f82
-
076f82
-L(cross_page_1_xmm):
076f82
-# ifndef USE_AS_WCSCMP
076f82
-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
076f82
-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
076f82
-	cmpl	$(PAGE_SIZE - 8), %eax
076f82
-	jg	L(cross_page_8bytes)
076f82
-	vmovq	(%rdi, %rdx), %xmm1
076f82
-	vmovq	(%rsi, %rdx), %xmm0
076f82
-	VPCMPEQ	%xmm0, %xmm1, %xmm0
076f82
-	VPMINU	%xmm1, %xmm0, %xmm0
076f82
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
076f82
-	vpmovmskb %xmm0, %ecx
076f82
-	/* Only last 8 bits are valid.  */
076f82
-	andl	$0xff, %ecx
076f82
-	testl	%ecx, %ecx
076f82
-	jne	L(last_vector)
076f82
+	addl	$8, %OFFSET_REG
076f82
+	subq	%OFFSET_REG64, %rdx
076f82
+	jbe	L(ret_zero_page_cross_slow_case0)
076f82
+	subq	$-(VEC_SIZE * 4), %rdx
076f82
 
076f82
-	addl	$8, %edx
076f82
-	addl	$8, %eax
076f82
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
076f82
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
076f82
+# else
076f82
+	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
076f82
+	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
076f82
+# endif
076f82
+	jmp	L(prepare_loop_aligned)
076f82
+
076f82
+
076f82
+	.p2align 4,, 10
076f82
+L(less_8_till_page):
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	/* If using wchar then this is the only check before we reach
076f82
+	   the page boundary.  */
076f82
+	movl	(%rdi), %eax
076f82
+	movl	(%rsi), %ecx
076f82
+	cmpl	%ecx, %eax
076f82
+	jnz	L(ret_less_8_wcs)
076f82
 #  ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
076f82
-	   (%r11).  */
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
+	addq	%rdi, %rdx
076f82
+	/* We already checked for len <= 1 so cannot hit that case here.
076f82
+	 */
076f82
 #  endif
076f82
+	testl	%eax, %eax
076f82
+	jnz	L(prepare_loop_no_len)
076f82
+	ret
076f82
 
076f82
-L(cross_page_8bytes):
076f82
-	/* Less than 8 bytes to check, try 4 byte vector.  */
076f82
-	cmpl	$(PAGE_SIZE - 4), %eax
076f82
-	jg	L(cross_page_4bytes)
076f82
-	vmovd	(%rdi, %rdx), %xmm1
076f82
-	vmovd	(%rsi, %rdx), %xmm0
076f82
-	VPCMPEQ	%xmm0, %xmm1, %xmm0
076f82
-	VPMINU	%xmm1, %xmm0, %xmm0
076f82
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
076f82
-	vpmovmskb %xmm0, %ecx
076f82
-	/* Only last 4 bits are valid.  */
076f82
-	andl	$0xf, %ecx
076f82
-	testl	%ecx, %ecx
076f82
-	jne	L(last_vector)
076f82
+	.p2align 4,, 8
076f82
+L(ret_less_8_wcs):
076f82
+	setl	%OFFSET_REG8
076f82
+	negl	%OFFSET_REG
076f82
+	movl	%OFFSET_REG, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	ret
076f82
+
076f82
+# else
076f82
+
076f82
+	/* Find largest load size we can use.  */
076f82
+	cmpl	$28, %eax
076f82
+	ja	L(less_4_till_page)
076f82
+
076f82
+	vmovd	(%rdi), %xmm0
076f82
+	vmovd	(%rsi), %xmm1
076f82
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
076f82
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
076f82
+	vpandn	%xmm1, %xmm2, %xmm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	subl	$0xf, %ecx
076f82
+	jnz	L(check_ret_vec_page_cross)
076f82
 
076f82
-	addl	$4, %edx
076f82
 #  ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
076f82
-	   (%r11).  */
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
+	cmpq	$4, %rdx
076f82
+	jbe	L(ret_zero_page_cross_slow_case1)
076f82
 #  endif
076f82
+	movl	$28, %OFFSET_REG
076f82
+	/* Explicit check for 16 byte alignment.  */
076f82
+	subl	%eax, %OFFSET_REG
076f82
 
076f82
-L(cross_page_4bytes):
076f82
-# endif
076f82
-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
076f82
-# ifdef USE_AS_STRNCMP
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
-# endif
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	movl	(%rdi, %rdx), %eax
076f82
-	movl	(%rsi, %rdx), %ecx
076f82
-# else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %ecx
076f82
-# endif
076f82
-	testl	%eax, %eax
076f82
-	jne	L(cross_page_loop)
076f82
+
076f82
+
076f82
+	vmovd	(%rdi, %OFFSET_REG64), %xmm0
076f82
+	vmovd	(%rsi, %OFFSET_REG64), %xmm1
076f82
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
076f82
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
076f82
+	vpandn	%xmm1, %xmm2, %xmm1
076f82
+	vpmovmskb %ymm1, %ecx
076f82
+	subl	$0xf, %ecx
076f82
+	jnz	L(check_ret_vec_page_cross)
076f82
+
076f82
+#  ifdef USE_AS_STRNCMP
076f82
+	addl	$4, %OFFSET_REG
076f82
+	subq	%OFFSET_REG64, %rdx
076f82
+	jbe	L(ret_zero_page_cross_slow_case1)
076f82
+	subq	$-(VEC_SIZE * 4), %rdx
076f82
+
076f82
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
076f82
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
076f82
+#  else
076f82
+	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
076f82
+	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
076f82
+#  endif
076f82
+	jmp	L(prepare_loop_aligned)
076f82
+
076f82
+#  ifdef USE_AS_STRNCMP
076f82
+	.p2align 4,, 2
076f82
+L(ret_zero_page_cross_slow_case1):
076f82
+	xorl	%eax, %eax
076f82
+	ret
076f82
+#  endif
076f82
+
076f82
+	.p2align 4,, 10
076f82
+L(less_4_till_page):
076f82
+	subq	%rdi, %rsi
076f82
+	/* Extremely slow byte comparison loop.  */
076f82
+L(less_4_loop):
076f82
+	movzbl	(%rdi), %eax
076f82
+	movzbl	(%rsi, %rdi), %ecx
076f82
 	subl	%ecx, %eax
076f82
-	VZEROUPPER_RETURN
076f82
-END (STRCMP)
076f82
+	jnz	L(ret_less_4_loop)
076f82
+	testl	%ecx, %ecx
076f82
+	jz	L(ret_zero_4_loop)
076f82
+#  ifdef USE_AS_STRNCMP
076f82
+	decq	%rdx
076f82
+	jz	L(ret_zero_4_loop)
076f82
+#  endif
076f82
+	incq	%rdi
076f82
+	/* End condition is reaching the page boundary (rdi is aligned).  */
076f82
+	testl	$31, %edi
076f82
+	jnz	L(less_4_loop)
076f82
+	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
076f82
+	addq	$-(VEC_SIZE * 4), %rdi
076f82
+#  ifdef USE_AS_STRNCMP
076f82
+	subq	$-(VEC_SIZE * 4), %rdx
076f82
+#  endif
076f82
+	jmp	L(prepare_loop_aligned)
076f82
+
076f82
+L(ret_zero_4_loop):
076f82
+	xorl	%eax, %eax
076f82
+	ret
076f82
+L(ret_less_4_loop):
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
+	ret
076f82
+# endif
076f82
+END(STRCMP)
076f82
 #endif
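
A closing note on the page cross discussion in the commit message: the
new entry check at the top of STRCMP ORs the two pointers together and
shifts the result left by 20 so that only the page-offset bits take part
in the compare against (PAGE_SIZE - VEC_SIZE * 4) << 20.  The condition
it computes reduces to the sketch below (illustrative C, not part of the
commit).  Because of the OR it is deliberately conservative; false
positives are filtered again inside L(page_cross).

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* Take the slow page-cross path if either string (conservatively, the
   OR of their page offsets) sits within 4 * VEC_SIZE bytes of the end
   of a page, where an unguarded 4x-vector load could fault.  */
static bool
may_cross_page (const char *s1, const char *s2)
{
  uint32_t off = (uint32_t) (((uintptr_t) s1 | (uintptr_t) s2)
                             & (PAGE_SIZE - 1));
  return off > PAGE_SIZE - VEC_SIZE * 4;
}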