commit c41a66767d23b7f219fb943be6fab5ddf822d7da
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Jan 10 15:35:39 2022 -0600

    x86: Optimize strcmp-evex.S
    
    Optimizations are primarily to the loop logic and to how the page cross
    logic interacts with the loop.
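
    As a minimal illustration (not part of the patch), the xor/or
    reduction that the new loop folds into vpternlogd can be sketched in
    C. The imm8 values 0xde and 0xfe below are the ones the new loop
    uses; everything else is just an emulation of one bit-lane:

        #include <assert.h>

        /* One bit-lane of vpternlogd: the result bit is bit number
           ((a << 2) | (b << 1) | c) of the immediate, where a is the
           destination operand, b the second source and c the third.  */
        static unsigned ternlog1 (unsigned a, unsigned b, unsigned c,
                                  unsigned imm8)
        {
          return (imm8 >> ((a << 2) | (b << 1) | c)) & 1;
        }

        int main (void)
        {
          for (unsigned a = 0; a < 2; a++)
            for (unsigned b = 0; b < 2; b++)
              for (unsigned c = 0; c < 2; c++)
                {
                  /* 0xde: dest = b | (a ^ c), one xor+or pair folded
                     (in the loop: YMM1 | (YMM6 ^ mem)).  */
                  assert (ternlog1 (a, b, c, 0xde) == (b | (a ^ c)));
                  /* 0xfe: dest = a | b | c, a three-way or.  */
                  assert (ternlog1 (a, b, c, 0xfe) == (a | b | c));
                }
          return 0;
        }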
    
    The page cross logic is at times more expensive for short strings near
    the end of a page but not crossing the page. This is done to retest
    the page cross conditions with a non-faulting check and to improve the
    logic for entering the loop afterwards. This only affects particular
    cases, however, and is generally made up for by more than 10x
    improvements on the transition from the page cross -> loop case.
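
    A minimal C sketch of the entry page-cross filter (illustrative only;
    the PAGE_SIZE and VEC_SIZE values match the ones in the file). It may
    report false positives, which the slow path then filters out with the
    non-faulting re-check mentioned above:

        #include <stdbool.h>
        #include <stdint.h>

        #define PAGE_SIZE 4096
        #define VEC_SIZE  32

        /* True when s1 or s2 *may* cross a page within the next
           4 * VEC_SIZE bytes.  OR-ing the page offsets gives a value that
           is >= either offset, so a real crossing is never missed, but
           e.g. offsets 0x800 and 0x7ff OR to 0xfff and take the slow path
           even though neither string is near the end of its page.  */
        static bool may_cross_page (const void *s1, const void *s2)
        {
          uint32_t combined = ((uintptr_t) s1 | (uintptr_t) s2)
                              & (PAGE_SIZE - 1);
          return combined > PAGE_SIZE - VEC_SIZE * 4;
        }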
    
    The non-page cross cases are also nearly universally improved.
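
    A minimal C sketch of the TESTEQ trick used throughout the new code
    (illustrative only, assuming GCC/Clang builtins): each k-mask bit is
    set when the corresponding character is non-null and equal, so adding
    1 (strcmp/strncmp) or subtracting 0xff (wcscmp/wcsncmp) maps the
    all-equal mask to zero and leaves the first mismatch/null position as
    the lowest set bit:

        #include <stdint.h>

        /* kmask: bit i set when char i is non-null and equal in both
           strings (32 bits for byte chars).  Returns -1 when the whole
           vector matched, otherwise the index of the first mismatch or
           null terminator.  */
        static int first_diff_index (uint32_t kmask)
        {
          uint32_t m = kmask + 1;     /* TESTEQ for byte chars (`incl`).  */
          if (m == 0)
            return -1;                /* all 32 chars equal and non-null.  */
          return __builtin_ctz (m);   /* like the tzcnt in the asm.  */
        }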
    
    test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
    
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
    (cherry picked from commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9)
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
076f82
index 6f5c4bf984da2b80..99d8409af27327ad 100644
076f82
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
076f82
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
076f82
@@ -26,54 +26,69 @@
076f82
 
076f82
 # define PAGE_SIZE	4096
076f82
 
076f82
-/* VEC_SIZE = Number of bytes in a ymm register */
076f82
+	/* VEC_SIZE = Number of bytes in a ymm register.  */
076f82
 # define VEC_SIZE	32
076f82
+# define CHAR_PER_VEC	(VEC_SIZE	/	SIZE_OF_CHAR)
076f82
 
076f82
-/* Shift for dividing by (VEC_SIZE * 4).  */
076f82
-# define DIVIDE_BY_VEC_4_SHIFT	7
076f82
-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
076f82
-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
076f82
-# endif
076f82
-
076f82
-# define VMOVU		vmovdqu64
076f82
-# define VMOVA		vmovdqa64
076f82
+# define VMOVU	vmovdqu64
076f82
+# define VMOVA	vmovdqa64
076f82
 
076f82
 # ifdef USE_AS_WCSCMP
076f82
-/* Compare packed dwords.  */
076f82
-#  define VPCMP		vpcmpd
076f82
+#  define TESTEQ	subl	$0xff,
076f82
+	/* Compare packed dwords.  */
076f82
+#  define VPCMP	vpcmpd
076f82
 #  define VPMINU	vpminud
076f82
 #  define VPTESTM	vptestmd
076f82
-#  define SHIFT_REG32	r8d
076f82
-#  define SHIFT_REG64	r8
076f82
-/* 1 dword char == 4 bytes.  */
076f82
+	/* 1 dword char == 4 bytes.  */
076f82
 #  define SIZE_OF_CHAR	4
076f82
 # else
076f82
-/* Compare packed bytes.  */
076f82
-#  define VPCMP		vpcmpb
076f82
+#  define TESTEQ	incl
076f82
+	/* Compare packed bytes.  */
076f82
+#  define VPCMP	vpcmpb
076f82
 #  define VPMINU	vpminub
076f82
 #  define VPTESTM	vptestmb
076f82
-#  define SHIFT_REG32	ecx
076f82
-#  define SHIFT_REG64	rcx
076f82
-/* 1 byte char == 1 byte.  */
076f82
+	/* 1 byte char == 1 byte.  */
076f82
 #  define SIZE_OF_CHAR	1
076f82
 # endif
076f82
 
076f82
+# ifdef USE_AS_STRNCMP
076f82
+#  define LOOP_REG	r9d
076f82
+#  define LOOP_REG64	r9
076f82
+
076f82
+#  define OFFSET_REG8	r9b
076f82
+#  define OFFSET_REG	r9d
076f82
+#  define OFFSET_REG64	r9
076f82
+# else
076f82
+#  define LOOP_REG	edx
076f82
+#  define LOOP_REG64	rdx
076f82
+
076f82
+#  define OFFSET_REG8	dl
076f82
+#  define OFFSET_REG	edx
076f82
+#  define OFFSET_REG64	rdx
076f82
+# endif
076f82
+
076f82
+# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
076f82
+#  define VEC_OFFSET	0
076f82
+# else
076f82
+#  define VEC_OFFSET	(-VEC_SIZE)
076f82
+# endif
076f82
+
076f82
 # define XMMZERO	xmm16
076f82
-# define XMM0		xmm17
076f82
-# define XMM1		xmm18
076f82
+# define XMM0	xmm17
076f82
+# define XMM1	xmm18
076f82
 
076f82
 # define YMMZERO	ymm16
076f82
-# define YMM0		ymm17
076f82
-# define YMM1		ymm18
076f82
-# define YMM2		ymm19
076f82
-# define YMM3		ymm20
076f82
-# define YMM4		ymm21
076f82
-# define YMM5		ymm22
076f82
-# define YMM6		ymm23
076f82
-# define YMM7		ymm24
076f82
-# define YMM8		ymm25
076f82
-# define YMM9		ymm26
076f82
-# define YMM10		ymm27
076f82
+# define YMM0	ymm17
076f82
+# define YMM1	ymm18
076f82
+# define YMM2	ymm19
076f82
+# define YMM3	ymm20
076f82
+# define YMM4	ymm21
076f82
+# define YMM5	ymm22
076f82
+# define YMM6	ymm23
076f82
+# define YMM7	ymm24
076f82
+# define YMM8	ymm25
076f82
+# define YMM9	ymm26
076f82
+# define YMM10	ymm27
076f82
 
076f82
 /* Warning!
076f82
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
076f82
@@ -96,985 +111,1096 @@
076f82
    the maximum offset is reached before a difference is found, zero is
076f82
    returned.  */
076f82
 
076f82
-	.section .text.evex,"ax",@progbits
076f82
-ENTRY (STRCMP)
076f82
+	.section .text.evex, "ax", @progbits
076f82
+ENTRY(STRCMP)
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Check for simple cases (0 or 1) in offset.  */
076f82
-	cmp	$1, %RDX_LP
076f82
-	je	L(char0)
076f82
-	jb	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-#  ifndef __ILP32__
076f82
-	movq	%rdx, %rcx
076f82
-	/* Check if length could overflow when multiplied by
076f82
-	   sizeof(wchar_t). Checking top 8 bits will cover all potential
076f82
-	   overflow cases as well as redirect cases where its impossible to
076f82
-	   length to bound a valid memory region. In these cases just use
076f82
-	   'wcscmp'.  */
076f82
-	shrq	$56, %rcx
076f82
-	jnz	__wcscmp_evex
076f82
-#  endif
076f82
-	/* Convert units: from wide to byte char.  */
076f82
-	shl	$2, %RDX_LP
076f82
+#  ifdef __ILP32__
076f82
+	/* Clear the upper 32 bits.  */
076f82
+	movl	%edx, %edx
076f82
 #  endif
076f82
-	/* Register %r11 tracks the maximum offset.  */
076f82
-	mov	%RDX_LP, %R11_LP
076f82
+	cmp	$1, %RDX_LP
076f82
+	/* Signed comparison intentional. We use this branch to also
076f82
+	   test cases where length >= 2^63. These very large sizes can be
076f82
+	   handled with strcmp as there is no way for that length to
076f82
+	   actually bound the buffer.  */
076f82
+	jle	L(one_or_less)
076f82
 # endif
076f82
 	movl	%edi, %eax
076f82
-	xorl	%edx, %edx
076f82
-	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
076f82
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
076f82
 	orl	%esi, %eax
076f82
-	andl	$(PAGE_SIZE - 1), %eax
076f82
-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
076f82
-	jg	L(cross_page)
076f82
-	/* Start comparing 4 vectors.  */
076f82
+	/* Shift out the bits irrelevant to page boundary ([63:12]).  */
076f82
+	sall	$20, %eax
076f82
+	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
076f82
+	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
076f82
+	ja	L(page_cross)
076f82
+
076f82
+L(no_page_cross):
076f82
+	/* Safe to compare 4x vectors.  */
076f82
 	VMOVU	(%rdi), %YMM0
076f82
-
076f82
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
076f82
 	VPTESTM	%YMM0, %YMM0, %k2
076f82
-
076f82
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
076f82
 	   in YMM0 and 32 bytes at (%rsi).  */
076f82
 	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
076f82
-
076f82
 	kmovd	%k1, %ecx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	subl	$0xff, %ecx
076f82
-# else
076f82
-	incl	%ecx
076f82
-# endif
076f82
-	je	L(next_3_vectors)
076f82
-	tzcntl	%ecx, %edx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
076f82
-	sall	$2, %edx
076f82
-# endif
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the mismatched index (%rdx) is after the maximum
076f82
-	   offset (%r11).   */
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
+	cmpq	$CHAR_PER_VEC, %rdx
076f82
+	jbe	L(vec_0_test_len)
076f82
 # endif
076f82
+
076f82
+	/* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
076f82
+	   wcscmp/wcsncmp.  */
076f82
+
076f82
+	/* All 1s represents all equals. TESTEQ will overflow to zero in
076f82
+	   all equals case. Otherwise 1s will carry until position of first
076f82
+	   mismatch.  */
076f82
+	TESTEQ	%ecx
076f82
+	jz	L(more_3x_vec)
076f82
+
076f82
+	.p2align 4,, 4
076f82
+L(return_vec_0):
076f82
+	tzcntl	%ecx, %ecx
076f82
 # ifdef USE_AS_WCSCMP
076f82
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rdi, %rdx), %ecx
076f82
-	cmpl	(%rsi, %rdx), %ecx
076f82
-	je	L(return)
076f82
-L(wcscmp_return):
076f82
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
076f82
+	je	L(ret0)
076f82
 	setl	%al
076f82
 	negl	%eax
076f82
 	orl	$1, %eax
076f82
-L(return):
076f82
 # else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
+	movzbl	(%rdi, %rcx), %eax
076f82
+	movzbl	(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
 # endif
076f82
+L(ret0):
076f82
 	ret
076f82
 
076f82
-L(return_vec_size):
076f82
-	tzcntl	%ecx, %edx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
076f82
-	sall	$2, %edx
076f82
-# endif
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
076f82
-	   the maximum offset (%r11).  */
076f82
-	addq	$VEC_SIZE, %rdx
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
+	.p2align 4,, 4
076f82
+L(vec_0_test_len):
076f82
+	notl	%ecx
076f82
+	bzhil	%edx, %ecx, %eax
076f82
+	jnz	L(return_vec_0)
076f82
+	/* Align if will cross fetch block.  */
076f82
+	.p2align 4,, 2
076f82
+L(ret_zero):
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rdi, %rdx), %ecx
076f82
-	cmpl	(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
-# else
076f82
+	ret
076f82
+
076f82
+	.p2align 4,, 5
076f82
+L(one_or_less):
076f82
+	jb	L(ret_zero)
076f82
 #  ifdef USE_AS_WCSCMP
076f82
+	/* 'nbe' covers the case where length is negative (large
076f82
+	   unsigned).  */
076f82
+	jnbe	__wcscmp_evex
076f82
+	movl	(%rdi), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	VEC_SIZE(%rdi, %rdx), %ecx
076f82
-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
+	cmpl	(%rsi), %edx
076f82
+	je	L(ret1)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	orl	$1, %eax
076f82
 #  else
076f82
-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
076f82
-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
+	/* 'nbe' covers the case where length is negative (large
076f82
+	   unsigned).  */
076f82
+	jnbe	__strcmp_evex
076f82
+	movzbl	(%rdi), %eax
076f82
+	movzbl	(%rsi), %ecx
076f82
+	subl	%ecx, %eax
076f82
 #  endif
076f82
-# endif
076f82
+L(ret1):
076f82
 	ret
076f82
+# endif
076f82
 
076f82
-L(return_2_vec_size):
076f82
-	tzcntl	%ecx, %edx
076f82
+	.p2align 4,, 10
076f82
+L(return_vec_1):
076f82
+	tzcntl	%ecx, %ecx
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	/* rdx must be > CHAR_PER_VEC so its safe to subtract without
076f82
+	   worrying about underflow.  */
076f82
+	addq	$-CHAR_PER_VEC, %rdx
076f82
+	cmpq	%rcx, %rdx
076f82
+	jbe	L(ret_zero)
076f82
+# endif
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
076f82
-	sall	$2, %edx
076f82
+	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
076f82
+	xorl	%eax, %eax
076f82
+	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
076f82
+	je	L(ret2)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	orl	$1, %eax
076f82
+# else
076f82
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
076f82
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
 # endif
076f82
+L(ret2):
076f82
+	ret
076f82
+
076f82
+	.p2align 4,, 10
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
076f82
-	   after the maximum offset (%r11).  */
076f82
-	addq	$(VEC_SIZE * 2), %rdx
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	xorl	%eax, %eax
076f82
-	movl	(%rdi, %rdx), %ecx
076f82
-	cmpl	(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
+L(return_vec_3):
076f82
+#  if CHAR_PER_VEC <= 16
076f82
+	sall	$CHAR_PER_VEC, %ecx
076f82
 #  else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
+	salq	$CHAR_PER_VEC, %rcx
076f82
 #  endif
076f82
+# endif
076f82
+L(return_vec_2):
076f82
+# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
076f82
+	tzcntl	%ecx, %ecx
076f82
 # else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	xorl	%eax, %eax
076f82
-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
076f82
-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
076f82
-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	tzcntq	%rcx, %rcx
076f82
 # endif
076f82
-	ret
076f82
 
076f82
-L(return_3_vec_size):
076f82
-	tzcntl	%ecx, %edx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
076f82
-	sall	$2, %edx
076f82
-# endif
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
076f82
-	   after the maximum offset (%r11).  */
076f82
-	addq	$(VEC_SIZE * 3), %rdx
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
+	cmpq	%rcx, %rdx
076f82
+	jbe	L(ret_zero)
076f82
+# endif
076f82
+
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rdi, %rdx), %ecx
076f82
-	cmpl	(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
076f82
+	je	L(ret3)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	orl	$1, %eax
076f82
 # else
076f82
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
076f82
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
+# endif
076f82
+L(ret3):
076f82
+	ret
076f82
+
076f82
+# ifndef USE_AS_STRNCMP
076f82
+	.p2align 4,, 10
076f82
+L(return_vec_3):
076f82
+	tzcntl	%ecx, %ecx
076f82
 #  ifdef USE_AS_WCSCMP
076f82
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
076f82
-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
076f82
+	je	L(ret4)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	orl	$1, %eax
076f82
 #  else
076f82
-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
076f82
-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
076f82
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
 #  endif
076f82
-# endif
076f82
+L(ret4):
076f82
 	ret
076f82
+# endif
076f82
 
076f82
-	.p2align 4
076f82
-L(next_3_vectors):
076f82
-	VMOVU	VEC_SIZE(%rdi), %YMM0
076f82
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
076f82
+	/* 32 byte align here ensures the main loop is ideally aligned
076f82
+	   for DSB.  */
076f82
+	.p2align 5
076f82
+L(more_3x_vec):
076f82
+	/* Safe to compare 4x vectors.  */
076f82
+	VMOVU	(VEC_SIZE)(%rdi), %YMM0
076f82
 	VPTESTM	%YMM0, %YMM0, %k2
076f82
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
076f82
-	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
076f82
-	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
076f82
+	VPCMP	$0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
076f82
 	kmovd	%k1, %ecx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	subl	$0xff, %ecx
076f82
-# else
076f82
-	incl	%ecx
076f82
+	TESTEQ	%ecx
076f82
+	jnz	L(return_vec_1)
076f82
+
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	subq	$(CHAR_PER_VEC * 2), %rdx
076f82
+	jbe	L(ret_zero)
076f82
 # endif
076f82
-	jne	L(return_vec_size)
076f82
 
076f82
 	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
076f82
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
076f82
 	VPTESTM	%YMM0, %YMM0, %k2
076f82
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
076f82
-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
076f82
 	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
076f82
 	kmovd	%k1, %ecx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	subl	$0xff, %ecx
076f82
-# else
076f82
-	incl	%ecx
076f82
-# endif
076f82
-	jne	L(return_2_vec_size)
076f82
+	TESTEQ	%ecx
076f82
+	jnz	L(return_vec_2)
076f82
 
076f82
 	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
076f82
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
076f82
 	VPTESTM	%YMM0, %YMM0, %k2
076f82
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
076f82
-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
076f82
 	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
076f82
 	kmovd	%k1, %ecx
076f82
+	TESTEQ	%ecx
076f82
+	jnz	L(return_vec_3)
076f82
+
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
076f82
+	jbe	L(ret_zero)
076f82
+# endif
076f82
+
076f82
+
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	subl	$0xff, %ecx
076f82
+	/* Any non-zero positive value that doesn't interfere with 0x1.
076f82
+	 */
076f82
+	movl	$2, %r8d
076f82
+
076f82
 # else
076f82
-	incl	%ecx
076f82
+	xorl	%r8d, %r8d
076f82
 # endif
076f82
-	jne	L(return_3_vec_size)
076f82
-L(main_loop_header):
076f82
-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
076f82
-	movl	$PAGE_SIZE, %ecx
076f82
-	/* Align load via RAX.  */
076f82
-	andq	$-(VEC_SIZE * 4), %rdx
076f82
-	subq	%rdi, %rdx
076f82
-	leaq	(%rdi, %rdx), %rax
076f82
+
076f82
+	/* The prepare labels are various entry points from the page
076f82
+	   cross logic.  */
076f82
+L(prepare_loop):
076f82
+
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Starting from this point, the maximum offset, or simply the
076f82
-	   'offset', DECREASES by the same amount when base pointers are
076f82
-	   moved forward.  Return 0 when:
076f82
-	     1) On match: offset <= the matched vector index.
076f82
-	     2) On mistmach, offset is before the mistmatched index.
076f82
-	 */
076f82
-	subq	%rdx, %r11
076f82
-	jbe	L(zero)
076f82
+#  ifdef USE_AS_WCSCMP
076f82
+L(prepare_loop_no_len):
076f82
+	movl	%edi, %ecx
076f82
+	andl	$(VEC_SIZE * 4 - 1), %ecx
076f82
+	shrl	$2, %ecx
076f82
+	leaq	(CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
076f82
+#  else
076f82
+	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
076f82
+	   the loop.  */
076f82
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
076f82
+L(prepare_loop_no_len):
076f82
+#  endif
076f82
+# else
076f82
+L(prepare_loop_no_len):
076f82
 # endif
076f82
-	addq	%rsi, %rdx
076f82
-	movq	%rdx, %rsi
076f82
-	andl	$(PAGE_SIZE - 1), %esi
076f82
-	/* Number of bytes before page crossing.  */
076f82
-	subq	%rsi, %rcx
076f82
-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
076f82
-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
076f82
-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
076f82
-	movl	%ecx, %esi
076f82
-	jmp	L(loop_start)
076f82
 
076f82
+	/* Align s1 and adjust s2 accordingly.  */
076f82
+	subq	%rdi, %rsi
076f82
+	andq	$-(VEC_SIZE * 4), %rdi
076f82
+L(prepare_loop_readj):
076f82
+	addq	%rdi, %rsi
076f82
+# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
076f82
+	subq	%rdi, %rdx
076f82
+# endif
076f82
+
076f82
+L(prepare_loop_aligned):
076f82
+	/* eax stores distance from rsi to next page cross. These cases
076f82
+	   need to be handled specially as the 4x loop could potentially
076f82
+	   read memory past the length of s1 or s2 and across a page
076f82
+	   boundary.  */
076f82
+	movl	$-(VEC_SIZE * 4), %eax
076f82
+	subl	%esi, %eax
076f82
+	andl	$(PAGE_SIZE - 1), %eax
076f82
+
076f82
+	vpxorq	%YMMZERO, %YMMZERO, %YMMZERO
076f82
+
076f82
+	/* Loop 4x comparisons at a time.  */
076f82
 	.p2align 4
076f82
 L(loop):
076f82
+
076f82
+	/* End condition for strncmp.  */
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
076f82
-	   the maximum offset (%r11) by the same amount.  */
076f82
-	subq	$(VEC_SIZE * 4), %r11
076f82
-	jbe	L(zero)
076f82
+	subq	$(CHAR_PER_VEC * 4), %rdx
076f82
+	jbe	L(ret_zero)
076f82
 # endif
076f82
-	addq	$(VEC_SIZE * 4), %rax
076f82
-	addq	$(VEC_SIZE * 4), %rdx
076f82
-L(loop_start):
076f82
-	testl	%esi, %esi
076f82
-	leal	-1(%esi), %esi
076f82
-	je	L(loop_cross_page)
076f82
-L(back_to_loop):
076f82
-	/* Main loop, comparing 4 vectors are a time.  */
076f82
-	VMOVA	(%rax), %YMM0
076f82
-	VMOVA	VEC_SIZE(%rax), %YMM2
076f82
-	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
076f82
-	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
076f82
+
076f82
+	subq	$-(VEC_SIZE * 4), %rdi
076f82
+	subq	$-(VEC_SIZE * 4), %rsi
076f82
+
076f82
+	/* Check if rsi loads will cross a page boundary.  */
076f82
+	addl	$-(VEC_SIZE * 4), %eax
076f82
+	jnb	L(page_cross_during_loop)
076f82
+
076f82
+	/* Loop entry after handling page cross during loop.  */
076f82
+L(loop_skip_page_cross_check):
076f82
+	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
076f82
+	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
076f82
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
076f82
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
076f82
 
076f82
 	VPMINU	%YMM0, %YMM2, %YMM8
076f82
 	VPMINU	%YMM4, %YMM6, %YMM9
076f82
 
076f82
-	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
076f82
-	VPMINU	%YMM8, %YMM9, %YMM8
076f82
+	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
076f82
+	VPMINU	%YMM8, %YMM9, %YMM9
076f82
 
076f82
 	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
076f82
-	VPTESTM	%YMM8, %YMM8, %k1
076f82
+	VPTESTM	%YMM9, %YMM9, %k1
076f82
 
076f82
-	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
076f82
-	vpxorq	(%rdx), %YMM0, %YMM1
076f82
-	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
076f82
-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
076f82
-	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
076f82
+	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
076f82
+	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
076f82
+	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
076f82
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
076f82
+	   oring with YMM1. Result is stored in YMM6.  */
076f82
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
076f82
 
076f82
-	vporq	%YMM1, %YMM3, %YMM9
076f82
-	vporq	%YMM5, %YMM7, %YMM10
076f82
+	/* Or together YMM3, YMM5, and YMM6.  */
076f82
+	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
076f82
 
076f82
-	/* A non-zero CHAR in YMM9 represents a mismatch.  */
076f82
-	vporq	%YMM9, %YMM10, %YMM9
076f82
 
076f82
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
076f82
-	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
076f82
-	kmovd   %k0, %ecx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	subl	$0xff, %ecx
076f82
-# else
076f82
-	incl	%ecx
076f82
-# endif
076f82
-	je	 L(loop)
076f82
+	/* A non-zero CHAR in YMM6 represents a mismatch.  */
076f82
+	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
076f82
+	kmovd	%k0, %LOOP_REG
076f82
 
076f82
-	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
076f82
+	TESTEQ	%LOOP_REG
076f82
+	jz	L(loop)
076f82
+
076f82
+
076f82
+	/* Find which VEC has the mismatch of end of string.  */
076f82
 	VPTESTM	%YMM0, %YMM0, %k1
076f82
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
076f82
-	   in YMM0 and (%rdx).  */
076f82
 	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
076f82
 	kmovd	%k0, %ecx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	subl	$0xff, %ecx
076f82
-# else
076f82
-	incl	%ecx
076f82
-# endif
076f82
-	je	L(test_vec)
076f82
-	tzcntl	%ecx, %ecx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
076f82
-	sall	$2, %ecx
076f82
-# endif
076f82
-# ifdef USE_AS_STRNCMP
076f82
-	cmpq	%rcx, %r11
076f82
-	jbe	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %edi
076f82
-	cmpl	(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
-# else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %edi
076f82
-	cmpl	(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
-# endif
076f82
-	ret
076f82
+	TESTEQ	%ecx
076f82
+	jnz	L(return_vec_0_end)
076f82
 
076f82
-	.p2align 4
076f82
-L(test_vec):
076f82
-# ifdef USE_AS_STRNCMP
076f82
-	/* The first vector matched.  Return 0 if the maximum offset
076f82
-	   (%r11) <= VEC_SIZE.  */
076f82
-	cmpq	$VEC_SIZE, %r11
076f82
-	jbe	L(zero)
076f82
-# endif
076f82
-	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
076f82
 	VPTESTM	%YMM2, %YMM2, %k1
076f82
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
076f82
-	   in YMM2 and VEC_SIZE(%rdx).  */
076f82
 	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
076f82
 	kmovd	%k0, %ecx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	subl	$0xff, %ecx
076f82
-# else
076f82
-	incl	%ecx
076f82
-# endif
076f82
-	je	L(test_2_vec)
076f82
-	tzcntl	%ecx, %edi
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
076f82
-	sall	$2, %edi
076f82
-# endif
076f82
-# ifdef USE_AS_STRNCMP
076f82
-	addq	$VEC_SIZE, %rdi
076f82
-	cmpq	%rdi, %r11
076f82
-	jbe	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rdi), %ecx
076f82
-	cmpl	(%rdx, %rdi), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rdi), %eax
076f82
-	movzbl	(%rdx, %rdi), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
-# else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	VEC_SIZE(%rsi, %rdi), %ecx
076f82
-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	VEC_SIZE(%rax, %rdi), %eax
076f82
-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
-# endif
076f82
-	ret
076f82
+	TESTEQ	%ecx
076f82
+	jnz	L(return_vec_1_end)
076f82
 
076f82
-	.p2align 4
076f82
-L(test_2_vec):
076f82
+
076f82
+	/* Handle VEC 2 and 3 without branches.  */
076f82
+L(return_vec_2_3_end):
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* The first 2 vectors matched.  Return 0 if the maximum offset
076f82
-	   (%r11) <= 2 * VEC_SIZE.  */
076f82
-	cmpq	$(VEC_SIZE * 2), %r11
076f82
-	jbe	L(zero)
076f82
+	subq	$(CHAR_PER_VEC * 2), %rdx
076f82
+	jbe	L(ret_zero_end)
076f82
 # endif
076f82
-	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
076f82
+
076f82
 	VPTESTM	%YMM4, %YMM4, %k1
076f82
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
076f82
-	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
076f82
 	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
076f82
 	kmovd	%k0, %ecx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	subl	$0xff, %ecx
076f82
+	TESTEQ	%ecx
076f82
+# if CHAR_PER_VEC <= 16
076f82
+	sall	$CHAR_PER_VEC, %LOOP_REG
076f82
+	orl	%ecx, %LOOP_REG
076f82
 # else
076f82
-	incl	%ecx
076f82
+	salq	$CHAR_PER_VEC, %LOOP_REG64
076f82
+	orq	%rcx, %LOOP_REG64
076f82
+# endif
076f82
+L(return_vec_3_end):
076f82
+	/* LOOP_REG contains matches for null/mismatch from the loop. If
076f82
+	   VEC 0, 1, and 2 all have no null and no mismatches then mismatch
076f82
+	   must entirely be from VEC 3 which is fully represented by
076f82
+	   LOOP_REG.  */
076f82
+# if CHAR_PER_VEC <= 16
076f82
+	tzcntl	%LOOP_REG, %LOOP_REG
076f82
+# else
076f82
+	tzcntq	%LOOP_REG64, %LOOP_REG64
076f82
+# endif
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	cmpq	%LOOP_REG64, %rdx
076f82
+	jbe	L(ret_zero_end)
076f82
 # endif
076f82
-	je	L(test_3_vec)
076f82
-	tzcntl	%ecx, %edi
076f82
+
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
076f82
-	sall	$2, %edi
076f82
+	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
076f82
+	xorl	%eax, %eax
076f82
+	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
076f82
+	je	L(ret5)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	xorl	%r8d, %eax
076f82
+# else
076f82
+	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
076f82
+	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 # endif
076f82
+L(ret5):
076f82
+	ret
076f82
+
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	addq	$(VEC_SIZE * 2), %rdi
076f82
-	cmpq	%rdi, %r11
076f82
-	jbe	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
+	.p2align 4,, 2
076f82
+L(ret_zero_end):
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rdi), %ecx
076f82
-	cmpl	(%rdx, %rdi), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
+	ret
076f82
+# endif
076f82
+
076f82
+
076f82
+	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
076f82
+	   they use the value of `r8` to negate the return value. This is
076f82
+	   because the page cross logic can swap `rdi` and `rsi`.  */
076f82
+	.p2align 4,, 10
076f82
+# ifdef USE_AS_STRNCMP
076f82
+L(return_vec_1_end):
076f82
+#  if CHAR_PER_VEC <= 16
076f82
+	sall	$CHAR_PER_VEC, %ecx
076f82
 #  else
076f82
-	movzbl	(%rax, %rdi), %eax
076f82
-	movzbl	(%rdx, %rdi), %edx
076f82
-	subl	%edx, %eax
076f82
+	salq	$CHAR_PER_VEC, %rcx
076f82
 #  endif
076f82
+# endif
076f82
+L(return_vec_0_end):
076f82
+# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
076f82
+	tzcntl	%ecx, %ecx
076f82
 # else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
076f82
-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
076f82
-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	tzcntq	%rcx, %rcx
076f82
 # endif
076f82
-	ret
076f82
 
076f82
-	.p2align 4
076f82
-L(test_3_vec):
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	/* The first 3 vectors matched.  Return 0 if the maximum offset
076f82
-	   (%r11) <= 3 * VEC_SIZE.  */
076f82
-	cmpq	$(VEC_SIZE * 3), %r11
076f82
-	jbe	L(zero)
076f82
+	cmpq	%rcx, %rdx
076f82
+	jbe	L(ret_zero_end)
076f82
 # endif
076f82
-	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
076f82
-	VPTESTM	%YMM6, %YMM6, %k1
076f82
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
076f82
-	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
076f82
-	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
076f82
-	kmovd	%k0, %ecx
076f82
+
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	subl	$0xff, %ecx
076f82
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
076f82
+	xorl	%eax, %eax
076f82
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
076f82
+	je	L(ret6)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	/* This is the non-zero case for `eax` so just xorl with `r8d`
076f82
+	   to flip if `rdi` and `rsi` were swapped.  */
076f82
+	xorl	%r8d, %eax
076f82
 # else
076f82
-	incl	%ecx
076f82
+	movzbl	(%rdi, %rcx), %eax
076f82
+	movzbl	(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
076f82
+	   logic. Subtract `r8d` after xor for zero case.  */
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 # endif
076f82
+L(ret6):
076f82
+	ret
076f82
+
076f82
+# ifndef USE_AS_STRNCMP
076f82
+	.p2align 4,, 10
076f82
+L(return_vec_1_end):
076f82
 	tzcntl	%ecx, %ecx
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
076f82
-	sall	$2, %ecx
076f82
-# endif
076f82
-# ifdef USE_AS_STRNCMP
076f82
-	addq	$(VEC_SIZE * 3), %rcx
076f82
-	cmpq	%rcx, %r11
076f82
-	jbe	L(zero)
076f82
 #  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
+	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %esi
076f82
-	cmpl	(%rdx, %rcx), %esi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
-# else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
076f82
-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
076f82
-	jne	L(wcscmp_return)
076f82
+	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
076f82
+	je	L(ret7)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	xorl	%r8d, %eax
076f82
 #  else
076f82
-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
076f82
-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
076f82
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 #  endif
076f82
-# endif
076f82
+L(ret7):
076f82
 	ret
076f82
-
076f82
-	.p2align 4
076f82
-L(loop_cross_page):
076f82
-	xorl	%r10d, %r10d
076f82
-	movq	%rdx, %rcx
076f82
-	/* Align load via RDX.  We load the extra ECX bytes which should
076f82
-	   be ignored.  */
076f82
-	andl	$((VEC_SIZE * 4) - 1), %ecx
076f82
-	/* R10 is -RCX.  */
076f82
-	subq	%rcx, %r10
076f82
-
076f82
-	/* This works only if VEC_SIZE * 2 == 64. */
076f82
-# if (VEC_SIZE * 2) != 64
076f82
-#  error (VEC_SIZE * 2) != 64
076f82
 # endif
076f82
 
076f82
-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
076f82
-	cmpl	$(VEC_SIZE * 2), %ecx
076f82
-	jge	L(loop_cross_page_2_vec)
076f82
 
076f82
-	VMOVU	(%rax, %r10), %YMM2
076f82
-	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
076f82
+	/* Page cross in rsi in next 4x VEC.  */
076f82
 
076f82
-	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
076f82
-	VPTESTM	%YMM2, %YMM2, %k2
076f82
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
076f82
-	   in YMM2 and 32 bytes at (%rdx, %r10).  */
076f82
-	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
076f82
-	kmovd	%k1, %r9d
076f82
-	/* Don't use subl since it is the lower 16/32 bits of RDI
076f82
-	   below.  */
076f82
-	notl	%r9d
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* Only last 8 bits are valid.  */
076f82
-	andl	$0xff, %r9d
076f82
-# endif
076f82
+	/* TODO: Improve logic here.  */
076f82
+	.p2align 4,, 10
076f82
+L(page_cross_during_loop):
076f82
+	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
076f82
 
076f82
-	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
076f82
-	VPTESTM	%YMM3, %YMM3, %k4
076f82
-	/* Each bit cleared in K3 represents a mismatch or a null CHAR
076f82
-	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
076f82
-	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
076f82
-	kmovd	%k3, %edi
076f82
-    /* Must use notl %edi here as lower bits are for CHAR
076f82
-	   comparisons potentially out of range thus can be 0 without
076f82
-	   indicating mismatch.  */
076f82
-	notl	%edi
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* Don't use subl since it is the upper 8 bits of EDI below.  */
076f82
-	andl	$0xff, %edi
076f82
-# endif
076f82
+	/* Optimistically rsi and rdi are both aligned, in which case we
076f82
+	   don't need any logic here.  */
076f82
+	cmpl	$-(VEC_SIZE * 4), %eax
076f82
+	/* If we don't adjust eax before jumping back to the loop we
+	   will never hit the page cross case again.  */
076f82
+	je	L(loop_skip_page_cross_check)
076f82
 
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
076f82
-	sall	$8, %edi
076f82
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
076f82
-	   bytes.  */
076f82
-	movl	%ecx, %SHIFT_REG32
076f82
-	sarl	$2, %SHIFT_REG32
076f82
-
076f82
-	/* Each bit in EDI represents a null CHAR or a mismatch.  */
076f82
-	orl	%r9d, %edi
076f82
-# else
076f82
-	salq	$32, %rdi
076f82
+	/* Check if we can safely load a VEC.  */
076f82
+	cmpl	$-(VEC_SIZE * 3), %eax
076f82
+	jle	L(less_1x_vec_till_page_cross)
076f82
 
076f82
-	/* Each bit in RDI represents a null CHAR or a mismatch.  */
076f82
-	orq	%r9, %rdi
076f82
-# endif
076f82
+	VMOVA	(%rdi), %YMM0
076f82
+	VPTESTM	%YMM0, %YMM0, %k2
076f82
+	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
076f82
+	kmovd	%k1, %ecx
076f82
+	TESTEQ	%ecx
076f82
+	jnz	L(return_vec_0_end)
076f82
+
076f82
+	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
076f82
+	cmpl	$-(VEC_SIZE * 2), %eax
076f82
+	jg	L(more_2x_vec_till_page_cross)
076f82
+
076f82
+	.p2align 4,, 4
076f82
+L(less_1x_vec_till_page_cross):
076f82
+	subl	$-(VEC_SIZE * 4), %eax
076f82
+	/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
076f82
+	   concerning case is first iteration if incoming s1 was near start
076f82
+	   of a page and s2 near end. If s1 was near the start of the page
076f82
+	   we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe
076f82
+	   to read back -VEC_SIZE. If rdi is truly at the start of a page
076f82
+	   here, it means the previous page (rdi - VEC_SIZE) has already
076f82
+	   been loaded earlier so must be valid.  */
076f82
+	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
076f82
+	VPTESTM	%YMM0, %YMM0, %k2
076f82
+	VPCMP	$0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
076f82
+
076f82
+	/* Mask of potentially valid bits. The lower bits can be out of
076f82
+	   range comparisons (but safe regarding page crosses).  */
076f82
 
076f82
-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
076f82
-	shrxq	%SHIFT_REG64, %rdi, %rdi
076f82
-	testq	%rdi, %rdi
076f82
-	je	L(loop_cross_page_2_vec)
076f82
-	tzcntq	%rdi, %rcx
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
076f82
-	sall	$2, %ecx
076f82
+	movl	$-1, %r10d
076f82
+	movl	%esi, %ecx
076f82
+	andl	$(VEC_SIZE - 1), %ecx
076f82
+	shrl	$2, %ecx
076f82
+	shlxl	%ecx, %r10d, %ecx
076f82
+	movzbl	%cl, %r10d
076f82
+# else
076f82
+	movl	$-1, %ecx
076f82
+	shlxl	%esi, %ecx, %r10d
076f82
 # endif
076f82
+
076f82
+	kmovd	%k1, %ecx
076f82
+	notl	%ecx
076f82
+
076f82
+
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	cmpq	%rcx, %r11
076f82
-	jbe	L(zero)
076f82
 #  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %edi
076f82
-	cmpl	(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
+	movl	%eax, %r11d
076f82
+	shrl	$2, %r11d
076f82
+	cmpq	%r11, %rdx
076f82
 #  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
+	cmpq	%rax, %rdx
076f82
 #  endif
076f82
+	jbe	L(return_page_cross_end_check)
076f82
+# endif
076f82
+	movl	%eax, %OFFSET_REG
076f82
+
076f82
+	/* Readjust eax before potentially returning to the loop.  */
076f82
+	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
076f82
+
076f82
+	andl	%r10d, %ecx
076f82
+	jz	L(loop_skip_page_cross_check)
076f82
+
076f82
+	.p2align 4,, 3
076f82
+L(return_page_cross_end):
076f82
+	tzcntl	%ecx, %ecx
076f82
+
076f82
+# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
076f82
+	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
076f82
+L(return_page_cross_cmp_mem):
076f82
 # else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
+	addl	%OFFSET_REG, %ecx
076f82
+# endif
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %edi
076f82
-	cmpl	(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
076f82
+	je	L(ret8)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	xorl	%r8d, %eax
076f82
+# else
076f82
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
076f82
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 # endif
076f82
+L(ret8):
076f82
 	ret
076f82
 
076f82
-	.p2align 4
076f82
-L(loop_cross_page_2_vec):
076f82
-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
076f82
-	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
076f82
-	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	.p2align 4,, 10
076f82
+L(return_page_cross_end_check):
076f82
+	tzcntl	%ecx, %ecx
076f82
+	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
076f82
+#  ifdef USE_AS_WCSCMP
076f82
+	sall	$2, %edx
076f82
+#  endif
076f82
+	cmpl	%ecx, %edx
076f82
+	ja	L(return_page_cross_cmp_mem)
076f82
+	xorl	%eax, %eax
076f82
+	ret
076f82
+# endif
076f82
+
076f82
 
076f82
+	.p2align 4,, 10
076f82
+L(more_2x_vec_till_page_cross):
076f82
+	/* If more 2x vec till cross we will complete a full loop
076f82
+	   iteration here.  */
076f82
+
076f82
+	VMOVA	VEC_SIZE(%rdi), %YMM0
076f82
 	VPTESTM	%YMM0, %YMM0, %k2
076f82
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
076f82
-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
076f82
-	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
076f82
-	kmovd	%k1, %r9d
076f82
-	/* Don't use subl since it is the lower 16/32 bits of RDI
076f82
-	   below.  */
076f82
-	notl	%r9d
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* Only last 8 bits are valid.  */
076f82
-	andl	$0xff, %r9d
076f82
-# endif
076f82
+	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
076f82
+	kmovd	%k1, %ecx
076f82
+	TESTEQ	%ecx
076f82
+	jnz	L(return_vec_1_end)
076f82
 
076f82
-	VPTESTM	%YMM1, %YMM1, %k4
076f82
-	/* Each bit cleared in K3 represents a mismatch or a null CHAR
076f82
-	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
076f82
-	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
076f82
-	kmovd	%k3, %edi
076f82
-	/* Must use notl %edi here as lower bits are for CHAR
076f82
-	   comparisons potentially out of range thus can be 0 without
076f82
-	   indicating mismatch.  */
076f82
-	notl	%edi
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* Don't use subl since it is the upper 8 bits of EDI below.  */
076f82
-	andl	$0xff, %edi
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
076f82
+	jbe	L(ret_zero_in_loop_page_cross)
076f82
 # endif
076f82
 
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
076f82
-	sall	$8, %edi
076f82
+	subl	$-(VEC_SIZE * 4), %eax
076f82
 
076f82
-	/* Each bit in EDI represents a null CHAR or a mismatch.  */
076f82
-	orl	%r9d, %edi
076f82
-# else
076f82
-	salq	$32, %rdi
076f82
+	/* Safe to include comparisons from lower bytes.  */
076f82
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
076f82
+	VPTESTM	%YMM0, %YMM0, %k2
076f82
+	VPCMP	$0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
076f82
+	kmovd	%k1, %ecx
076f82
+	TESTEQ	%ecx
076f82
+	jnz	L(return_vec_page_cross_0)
076f82
+
076f82
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
076f82
+	VPTESTM	%YMM0, %YMM0, %k2
076f82
+	VPCMP	$0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
076f82
+	kmovd	%k1, %ecx
076f82
+	TESTEQ	%ecx
076f82
+	jnz	L(return_vec_page_cross_1)
076f82
 
076f82
-	/* Each bit in RDI represents a null CHAR or a mismatch.  */
076f82
-	orq	%r9, %rdi
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	/* Must check length here as length might preclude reading next
076f82
+	   page.  */
076f82
+#  ifdef USE_AS_WCSCMP
076f82
+	movl	%eax, %r11d
076f82
+	shrl	$2, %r11d
076f82
+	cmpq	%r11, %rdx
076f82
+#  else
076f82
+	cmpq	%rax, %rdx
076f82
+#  endif
076f82
+	jbe	L(ret_zero_in_loop_page_cross)
076f82
 # endif
076f82
 
076f82
-	xorl	%r8d, %r8d
076f82
-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
076f82
-	subl	$(VEC_SIZE * 2), %ecx
076f82
-	jle	1f
076f82
-	/* R8 has number of bytes skipped.  */
076f82
-	movl	%ecx, %r8d
076f82
-# ifdef USE_AS_WCSCMP
076f82
-	/* NB: Divide shift count by 4 since each bit in RDI represent 4
076f82
-	   bytes.  */
076f82
-	sarl	$2, %ecx
076f82
-	/* Skip ECX bytes.  */
076f82
-	shrl	%cl, %edi
076f82
+	/* Finish the loop.  */
076f82
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
076f82
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
076f82
+	VPMINU	%YMM4, %YMM6, %YMM9
076f82
+	VPTESTM	%YMM9, %YMM9, %k1
076f82
+
076f82
+	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
076f82
+	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
076f82
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
076f82
+
076f82
+	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
076f82
+	kmovd	%k0, %LOOP_REG
076f82
+	TESTEQ	%LOOP_REG
076f82
+	jnz	L(return_vec_2_3_end)
076f82
+
076f82
+	/* Best for code size to include ucond-jmp here. Would be faster
076f82
+	   if this case is hot to duplicate the L(return_vec_2_3_end) code
076f82
+	   as fall-through and have jump back to loop on mismatch
076f82
+	   comparison.  */
076f82
+	subq	$-(VEC_SIZE * 4), %rdi
076f82
+	subq	$-(VEC_SIZE * 4), %rsi
076f82
+	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	subq	$(CHAR_PER_VEC * 4), %rdx
076f82
+	ja	L(loop_skip_page_cross_check)
076f82
+L(ret_zero_in_loop_page_cross):
076f82
+	xorl	%eax, %eax
076f82
+	ret
076f82
 # else
076f82
-	/* Skip ECX bytes.  */
076f82
-	shrq	%cl, %rdi
076f82
+	jmp	L(loop_skip_page_cross_check)
076f82
 # endif
076f82
-1:
076f82
-	/* Before jumping back to the loop, set ESI to the number of
076f82
-	   VEC_SIZE * 4 blocks before page crossing.  */
076f82
-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
076f82
 
076f82
-	testq	%rdi, %rdi
076f82
-# ifdef USE_AS_STRNCMP
076f82
-	/* At this point, if %rdi value is 0, it already tested
076f82
-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
076f82
-	   checks whether strncmp maximum offset reached or not.  */
076f82
-	je	L(string_nbyte_offset_check)
076f82
+
076f82
+	.p2align 4,, 10
076f82
+L(return_vec_page_cross_0):
076f82
+	addl	$-VEC_SIZE, %eax
076f82
+L(return_vec_page_cross_1):
076f82
+	tzcntl	%ecx, %ecx
076f82
+# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
076f82
+	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
076f82
+#  ifdef USE_AS_STRNCMP
076f82
+#   ifdef USE_AS_WCSCMP
076f82
+	/* Must divide ecx instead of multiplying rdx due to overflow.  */
076f82
+	movl	%ecx, %eax
076f82
+	shrl	$2, %eax
076f82
+	cmpq	%rax, %rdx
076f82
+#   else
076f82
+	cmpq	%rcx, %rdx
076f82
+#   endif
076f82
+	jbe	L(ret_zero_in_loop_page_cross)
076f82
+#  endif
076f82
 # else
076f82
-	je	L(back_to_loop)
076f82
+	addl	%eax, %ecx
076f82
 # endif
076f82
-	tzcntq	%rdi, %rcx
076f82
+
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
076f82
-	sall	$2, %ecx
076f82
-# endif
076f82
-	addq	%r10, %rcx
076f82
-	/* Adjust for number of bytes skipped.  */
076f82
-	addq	%r8, %rcx
076f82
-# ifdef USE_AS_STRNCMP
076f82
-	addq	$(VEC_SIZE * 2), %rcx
076f82
-	subq	%rcx, %r11
076f82
-	jbe	L(zero)
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
076f82
 	xorl	%eax, %eax
076f82
-	movl	(%rsi, %rcx), %edi
076f82
-	cmpl	(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rax, %rcx), %eax
076f82
-	movzbl	(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
076f82
+	je	L(ret9)
076f82
+	setl	%al
076f82
+	negl	%eax
076f82
+	xorl	%r8d, %eax
076f82
 # else
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	movq	%rax, %rsi
076f82
-	xorl	%eax, %eax
076f82
-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
076f82
-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
076f82
-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
076f82
-	subl	%edx, %eax
076f82
-#  endif
076f82
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
076f82
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 # endif
076f82
+L(ret9):
076f82
 	ret
076f82
 
076f82
-# ifdef USE_AS_STRNCMP
076f82
-L(string_nbyte_offset_check):
076f82
-	leaq	(VEC_SIZE * 4)(%r10), %r10
076f82
-	cmpq	%r10, %r11
076f82
-	jbe	L(zero)
076f82
-	jmp	L(back_to_loop)
076f82
+
076f82
+	.p2align 4,, 10
076f82
+L(page_cross):
076f82
+# ifndef USE_AS_STRNCMP
076f82
+	/* If both are VEC aligned we don't need any special logic here.
076f82
+	   Only valid for strcmp where stop condition is guaranteed to be
076f82
+	   reachable by just reading memory.  */
076f82
+	testl	$((VEC_SIZE - 1) << 20), %eax
076f82
+	jz	L(no_page_cross)
076f82
 # endif
076f82
 
076f82
-	.p2align 4
076f82
-L(cross_page_loop):
076f82
-	/* Check one byte/dword at a time.  */
076f82
+	movl	%edi, %eax
076f82
+	movl	%esi, %ecx
076f82
+	andl	$(PAGE_SIZE - 1), %eax
076f82
+	andl	$(PAGE_SIZE - 1), %ecx
076f82
+
076f82
+	xorl	%OFFSET_REG, %OFFSET_REG
076f82
+
076f82
+	/* Check which is closer to page cross, s1 or s2.  */
076f82
+	cmpl	%eax, %ecx
076f82
+	jg	L(page_cross_s2)
076f82
+
076f82
+	/* The previous page cross check has false positives. Check for
076f82
+	   true positive as page cross logic is very expensive.  */
076f82
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
076f82
+	jbe	L(no_page_cross)
076f82
+
076f82
+
076f82
+	/* Set r8 to not interfere with normal return value (rdi and rsi
076f82
+	   did not swap).  */
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	cmpl	%ecx, %eax
076f82
+	/* Any non-zero positive value that doesn't interfere with 0x1.
076f82
+	 */
076f82
+	movl	$2, %r8d
076f82
 # else
076f82
-	subl	%ecx, %eax
076f82
+	xorl	%r8d, %r8d
076f82
 # endif
076f82
-	jne	L(different)
076f82
-	addl	$SIZE_OF_CHAR, %edx
076f82
-	cmpl	$(VEC_SIZE * 4), %edx
076f82
-	je	L(main_loop_header)
076f82
+
076f82
+	/* Check if less than 1x VEC till page cross.  */
076f82
+	subl	$(VEC_SIZE * 3), %eax
076f82
+	jg	L(less_1x_vec_till_page)
076f82
+
076f82
+
076f82
+	/* If more than 1x VEC till page cross, loop through safely
076f82
+	   loadable memory until within 1x VEC of page cross.  */
076f82
+	.p2align 4,, 8
076f82
+L(page_cross_loop):
076f82
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
076f82
+	VPTESTM	%YMM0, %YMM0, %k2
076f82
+	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
076f82
+	kmovd	%k1, %ecx
076f82
+	TESTEQ	%ecx
076f82
+	jnz	L(check_ret_vec_page_cross)
076f82
+	addl	$CHAR_PER_VEC, %OFFSET_REG
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
+	cmpq	%OFFSET_REG64, %rdx
076f82
+	jbe	L(ret_zero_page_cross)
076f82
 # endif
076f82
+	addl	$VEC_SIZE, %eax
076f82
+	jl	L(page_cross_loop)
076f82
+
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	movl	(%rdi, %rdx), %eax
076f82
-	movl	(%rsi, %rdx), %ecx
076f82
-# else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %ecx
076f82
+	shrl	$2, %eax
076f82
 # endif
076f82
-	/* Check null CHAR.  */
076f82
-	testl	%eax, %eax
076f82
-	jne	L(cross_page_loop)
076f82
-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
076f82
-	   comparisons.  */
076f82
-	subl	%ecx, %eax
076f82
-# ifndef USE_AS_WCSCMP
076f82
-L(different):
076f82
+
076f82
+
076f82
+	subl	%eax, %OFFSET_REG
076f82
+	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
076f82
+	   to not cross page so is safe to load. Since we have already
076f82
+	   loaded at least 1 VEC from rsi it is also guaranteed to be safe.
076f82
+	 */
076f82
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
076f82
+	VPTESTM	%YMM0, %YMM0, %k2
076f82
+	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
076f82
+
076f82
+	kmovd	%k1, %ecx
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
076f82
+	cmpq	%rax, %rdx
076f82
+	jbe	L(check_ret_vec_page_cross2)
076f82
+#  ifdef USE_AS_WCSCMP
076f82
+	addq	$-(CHAR_PER_VEC * 2), %rdx
076f82
+#  else
076f82
+	addq	%rdi, %rdx
076f82
+#  endif
076f82
 # endif
076f82
-	ret
076f82
+	TESTEQ	%ecx
076f82
+	jz	L(prepare_loop_no_len)
076f82
 
076f82
+	.p2align 4,, 4
076f82
+L(ret_vec_page_cross):
076f82
+# ifndef USE_AS_STRNCMP
076f82
+L(check_ret_vec_page_cross):
076f82
+# endif
076f82
+	tzcntl	%ecx, %ecx
076f82
+	addl	%OFFSET_REG, %ecx
076f82
+L(ret_vec_page_cross_cont):
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	.p2align 4
076f82
-L(different):
076f82
-	/* Use movl to avoid modifying EFLAGS.  */
076f82
-	movl	$0, %eax
076f82
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
076f82
+	xorl	%eax, %eax
076f82
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
076f82
+	je	L(ret12)
076f82
 	setl	%al
076f82
 	negl	%eax
076f82
-	orl	$1, %eax
076f82
-	ret
076f82
+	xorl	%r8d, %eax
076f82
+# else
076f82
+	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
076f82
+	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
076f82
+	subl	%ecx, %eax
076f82
+	xorl	%r8d, %eax
076f82
+	subl	%r8d, %eax
076f82
 # endif
076f82
+L(ret12):
076f82
+	ret
076f82
+
076f82
 
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	.p2align 4
076f82
-L(zero):
076f82
+	.p2align 4,, 10
076f82
+L(check_ret_vec_page_cross2):
076f82
+	TESTEQ	%ecx
076f82
+L(check_ret_vec_page_cross):
076f82
+	tzcntl	%ecx, %ecx
076f82
+	addl	%OFFSET_REG, %ecx
076f82
+	cmpq	%rcx, %rdx
076f82
+	ja	L(ret_vec_page_cross_cont)
076f82
+	.p2align 4,, 2
076f82
+L(ret_zero_page_cross):
076f82
 	xorl	%eax, %eax
076f82
 	ret
076f82
+# endif
076f82
 
076f82
-	.p2align 4
076f82
-L(char0):
076f82
-#  ifdef USE_AS_WCSCMP
076f82
-	xorl	%eax, %eax
076f82
-	movl	(%rdi), %ecx
076f82
-	cmpl	(%rsi), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
-#  else
076f82
-	movzbl	(%rsi), %ecx
076f82
-	movzbl	(%rdi), %eax
076f82
-	subl	%ecx, %eax
076f82
-#  endif
076f82
-	ret
076f82
+	.p2align 4,, 4
076f82
+L(page_cross_s2):
076f82
+	/* Ensure this is a true page cross.  */
076f82
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
076f82
+	jbe	L(no_page_cross)
076f82
+
076f82
+
076f82
+	movl	%ecx, %eax
076f82
+	movq	%rdi, %rcx
076f82
+	movq	%rsi, %rdi
076f82
+	movq	%rcx, %rsi
076f82
+
076f82
+	/* Set r8 to negate return value as rdi and rsi were swapped.  */
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	movl	$-4, %r8d
076f82
+# else
076f82
+	movl	$-1, %r8d
076f82
 # endif
076f82
+	xorl	%OFFSET_REG, %OFFSET_REG
076f82
 
076f82
-	.p2align 4
076f82
-L(last_vector):
076f82
-	addq	%rdx, %rdi
076f82
-	addq	%rdx, %rsi
076f82
-# ifdef USE_AS_STRNCMP
076f82
-	subq	%rdx, %r11
076f82
+	/* Check if more than 1x VEC till page cross.  */
076f82
+	subl	$(VEC_SIZE * 3), %eax
076f82
+	jle	L(page_cross_loop)
076f82
+
076f82
+	.p2align 4,, 6
076f82
+L(less_1x_vec_till_page):
076f82
+# ifdef USE_AS_WCSCMP
076f82
+	shrl	$2, %eax
076f82
 # endif
076f82
-	tzcntl	%ecx, %edx
076f82
+	/* Find largest load size we can use.  */
076f82
+	cmpl	$(16 / SIZE_OF_CHAR), %eax
076f82
+	ja	L(less_16_till_page)
076f82
+
076f82
+	/* Use 16 byte comparison.  */
076f82
+	vmovdqu	(%rdi), %xmm0
076f82
+	VPTESTM	%xmm0, %xmm0, %k2
076f82
+	VPCMP	$0, (%rsi), %xmm0, %k1{%k2}
076f82
+	kmovd	%k1, %ecx
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
076f82
-	sall	$2, %edx
076f82
+	subl	$0xf, %ecx
076f82
+# else
076f82
+	incw	%cx
076f82
 # endif
076f82
+	jnz	L(check_ret_vec_page_cross)
076f82
+	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
076f82
 # ifdef USE_AS_STRNCMP
076f82
-	cmpq	%r11, %rdx
076f82
-	jae	L(zero)
076f82
+	cmpq	%OFFSET_REG64, %rdx
076f82
+	jbe	L(ret_zero_page_cross_slow_case0)
076f82
+	subl	%eax, %OFFSET_REG
076f82
+# else
076f82
+	/* Explicit check for 16 byte alignment.  */
076f82
+	subl	%eax, %OFFSET_REG
076f82
+	jz	L(prepare_loop)
076f82
 # endif
076f82
+	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
076f82
+	VPTESTM	%xmm0, %xmm0, %k2
076f82
+	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
076f82
+	kmovd	%k1, %ecx
076f82
 # ifdef USE_AS_WCSCMP
076f82
-	xorl	%eax, %eax
076f82
-	movl	(%rdi, %rdx), %ecx
076f82
-	cmpl	(%rsi, %rdx), %ecx
076f82
-	jne	L(wcscmp_return)
076f82
+	subl	$0xf, %ecx
076f82
 # else
076f82
-	movzbl	(%rdi, %rdx), %eax
076f82
-	movzbl	(%rsi, %rdx), %edx
076f82
-	subl	%edx, %eax
076f82
+	incw	%cx
076f82
 # endif
076f82
+	jnz	L(check_ret_vec_page_cross)
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	addl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
076f82
+	subq	%OFFSET_REG64, %rdx
076f82
+	jbe	L(ret_zero_page_cross_slow_case0)
076f82
+	subq	$-(CHAR_PER_VEC * 4), %rdx
076f82
+
076f82
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
076f82
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
076f82
+# else
076f82
+	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
076f82
+	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
076f82
+# endif
076f82
+	jmp	L(prepare_loop_aligned)
076f82
+
076f82
+# ifdef USE_AS_STRNCMP
076f82
+	.p2align 4,, 2
076f82
+L(ret_zero_page_cross_slow_case0):
076f82
+	xorl	%eax, %eax
076f82
 	ret
076f82
+# endif
076f82
 
076f82
-	/* Comparing on page boundary region requires special treatment:
076f82
-	   It must done one vector at the time, starting with the wider
076f82
-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
076f82
-	   (xmm) still passes the boundary, byte comparison must be done.
076f82
-	 */
076f82
-	.p2align 4
076f82
-L(cross_page):
076f82
-	/* Try one ymm vector at a time.  */
076f82
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
076f82
-	jg	L(cross_page_1_vector)
076f82
-L(loop_1_vector):
076f82
-	VMOVU	(%rdi, %rdx), %YMM0
076f82
 
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
-	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
+	.p2align 4,, 10
+L(less_16_till_page):
+	cmpl	$(24 / SIZE_OF_CHAR), %eax
+	ja	L(less_8_till_page)
+
+	/* Use 8 byte comparison.  */
+	vmovq	(%rdi), %xmm0
+	vmovq	(%rsi), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
+	subl	$0x3, %ecx
 # else
-	incl	%ecx
+	incb	%cl
 # endif
-	jne	L(last_vector)
+	jnz	L(check_ret_vec_page_cross)
 
-	addl	$VEC_SIZE, %edx
 
-	addl	$VEC_SIZE, %eax
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	$(8 / SIZE_OF_CHAR), %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
 # endif
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jle	L(loop_1_vector)
-L(cross_page_1_vector):
-	/* Less than 32 bytes to check, try one xmm vector.  */
-	cmpl	$(PAGE_SIZE - 16), %eax
-	jg	L(cross_page_1_xmm)
-	VMOVU	(%rdi, %rdx), %XMM0
+	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
+	subl	%eax, %OFFSET_REG
 
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
-	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
+	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
-	subl	$0xf, %ecx
+	subl	$0x3, %ecx
 # else
-	subl	$0xffff, %ecx
+	incb	%cl
 # endif
-	jne	L(last_vector)
+	jnz	L(check_ret_vec_page_cross)
+
 
-	addl	$16, %edx
-# ifndef USE_AS_WCSCMP
-	addl	$16, %eax
-# endif
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	addl	$(8 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# else
+	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
 # endif
+	jmp	L(prepare_loop_aligned)
 
-L(cross_page_1_xmm):
-# ifndef USE_AS_WCSCMP
-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
-	cmpl	$(PAGE_SIZE - 8), %eax
-	jg	L(cross_page_8bytes)
-	vmovq	(%rdi, %rdx), %XMM0
-	vmovq	(%rsi, %rdx), %XMM1
 
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in XMM0 and XMM1.  */
-	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
-	kmovb	%k1, %ecx
+
+
+	.p2align 4,, 10
+L(less_8_till_page):
 # ifdef USE_AS_WCSCMP
-	subl	$0x3, %ecx
+	/* If using wchar then this is the only check before we reach
+	   the page boundary.  */
+	movl	(%rdi), %eax
+	movl	(%rsi), %ecx
+	cmpl	%ecx, %eax
+	jnz	L(ret_less_8_wcs)
+#  ifdef USE_AS_STRNCMP
+	addq	$-(CHAR_PER_VEC * 2), %rdx
+	/* We already checked for len <= 1, so we cannot hit that case
+	   here.  */
+#  endif
+	testl	%eax, %eax
+	jnz	L(prepare_loop)
+	ret
+
+	.p2align 4,, 8
+L(ret_less_8_wcs):
+	setl	%OFFSET_REG8
+	negl	%OFFSET_REG
+	movl	%OFFSET_REG, %eax
+	xorl	%r8d, %eax
+	ret
+
 # else
-	subl	$0xff, %ecx
-# endif
-	jne	L(last_vector)
+	cmpl	$28, %eax
+	ja	L(less_4_till_page)
+
+	vmovd	(%rdi), %xmm0
+	vmovd	(%rsi), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	kmovd	%k1, %ecx
+	subl	$0xf, %ecx
+	jnz	L(check_ret_vec_page_cross)
 
-	addl	$8, %edx
-	addl	$8, %eax
 #  ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	$4, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
 #  endif
+	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
+	subl	%eax, %OFFSET_REG
 
-L(cross_page_8bytes):
-	/* Less than 8 bytes to check, try 4 byte vector.  */
-	cmpl	$(PAGE_SIZE - 4), %eax
-	jg	L(cross_page_4bytes)
-	vmovd	(%rdi, %rdx), %XMM0
-	vmovd	(%rsi, %rdx), %XMM1
-
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in XMM0 and XMM1.  */
-	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
+	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
 	kmovd	%k1, %ecx
-# ifdef USE_AS_WCSCMP
-	subl	$0x1, %ecx
-# else
 	subl	$0xf, %ecx
-# endif
-	jne	L(last_vector)
+	jnz	L(check_ret_vec_page_cross)
+#  ifdef USE_AS_STRNCMP
+	addl	$(4 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  else
+	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  endif
+	jmp	L(prepare_loop_aligned)
+
 
-	addl	$4, %edx
 #  ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case1):
+	xorl	%eax, %eax
+	ret
 #  endif
 
-L(cross_page_4bytes):
-# endif
-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
-# ifdef USE_AS_STRNCMP
-	cmpq	%r11, %rdx
-	jae	L(zero)
-# endif
-# ifdef USE_AS_WCSCMP
-	movl	(%rdi, %rdx), %eax
-	movl	(%rsi, %rdx), %ecx
-# else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %ecx
-# endif
-	testl	%eax, %eax
-	jne	L(cross_page_loop)
+	.p2align 4,, 10
+L(less_4_till_page):
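+	/* Keep only the s2 - s1 delta in %rsi so the loop below can
+	   address the second string as (%rsi, %rdi) while advancing a
+	   single pointer.  */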
+	subq	%rdi, %rsi
+	/* Extremely slow byte comparison loop.  */
+L(less_4_loop):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi, %rdi), %ecx
 	subl	%ecx, %eax
+	jnz	L(ret_less_4_loop)
+	testl	%ecx, %ecx
+	jz	L(ret_zero_4_loop)
+#  ifdef USE_AS_STRNCMP
+	decq	%rdx
+	jz	L(ret_zero_4_loop)
+#  endif
+	incq	%rdi
+	/* End condition: rdi reaches the page boundary (it is then aligned).  */
+	testl	$31, %edi
+	jnz	L(less_4_loop)
+	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
+	addq	$-(VEC_SIZE * 4), %rdi
+#  ifdef USE_AS_STRNCMP
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+L(ret_zero_4_loop):
+	xorl	%eax, %eax
+	ret
+L(ret_less_4_loop):
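+	/* (%eax ^ %r8d) - %r8d negates the byte difference when %r8d is
+	   -1 (the pointers were swapped on the page-cross entry path)
+	   and leaves it unchanged when %r8d is zero.  */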
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 	ret
-END (STRCMP)
+# endif
+END(STRCMP)
 #endif
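
As a reading aid for the page-cross path above: the new code always picks the largest load (16, 8, or 4 bytes) that still ends on or before the page boundary, compares that chunk, and, in the byte variants, falls back to a byte-at-a-time loop only when fewer than 4 bytes remain before the boundary. The C below is a minimal scalar sketch of that selection logic, not part of the patch; the helper names are invented and the real code uses masked EVEX compares behind the L(less_*_till_page) labels.

/* Editor's illustrative sketch -- not part of strcmp-evex.S.  */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/* Bytes from P to the end of the page containing P.  */
static size_t
bytes_to_page_end (const char *p)
{
  return PAGE_SIZE - ((uintptr_t) p & (PAGE_SIZE - 1));
}

/* Compare one chunk of S1/S2 without letting any access cross a page
   boundary.  *DONE is set when a difference or a terminator was found,
   i.e. when the caller has a final answer; otherwise the caller would
   re-align and enter the main vector loop (L(prepare_loop_aligned)).  */
static int
cmp_chunk_before_page_cross (const char *s1, const char *s2, int *done)
{
  size_t safe = bytes_to_page_end (s1);
  if (bytes_to_page_end (s2) < safe)
    safe = bytes_to_page_end (s2);

  /* Largest load size that still fits before the page boundary,
     mirroring L(less_1x_vec_till_page) -> L(less_16_till_page) ->
     L(less_8_till_page) -> L(less_4_loop).  */
  size_t chunk = safe >= 16 ? 16 : safe >= 8 ? 8 : safe >= 4 ? 4 : 1;

  for (size_t i = 0; i < chunk; i++)
    {
      unsigned char c1 = s1[i], c2 = s2[i];
      if (c1 != c2 || c1 == '\0')
	{
	  *done = 1;
	  return c1 - c2;
	}
    }
  *done = 0;
  return 0;
}

int
main (void)
{
  int done;
  printf ("%d\n", cmp_chunk_before_page_cross ("abcdef", "abcxef", &done));
  return 0;
}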