From 5e9c6a33e767576c063e1fc0077b3a749518e8f0 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 10 Jan 2022 15:35:38 -0600
Subject: [PATCH] x86: Optimize strcmp-avx2.S

Optimizations are primarily to the loop logic and to how the page
cross logic interacts with the loop.

The page cross logic is at times more expensive for short strings near
the end of a page but not crossing the page. This is done to retest
the page cross conditions with a non-faulting check and to improve the
logic for entering the loop afterwards. This affects only particular
cases, however, and is generally made up for by more than 10x
improvements on the transition from the page cross -> loop case.

The non-page cross cases are improved most for smaller sizes [0, 128]
and are about even for (128, 4096]. The loop page cross logic is
improved, so some more significant speedup is seen there as well.

test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
(cherry picked from commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45)
---
 sysdeps/x86_64/multiarch/strcmp-avx2.S | 1592 ++++++++++++++----------
 1 file changed, 940 insertions(+), 652 deletions(-)
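
Note (not part of the upstream commit): a minimal C sketch of the page
cross entry check described in the message above and implemented by the
`sall $20` / `cmpl` / `ja L(page_cross)` sequence in the diff below.
The helper name may_cross_page and this standalone framing are
assumptions for illustration only.  OR-ing the two page offsets can
over-report, which is why the page cross path re-tests for a true
positive before taking the slow path.

  #include <stdint.h>

  #define PAGE_SIZE 4096
  #define VEC_SIZE  32

  /* Return nonzero if loading 4 * VEC_SIZE bytes from s1 or s2 might
     cross a page boundary.  Shifting the OR of both addresses left by
     20 keeps only the low 12 bits (the page offset) in the comparable
     range, so a single unsigned compare covers both strings.  */
  static int
  may_cross_page (const char *s1, const char *s2)
  {
    uint32_t merged = ((uint32_t) (uintptr_t) s1
                       | (uint32_t) (uintptr_t) s2) << 20;
    uint32_t bound = (uint32_t) (PAGE_SIZE - VEC_SIZE * 4) << 20;
    return merged > bound;
  }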
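
Note (also not part of the upstream commit): the per-vector checks fold
the null-CHAR test into the equality mask (vpcmpeq + vpandn +
vpmovmskb) so that a single `incl` decides whether the whole vector
matched.  The helper vec_all_match below is hypothetical and simply
restates the "incl will overflow to zero in all equals case" comment
from the diff.

  /* MASK has one bit per byte of a 32-byte vector: 1 means the bytes
     of s1 and s2 are equal and the s1 byte is not null.  Adding 1
     wraps the all-ones mask to zero, so one flag test distinguishes
     "keep looping" from "mismatch or null found".  */
  static int
  vec_all_match (uint32_t mask)
  {
    return (uint32_t) (mask + 1) == 0;
  }
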
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 70d8499b..554ffe4c 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -26,35 +26,57 @@
513694
 
513694
 # define PAGE_SIZE	4096
513694
 
513694
-/* VEC_SIZE = Number of bytes in a ymm register */
513694
+	/* VEC_SIZE = Number of bytes in a ymm register.  */
513694
 # define VEC_SIZE	32
513694
 
513694
-/* Shift for dividing by (VEC_SIZE * 4).  */
513694
-# define DIVIDE_BY_VEC_4_SHIFT	7
513694
-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
513694
-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
513694
-# endif
513694
+# define VMOVU	vmovdqu
513694
+# define VMOVA	vmovdqa
513694
 
513694
 # ifdef USE_AS_WCSCMP
513694
-/* Compare packed dwords.  */
513694
+	/* Compare packed dwords.  */
513694
 #  define VPCMPEQ	vpcmpeqd
513694
-/* Compare packed dwords and store minimum.  */
513694
+	/* Compare packed dwords and store minimum.  */
513694
 #  define VPMINU	vpminud
513694
-/* 1 dword char == 4 bytes.  */
513694
+	/* 1 dword char == 4 bytes.  */
513694
 #  define SIZE_OF_CHAR	4
513694
 # else
513694
-/* Compare packed bytes.  */
513694
+	/* Compare packed bytes.  */
513694
 #  define VPCMPEQ	vpcmpeqb
513694
-/* Compare packed bytes and store minimum.  */
513694
+	/* Compare packed bytes and store minimum.  */
513694
 #  define VPMINU	vpminub
513694
-/* 1 byte char == 1 byte.  */
513694
+	/* 1 byte char == 1 byte.  */
513694
 #  define SIZE_OF_CHAR	1
513694
 # endif
513694
 
513694
+# ifdef USE_AS_STRNCMP
513694
+#  define LOOP_REG	r9d
513694
+#  define LOOP_REG64	r9
513694
+
513694
+#  define OFFSET_REG8	r9b
513694
+#  define OFFSET_REG	r9d
513694
+#  define OFFSET_REG64	r9
513694
+# else
513694
+#  define LOOP_REG	edx
513694
+#  define LOOP_REG64	rdx
513694
+
513694
+#  define OFFSET_REG8	dl
513694
+#  define OFFSET_REG	edx
513694
+#  define OFFSET_REG64	rdx
513694
+# endif
513694
+
513694
 # ifndef VZEROUPPER
513694
 #  define VZEROUPPER	vzeroupper
513694
 # endif
513694
 
513694
+# if defined USE_AS_STRNCMP
513694
+#  define VEC_OFFSET	0
513694
+# else
513694
+#  define VEC_OFFSET	(-VEC_SIZE)
513694
+# endif
513694
+
513694
+# define xmmZERO	xmm15
513694
+# define ymmZERO	ymm15
513694
+
513694
 # ifndef SECTION
513694
 #  define SECTION(p)	p##.avx
513694
 # endif
513694
@@ -79,783 +101,1049 @@
513694
    the maximum offset is reached before a difference is found, zero is
513694
    returned.  */
513694
 
513694
-	.section SECTION(.text),"ax",@progbits
513694
-ENTRY (STRCMP)
513694
+	.section SECTION(.text), "ax", @progbits
513694
+ENTRY(STRCMP)
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Check for simple cases (0 or 1) in offset.  */
513694
+#  ifdef __ILP32__
513694
+	/* Clear the upper 32 bits.  */
513694
+	movl	%edx, %rdx
513694
+#  endif
513694
 	cmp	$1, %RDX_LP
513694
-	je	L(char0)
513694
-	jb	L(zero)
513694
+	/* Signed comparison intentional. We use this branch to also
513694
+	   test cases where length >= 2^63. These very large sizes can be
513694
+	   handled with strcmp as there is no way for that length to
513694
+	   actually bound the buffer.  */
513694
+	jle	L(one_or_less)
513694
 #  ifdef USE_AS_WCSCMP
513694
-#  ifndef __ILP32__
513694
 	movq	%rdx, %rcx
513694
-	/* Check if length could overflow when multiplied by
513694
-	   sizeof(wchar_t). Checking top 8 bits will cover all potential
513694
-	   overflow cases as well as redirect cases where its impossible to
513694
-	   length to bound a valid memory region. In these cases just use
513694
-	   'wcscmp'.  */
513694
+
513694
+	/* Multiplying length by sizeof(wchar_t) can result in overflow.
513694
+	   Check if that is possible. All cases where overflow are possible
513694
+	   are cases where length is large enough that it can never be a
513694
+	   bound on valid memory so just use wcscmp.  */
513694
 	shrq	$56, %rcx
513694
-	jnz	OVERFLOW_STRCMP
513694
-#  endif
513694
-	/* Convert units: from wide to byte char.  */
513694
-	shl	$2, %RDX_LP
513694
+	jnz	__wcscmp_avx2
513694
+
513694
+	leaq	(, %rdx, 4), %rdx
513694
 #  endif
513694
-	/* Register %r11 tracks the maximum offset.  */
513694
-	mov	%RDX_LP, %R11_LP
513694
 # endif
513694
+	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
513694
 	movl	%edi, %eax
513694
-	xorl	%edx, %edx
513694
-	/* Make %xmm7 (%ymm7) all zeros in this function.  */
513694
-	vpxor	%xmm7, %xmm7, %xmm7
513694
 	orl	%esi, %eax
513694
-	andl	$(PAGE_SIZE - 1), %eax
513694
-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
513694
-	jg	L(cross_page)
513694
-	/* Start comparing 4 vectors.  */
513694
-	vmovdqu	(%rdi), %ymm1
513694
-	VPCMPEQ	(%rsi), %ymm1, %ymm0
513694
-	VPMINU	%ymm1, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
513694
-	vpmovmskb %ymm0, %ecx
513694
-	testl	%ecx, %ecx
513694
-	je	L(next_3_vectors)
513694
-	tzcntl	%ecx, %edx
513694
+	sall	$20, %eax
513694
+	/* Check if s1 or s2 may cross a page  in next 4x VEC loads.  */
513694
+	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
513694
+	ja	L(page_cross)
513694
+
513694
+L(no_page_cross):
513694
+	/* Safe to compare 4x vectors.  */
513694
+	VMOVU	(%rdi), %ymm0
513694
+	/* 1s where s1 and s2 equal.  */
513694
+	VPCMPEQ	(%rsi), %ymm0, %ymm1
513694
+	/* 1s at null CHAR.  */
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	/* 1s where s1 and s2 equal AND not null CHAR.  */
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+
513694
+	/* All 1s -> keep going, any 0s -> return.  */
513694
+	vpmovmskb %ymm1, %ecx
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx) is after the maximum
513694
-	   offset (%r11).   */
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+	cmpq	$VEC_SIZE, %rdx
513694
+	jbe	L(vec_0_test_len)
513694
 # endif
513694
+
513694
+	/* All 1s represents all equals. incl will overflow to zero in
513694
+	   all equals case. Otherwise 1s will carry until position of first
513694
+	   mismatch.  */
513694
+	incl	%ecx
513694
+	jz	L(more_3x_vec)
513694
+
513694
+	.p2align 4,, 4
513694
+L(return_vec_0):
513694
+	tzcntl	%ecx, %ecx
513694
 # ifdef USE_AS_WCSCMP
513694
+	movl	(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	je	L(return)
513694
-L(wcscmp_return):
513694
+	cmpl	(%rsi, %rcx), %edx
513694
+	je	L(ret0)
513694
 	setl	%al
513694
 	negl	%eax
513694
 	orl	$1, %eax
513694
-L(return):
513694
 # else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	movzbl	(%rdi, %rcx), %eax
513694
+	movzbl	(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
 # endif
513694
+L(ret0):
513694
 L(return_vzeroupper):
513694
 	ZERO_UPPER_VEC_REGISTERS_RETURN
513694
 
513694
-	.p2align 4
513694
-L(return_vec_size):
513694
-	tzcntl	%ecx, %edx
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
513694
-	   the maximum offset (%r11).  */
513694
-	addq	$VEC_SIZE, %rdx
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
+	.p2align 4,, 8
513694
+L(vec_0_test_len):
513694
+	notl	%ecx
513694
+	bzhil	%edx, %ecx, %eax
513694
+	jnz	L(return_vec_0)
513694
+	/* Align if will cross fetch block.  */
513694
+	.p2align 4,, 2
513694
+L(ret_zero):
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# else
513694
+	VZEROUPPER_RETURN
513694
+
513694
+	.p2align 4,, 5
513694
+L(one_or_less):
513694
+	jb	L(ret_zero)
513694
 #  ifdef USE_AS_WCSCMP
513694
+	/* 'nbe' covers the case where length is negative (large
513694
+	   unsigned).  */
513694
+	jnbe	__wcscmp_avx2
513694
+	movl	(%rdi), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	VEC_SIZE(%rdi, %rdx), %ecx
513694
-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
+	cmpl	(%rsi), %edx
513694
+	je	L(ret1)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
 #  else
513694
-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
513694
-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	/* 'nbe' covers the case where length is negative (large
513694
+	   unsigned).  */
513694
+
513694
+	jnbe	__strcmp_avx2
513694
+	movzbl	(%rdi), %eax
513694
+	movzbl	(%rsi), %ecx
513694
+	subl	%ecx, %eax
513694
 #  endif
513694
+L(ret1):
513694
+	ret
513694
 # endif
513694
-	VZEROUPPER_RETURN
513694
 
513694
-	.p2align 4
513694
-L(return_2_vec_size):
513694
-	tzcntl	%ecx, %edx
513694
+	.p2align 4,, 10
513694
+L(return_vec_1):
513694
+	tzcntl	%ecx, %ecx
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
513694
-	   after the maximum offset (%r11).  */
513694
-	addq	$(VEC_SIZE * 2), %rdx
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
+	/* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of
513694
+	   overflow.  */
513694
+	addq	$-VEC_SIZE, %rdx
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	VEC_SIZE(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
513694
+	je	L(ret2)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
513694
-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
513694
-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
513694
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
 # endif
513694
+L(ret2):
513694
 	VZEROUPPER_RETURN
513694
 
513694
-	.p2align 4
513694
-L(return_3_vec_size):
513694
-	tzcntl	%ecx, %edx
513694
+	.p2align 4,, 10
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
513694
-	   after the maximum offset (%r11).  */
513694
-	addq	$(VEC_SIZE * 3), %rdx
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
+L(return_vec_3):
513694
+	salq	$32, %rcx
513694
+# endif
513694
+
513694
+L(return_vec_2):
513694
+# ifndef USE_AS_STRNCMP
513694
+	tzcntl	%ecx, %ecx
513694
+# else
513694
+	tzcntq	%rcx, %rcx
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
513694
+	je	L(ret3)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
 # else
513694
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
513694
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+# endif
513694
+L(ret3):
513694
+	VZEROUPPER_RETURN
513694
+
513694
+# ifndef USE_AS_STRNCMP
513694
+	.p2align 4,, 10
513694
+L(return_vec_3):
513694
+	tzcntl	%ecx, %ecx
513694
 #  ifdef USE_AS_WCSCMP
513694
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
513694
-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
513694
+	je	L(ret4)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
 #  else
513694
-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
513694
-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
513694
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
 #  endif
513694
-# endif
513694
+L(ret4):
513694
 	VZEROUPPER_RETURN
513694
+# endif
513694
+
513694
+	.p2align 4,, 10
513694
+L(more_3x_vec):
513694
+	/* Safe to compare 4x vectors.  */
513694
+	VMOVU	VEC_SIZE(%rdi), %ymm0
513694
+	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_1)
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	subq	$(VEC_SIZE * 2), %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+
513694
+	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
513694
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_2)
513694
+
513694
+	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
513694
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_3)
513694
 
513694
-	.p2align 4
513694
-L(next_3_vectors):
513694
-	vmovdqu	VEC_SIZE(%rdi), %ymm6
513694
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
513694
-	VPMINU	%ymm6, %ymm3, %ymm3
513694
-	VPCMPEQ	%ymm7, %ymm3, %ymm3
513694
-	vpmovmskb %ymm3, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(return_vec_size)
513694
-	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
513694
-	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
513694
-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
513694
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
513694
-	VPMINU	%ymm5, %ymm2, %ymm2
513694
-	VPCMPEQ	%ymm4, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm2, %ymm2
513694
-	vpmovmskb %ymm2, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(return_2_vec_size)
513694
-	VPMINU	%ymm4, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
513694
-	vpmovmskb %ymm0, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(return_3_vec_size)
513694
-L(main_loop_header):
513694
-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
513694
-	movl	$PAGE_SIZE, %ecx
513694
-	/* Align load via RAX.  */
513694
-	andq	$-(VEC_SIZE * 4), %rdx
513694
-	subq	%rdi, %rdx
513694
-	leaq	(%rdi, %rdx), %rax
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Starting from this point, the maximum offset, or simply the
513694
-	   'offset', DECREASES by the same amount when base pointers are
513694
-	   moved forward.  Return 0 when:
513694
-	     1) On match: offset <= the matched vector index.
513694
-	     2) On mistmach, offset is before the mistmatched index.
513694
+	cmpq	$(VEC_SIZE * 2), %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+
513694
+# ifdef USE_AS_WCSCMP
513694
+	/* any non-zero positive value that doesn't inference with 0x1.
513694
 	 */
513694
-	subq	%rdx, %r11
513694
-	jbe	L(zero)
513694
-# endif
513694
-	addq	%rsi, %rdx
513694
-	movq	%rdx, %rsi
513694
-	andl	$(PAGE_SIZE - 1), %esi
513694
-	/* Number of bytes before page crossing.  */
513694
-	subq	%rsi, %rcx
513694
-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
513694
-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
513694
-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
513694
-	movl	%ecx, %esi
513694
-	jmp	L(loop_start)
513694
+	movl	$2, %r8d
513694
 
513694
+# else
513694
+	xorl	%r8d, %r8d
513694
+# endif
513694
+
513694
+	/* The prepare labels are various entry points from the page
513694
+	   cross logic.  */
513694
+L(prepare_loop):
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	/* Store N + (VEC_SIZE * 4) and place check at the begining of
513694
+	   the loop.  */
513694
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
513694
+# endif
513694
+L(prepare_loop_no_len):
513694
+
513694
+	/* Align s1 and adjust s2 accordingly.  */
513694
+	subq	%rdi, %rsi
513694
+	andq	$-(VEC_SIZE * 4), %rdi
513694
+	addq	%rdi, %rsi
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	subq	%rdi, %rdx
513694
+# endif
513694
+
513694
+L(prepare_loop_aligned):
513694
+	/* eax stores distance from rsi to next page cross. These cases
513694
+	   need to be handled specially as the 4x loop could potentially
513694
+	   read memory past the length of s1 or s2 and across a page
513694
+	   boundary.  */
513694
+	movl	$-(VEC_SIZE * 4), %eax
513694
+	subl	%esi, %eax
513694
+	andl	$(PAGE_SIZE - 1), %eax
513694
+
513694
+	/* Loop 4x comparisons at a time.  */
513694
 	.p2align 4
513694
 L(loop):
513694
+
513694
+	/* End condition for strncmp.  */
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
513694
-	   the maximum offset (%r11) by the same amount.  */
513694
-	subq	$(VEC_SIZE * 4), %r11
513694
-	jbe	L(zero)
513694
-# endif
513694
-	addq	$(VEC_SIZE * 4), %rax
513694
-	addq	$(VEC_SIZE * 4), %rdx
513694
-L(loop_start):
513694
-	testl	%esi, %esi
513694
-	leal	-1(%esi), %esi
513694
-	je	L(loop_cross_page)
513694
-L(back_to_loop):
513694
-	/* Main loop, comparing 4 vectors are a time.  */
513694
-	vmovdqa	(%rax), %ymm0
513694
-	vmovdqa	VEC_SIZE(%rax), %ymm3
513694
-	VPCMPEQ	(%rdx), %ymm0, %ymm4
513694
-	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
513694
-	VPMINU	%ymm0, %ymm4, %ymm4
513694
-	VPMINU	%ymm3, %ymm1, %ymm1
513694
-	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
513694
-	VPMINU	%ymm1, %ymm4, %ymm0
513694
-	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
513694
-	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
513694
-	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
513694
-	VPMINU	%ymm2, %ymm5, %ymm5
513694
-	VPMINU	%ymm3, %ymm6, %ymm6
513694
-	VPMINU	%ymm5, %ymm0, %ymm0
513694
-	VPMINU	%ymm6, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
513694
-
513694
-	/* Test each mask (32 bits) individually because for VEC_SIZE
513694
-	   == 32 is not possible to OR the four masks and keep all bits
513694
-	   in a 64-bit integer register, differing from SSE2 strcmp
513694
-	   where ORing is possible.  */
513694
-	vpmovmskb %ymm0, %ecx
513694
+	subq	$(VEC_SIZE * 4), %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+
513694
+	subq	$-(VEC_SIZE * 4), %rdi
513694
+	subq	$-(VEC_SIZE * 4), %rsi
513694
+
513694
+	/* Check if rsi loads will cross a page boundary.  */
513694
+	addl	$-(VEC_SIZE * 4), %eax
513694
+	jnb	L(page_cross_during_loop)
513694
+
513694
+	/* Loop entry after handling page cross during loop.  */
513694
+L(loop_skip_page_cross_check):
513694
+	VMOVA	(VEC_SIZE * 0)(%rdi), %ymm0
513694
+	VMOVA	(VEC_SIZE * 1)(%rdi), %ymm2
513694
+	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
513694
+	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
513694
+
513694
+	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
513694
+	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
513694
+
513694
+	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
513694
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
513694
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
513694
+
513694
+
513694
+	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
513694
+	   zero.  */
513694
+	vpand	%ymm0, %ymm1, %ymm1
513694
+
513694
+
513694
+	vpand	%ymm2, %ymm3, %ymm3
513694
+	vpand	%ymm4, %ymm5, %ymm5
513694
+	vpand	%ymm6, %ymm7, %ymm7
513694
+
513694
+	VPMINU	%ymm1, %ymm3, %ymm3
513694
+	VPMINU	%ymm5, %ymm7, %ymm7
513694
+
513694
+	/* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
513694
+	VPMINU	%ymm3, %ymm7, %ymm7
513694
+
513694
+	/* If any 0 CHAR then done.  */
513694
+	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
513694
+	vpmovmskb %ymm7, %LOOP_REG
513694
+	testl	%LOOP_REG, %LOOP_REG
513694
+	jz	L(loop)
513694
+
513694
+	/* Find which VEC has the mismatch of end of string.  */
513694
+	VPCMPEQ	%ymm1, %ymmZERO, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
 	testl	%ecx, %ecx
513694
-	je	L(loop)
513694
-	VPCMPEQ	%ymm7, %ymm4, %ymm0
513694
-	vpmovmskb %ymm0, %edi
513694
-	testl	%edi, %edi
513694
-	je	L(test_vec)
513694
-	tzcntl	%edi, %ecx
513694
+	jnz	L(return_vec_0_end)
513694
+
513694
+
513694
+	VPCMPEQ	%ymm3, %ymmZERO, %ymm3
513694
+	vpmovmskb %ymm3, %ecx
513694
+	testl	%ecx, %ecx
513694
+	jnz	L(return_vec_1_end)
513694
+
513694
+L(return_vec_2_3_end):
513694
 # ifdef USE_AS_STRNCMP
513694
-	cmpq	%rcx, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	subq	$(VEC_SIZE * 2), %rdx
513694
+	jbe	L(ret_zero_end)
513694
+# endif
513694
+
513694
+	VPCMPEQ	%ymm5, %ymmZERO, %ymm5
513694
+	vpmovmskb %ymm5, %ecx
513694
+	testl	%ecx, %ecx
513694
+	jnz	L(return_vec_2_end)
513694
+
513694
+	/* LOOP_REG contains matches for null/mismatch from the loop. If
513694
+	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
513694
+	   must entirely be from VEC 3 which is fully represented by
513694
+	   LOOP_REG.  */
513694
+	tzcntl	%LOOP_REG, %LOOP_REG
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	subl	$-(VEC_SIZE), %LOOP_REG
513694
+	cmpq	%LOOP_REG64, %rdx
513694
+	jbe	L(ret_zero_end)
513694
+# endif
513694
+
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
513694
+	je	L(ret5)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
513694
+	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret5):
513694
 	VZEROUPPER_RETURN
513694
 
513694
-	.p2align 4
513694
-L(test_vec):
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* The first vector matched.  Return 0 if the maximum offset
513694
-	   (%r11) <= VEC_SIZE.  */
513694
-	cmpq	$VEC_SIZE, %r11
513694
-	jbe	L(zero)
513694
+	.p2align 4,, 2
513694
+L(ret_zero_end):
513694
+	xorl	%eax, %eax
513694
+	VZEROUPPER_RETURN
513694
 # endif
513694
-	VPCMPEQ	%ymm7, %ymm1, %ymm1
513694
-	vpmovmskb %ymm1, %ecx
513694
-	testl	%ecx, %ecx
513694
-	je	L(test_2_vec)
513694
-	tzcntl	%ecx, %edi
513694
+
513694
+
513694
+	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
513694
+	   they use the value of `r8` to negate the return value. This is
513694
+	   because the page cross logic can swap `rdi` and `rsi`.  */
513694
+	.p2align 4,, 10
513694
 # ifdef USE_AS_STRNCMP
513694
-	addq	$VEC_SIZE, %rdi
513694
-	cmpq	%rdi, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+L(return_vec_1_end):
513694
+	salq	$32, %rcx
513694
+# endif
513694
+L(return_vec_0_end):
513694
+# ifndef USE_AS_STRNCMP
513694
+	tzcntl	%ecx, %ecx
513694
+# else
513694
+	tzcntq	%rcx, %rcx
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero_end)
513694
+# endif
513694
+
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rdi), %ecx
513694
-	cmpl	(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rdi), %eax
513694
-	movzbl	(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	(%rsi, %rcx), %edx
513694
+	je	L(ret6)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 # else
513694
+	movzbl	(%rdi, %rcx), %eax
513694
+	movzbl	(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
+# endif
513694
+L(ret6):
513694
+	VZEROUPPER_RETURN
513694
+
513694
+# ifndef USE_AS_STRNCMP
513694
+	.p2align 4,, 10
513694
+L(return_vec_1_end):
513694
+	tzcntl	%ecx, %ecx
513694
 #  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	movl	VEC_SIZE(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	VEC_SIZE(%rsi, %rdi), %ecx
513694
-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
513694
+	je	L(ret7)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 #  else
513694
-	movzbl	VEC_SIZE(%rax, %rdi), %eax
513694
-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
513694
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 #  endif
513694
-# endif
513694
+L(ret7):
513694
 	VZEROUPPER_RETURN
513694
+# endif
513694
 
513694
-	.p2align 4
513694
-L(test_2_vec):
513694
+	.p2align 4,, 10
513694
+L(return_vec_2_end):
513694
+	tzcntl	%ecx, %ecx
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* The first 2 vectors matched.  Return 0 if the maximum offset
513694
-	   (%r11) <= 2 * VEC_SIZE.  */
513694
-	cmpq	$(VEC_SIZE * 2), %r11
513694
-	jbe	L(zero)
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero_page_cross)
513694
 # endif
513694
-	VPCMPEQ	%ymm7, %ymm5, %ymm5
513694
-	vpmovmskb %ymm5, %ecx
513694
-	testl	%ecx, %ecx
513694
-	je	L(test_3_vec)
513694
-	tzcntl	%ecx, %edi
513694
-# ifdef USE_AS_STRNCMP
513694
-	addq	$(VEC_SIZE * 2), %rdi
513694
-	cmpq	%rdi, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rdi), %ecx
513694
-	cmpl	(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rdi), %eax
513694
-	movzbl	(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
513694
+	je	L(ret11)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
513694
-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
513694
-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
513694
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret11):
513694
 	VZEROUPPER_RETURN
513694
 
513694
-	.p2align 4
513694
-L(test_3_vec):
513694
+
513694
+	/* Page cross in rsi in next 4x VEC.  */
513694
+
513694
+	/* TODO: Improve logic here.  */
513694
+	.p2align 4,, 10
513694
+L(page_cross_during_loop):
513694
+	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
513694
+
513694
+	/* Optimistically rsi and rdi and both aligned inwhich case we
513694
+	   don't need any logic here.  */
513694
+	cmpl	$-(VEC_SIZE * 4), %eax
513694
+	/* Don't adjust eax before jumping back to loop and we will
513694
+	   never hit page cross case again.  */
513694
+	je	L(loop_skip_page_cross_check)
513694
+
513694
+	/* Check if we can safely load a VEC.  */
513694
+	cmpl	$-(VEC_SIZE * 3), %eax
513694
+	jle	L(less_1x_vec_till_page_cross)
513694
+
513694
+	VMOVA	(%rdi), %ymm0
513694
+	VPCMPEQ	(%rsi), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_0_end)
513694
+
513694
+	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
513694
+	cmpl	$-(VEC_SIZE * 2), %eax
513694
+	jg	L(more_2x_vec_till_page_cross)
513694
+
513694
+	.p2align 4,, 4
513694
+L(less_1x_vec_till_page_cross):
513694
+	subl	$-(VEC_SIZE * 4), %eax
513694
+	/* Guranteed safe to read from rdi - VEC_SIZE here. The only
513694
+	   concerning case is first iteration if incoming s1 was near start
513694
+	   of a page and s2 near end. If s1 was near the start of the page
513694
+	   we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
513694
+	   to read back -VEC_SIZE. If rdi is truly at the start of a page
513694
+	   here, it means the previous page (rdi - VEC_SIZE) has already
513694
+	   been loaded earlier so must be valid.  */
513694
+	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
513694
+	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+
513694
+	/* Mask of potentially valid bits. The lower bits can be out of
513694
+	   range comparisons (but safe regarding page crosses).  */
513694
+	movl	$-1, %r10d
513694
+	shlxl	%esi, %r10d, %r10d
513694
+	notl	%ecx
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* The first 3 vectors matched.  Return 0 if the maximum offset
513694
-	   (%r11) <= 3 * VEC_SIZE.  */
513694
-	cmpq	$(VEC_SIZE * 3), %r11
513694
-	jbe	L(zero)
513694
-# endif
513694
-	VPCMPEQ	%ymm7, %ymm6, %ymm6
513694
-	vpmovmskb %ymm6, %esi
513694
-	tzcntl	%esi, %ecx
513694
+	cmpq	%rax, %rdx
513694
+	jbe	L(return_page_cross_end_check)
513694
+# endif
513694
+	movl	%eax, %OFFSET_REG
513694
+	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
513694
+
513694
+	andl	%r10d, %ecx
513694
+	jz	L(loop_skip_page_cross_check)
513694
+
513694
+	.p2align 4,, 3
513694
+L(return_page_cross_end):
513694
+	tzcntl	%ecx, %ecx
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	addq	$(VEC_SIZE * 3), %rcx
513694
-	cmpq	%rcx, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %esi
513694
-	cmpl	(%rdx, %rcx), %esi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	leal	-VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
513694
+L(return_page_cross_cmp_mem):
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	addl	%OFFSET_REG, %ecx
513694
+# endif
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
513694
-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
513694
-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
513694
+	je	L(ret8)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
+# else
513694
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
513694
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret8):
513694
 	VZEROUPPER_RETURN
513694
 
513694
-	.p2align 4
513694
-L(loop_cross_page):
513694
-	xorl	%r10d, %r10d
513694
-	movq	%rdx, %rcx
513694
-	/* Align load via RDX.  We load the extra ECX bytes which should
513694
-	   be ignored.  */
513694
-	andl	$((VEC_SIZE * 4) - 1), %ecx
513694
-	/* R10 is -RCX.  */
513694
-	subq	%rcx, %r10
513694
-
513694
-	/* This works only if VEC_SIZE * 2 == 64. */
513694
-# if (VEC_SIZE * 2) != 64
513694
-#  error (VEC_SIZE * 2) != 64
513694
-# endif
513694
-
513694
-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
513694
-	cmpl	$(VEC_SIZE * 2), %ecx
513694
-	jge	L(loop_cross_page_2_vec)
513694
-
513694
-	vmovdqu	(%rax, %r10), %ymm2
513694
-	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
513694
-	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
513694
-	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
513694
-	VPMINU	%ymm2, %ymm0, %ymm0
513694
-	VPMINU	%ymm3, %ymm1, %ymm1
513694
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm1, %ymm1
513694
-
513694
-	vpmovmskb %ymm0, %edi
513694
-	vpmovmskb %ymm1, %esi
513694
-
513694
-	salq	$32, %rsi
513694
-	xorq	%rsi, %rdi
513694
-
513694
-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
513694
-	shrq	%cl, %rdi
513694
-
513694
-	testq	%rdi, %rdi
513694
-	je	L(loop_cross_page_2_vec)
513694
-	tzcntq	%rdi, %rcx
513694
 # ifdef USE_AS_STRNCMP
513694
-	cmpq	%rcx, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	.p2align 4,, 10
513694
+L(return_page_cross_end_check):
513694
+	tzcntl	%ecx, %ecx
513694
+	leal	-VEC_SIZE(%rax, %rcx), %ecx
513694
+	cmpl	%ecx, %edx
513694
+	ja	L(return_page_cross_cmp_mem)
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# endif
513694
 	VZEROUPPER_RETURN
513694
+# endif
513694
 
513694
-	.p2align 4
513694
-L(loop_cross_page_2_vec):
513694
-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
513694
-	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
513694
-	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
513694
-	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
513694
-	VPMINU	%ymm2, %ymm5, %ymm5
513694
-	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
513694
-	VPCMPEQ	%ymm7, %ymm5, %ymm5
513694
-	VPMINU	%ymm3, %ymm6, %ymm6
513694
-	VPCMPEQ	%ymm7, %ymm6, %ymm6
513694
-
513694
-	vpmovmskb %ymm5, %edi
513694
-	vpmovmskb %ymm6, %esi
513694
-
513694
-	salq	$32, %rsi
513694
-	xorq	%rsi, %rdi
513694
 
513694
-	xorl	%r8d, %r8d
513694
-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
513694
-	subl	$(VEC_SIZE * 2), %ecx
513694
-	jle	1f
513694
-	/* Skip ECX bytes.  */
513694
-	shrq	%cl, %rdi
513694
-	/* R8 has number of bytes skipped.  */
513694
-	movl	%ecx, %r8d
513694
-1:
513694
-	/* Before jumping back to the loop, set ESI to the number of
513694
-	   VEC_SIZE * 4 blocks before page crossing.  */
513694
-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
513694
-
513694
-	testq	%rdi, %rdi
513694
+	.p2align 4,, 10
513694
+L(more_2x_vec_till_page_cross):
513694
+	/* If more 2x vec till cross we will complete a full loop
513694
+	   iteration here.  */
513694
+
513694
+	VMOVU	VEC_SIZE(%rdi), %ymm0
513694
+	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_1_end)
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* At this point, if %rdi value is 0, it already tested
513694
-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
513694
-	   checks whether strncmp maximum offset reached or not.  */
513694
-	je	L(string_nbyte_offset_check)
513694
-# else
513694
-	je	L(back_to_loop)
513694
+	cmpq	$(VEC_SIZE * 2), %rdx
513694
+	jbe	L(ret_zero_in_loop_page_cross)
513694
 # endif
513694
-	tzcntq	%rdi, %rcx
513694
-	addq	%r10, %rcx
513694
-	/* Adjust for number of bytes skipped.  */
513694
-	addq	%r8, %rcx
513694
+
513694
+	subl	$-(VEC_SIZE * 4), %eax
513694
+
513694
+	/* Safe to include comparisons from lower bytes.  */
513694
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
513694
+	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_page_cross_0)
513694
+
513694
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
513694
+	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_page_cross_1)
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	addq	$(VEC_SIZE * 2), %rcx
513694
-	subq	%rcx, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	/* Must check length here as length might proclude reading next
513694
+	   page.  */
513694
+	cmpq	%rax, %rdx
513694
+	jbe	L(ret_zero_in_loop_page_cross)
513694
+# endif
513694
+
513694
+	/* Finish the loop.  */
513694
+	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
513694
+	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
513694
+
513694
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
513694
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
513694
+	vpand	%ymm4, %ymm5, %ymm5
513694
+	vpand	%ymm6, %ymm7, %ymm7
513694
+	VPMINU	%ymm5, %ymm7, %ymm7
513694
+	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
513694
+	vpmovmskb %ymm7, %LOOP_REG
513694
+	testl	%LOOP_REG, %LOOP_REG
513694
+	jnz	L(return_vec_2_3_end)
513694
+
513694
+	/* Best for code size to include ucond-jmp here. Would be faster
513694
+	   if this case is hot to duplicate the L(return_vec_2_3_end) code
513694
+	   as fall-through and have jump back to loop on mismatch
513694
+	   comparison.  */
513694
+	subq	$-(VEC_SIZE * 4), %rdi
513694
+	subq	$-(VEC_SIZE * 4), %rsi
513694
+	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
513694
+# ifdef USE_AS_STRNCMP
513694
+	subq	$(VEC_SIZE * 4), %rdx
513694
+	ja	L(loop_skip_page_cross_check)
513694
+L(ret_zero_in_loop_page_cross):
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	VZEROUPPER_RETURN
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
513694
-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
513694
-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	jmp	L(loop_skip_page_cross_check)
513694
 # endif
513694
-	VZEROUPPER_RETURN
513694
 
513694
+
513694
+	.p2align 4,, 10
513694
+L(return_vec_page_cross_0):
513694
+	addl	$-VEC_SIZE, %eax
513694
+L(return_vec_page_cross_1):
513694
+	tzcntl	%ecx, %ecx
513694
 # ifdef USE_AS_STRNCMP
513694
-L(string_nbyte_offset_check):
513694
-	leaq	(VEC_SIZE * 4)(%r10), %r10
513694
-	cmpq	%r10, %r11
513694
-	jbe	L(zero)
513694
-	jmp	L(back_to_loop)
513694
+	leal	-VEC_SIZE(%rax, %rcx), %ecx
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero_in_loop_page_cross)
513694
+# else
513694
+	addl	%eax, %ecx
513694
 # endif
513694
 
513694
-	.p2align 4
513694
-L(cross_page_loop):
513694
-	/* Check one byte/dword at a time.  */
513694
 # ifdef USE_AS_WCSCMP
513694
-	cmpl	%ecx, %eax
513694
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
513694
+	xorl	%eax, %eax
513694
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
513694
+	je	L(ret9)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 # else
513694
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
513694
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
513694
 	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
-	jne	L(different)
513694
-	addl	$SIZE_OF_CHAR, %edx
513694
-	cmpl	$(VEC_SIZE * 4), %edx
513694
-	je	L(main_loop_header)
513694
-# ifdef USE_AS_STRNCMP
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+L(ret9):
513694
+	VZEROUPPER_RETURN
513694
+
513694
+
513694
+	.p2align 4,, 10
513694
+L(page_cross):
513694
+# ifndef USE_AS_STRNCMP
513694
+	/* If both are VEC aligned we don't need any special logic here.
513694
+	   Only valid for strcmp where stop condition is guranteed to be
513694
+	   reachable by just reading memory.  */
513694
+	testl	$((VEC_SIZE - 1) << 20), %eax
513694
+	jz	L(no_page_cross)
513694
 # endif
513694
+
513694
+	movl	%edi, %eax
513694
+	movl	%esi, %ecx
513694
+	andl	$(PAGE_SIZE - 1), %eax
513694
+	andl	$(PAGE_SIZE - 1), %ecx
513694
+
513694
+	xorl	%OFFSET_REG, %OFFSET_REG
513694
+
513694
+	/* Check which is closer to page cross, s1 or s2.  */
513694
+	cmpl	%eax, %ecx
513694
+	jg	L(page_cross_s2)
513694
+
513694
+	/* The previous page cross check has false positives. Check for
513694
+	   true positive as page cross logic is very expensive.  */
513694
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
513694
+	jbe	L(no_page_cross)
513694
+
513694
+	/* Set r8 to not interfere with normal return value (rdi and rsi
513694
+	   did not swap).  */
513694
 # ifdef USE_AS_WCSCMP
513694
-	movl	(%rdi, %rdx), %eax
513694
-	movl	(%rsi, %rdx), %ecx
513694
+	/* any non-zero positive value that doesn't inference with 0x1.
513694
+	 */
513694
+	movl	$2, %r8d
513694
 # else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %ecx
513694
+	xorl	%r8d, %r8d
513694
 # endif
513694
-	/* Check null char.  */
513694
-	testl	%eax, %eax
513694
-	jne	L(cross_page_loop)
513694
-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
513694
-	   comparisons.  */
513694
-	subl	%ecx, %eax
513694
-# ifndef USE_AS_WCSCMP
513694
-L(different):
513694
+
513694
+	/* Check if less than 1x VEC till page cross.  */
513694
+	subl	$(VEC_SIZE * 3), %eax
513694
+	jg	L(less_1x_vec_till_page)
513694
+
513694
+	/* If more than 1x VEC till page cross, loop throuh safely
513694
+	   loadable memory until within 1x VEC of page cross.  */
513694
+
513694
+	.p2align 4,, 10
513694
+L(page_cross_loop):
513694
+
513694
+	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
513694
+	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+	addl	$VEC_SIZE, %OFFSET_REG
513694
+# ifdef USE_AS_STRNCMP
513694
+	cmpq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross)
513694
 # endif
513694
-	VZEROUPPER_RETURN
513694
+	addl	$VEC_SIZE, %eax
513694
+	jl	L(page_cross_loop)
513694
+
513694
+	subl	%eax, %OFFSET_REG
513694
+	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
513694
+	   to not cross page so is safe to load. Since we have already
513694
+	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
513694
+	 */
513694
+
513694
+	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
513694
+	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	leal	VEC_SIZE(%OFFSET_REG64), %eax
513694
+	cmpq	%rax, %rdx
513694
+	jbe	L(check_ret_vec_page_cross2)
513694
+	addq	%rdi, %rdx
513694
+# endif
513694
+	incl	%ecx
513694
+	jz	L(prepare_loop_no_len)
513694
 
513694
+	.p2align 4,, 4
513694
+L(ret_vec_page_cross):
513694
+# ifndef USE_AS_STRNCMP
513694
+L(check_ret_vec_page_cross):
513694
+# endif
513694
+	tzcntl	%ecx, %ecx
513694
+	addl	%OFFSET_REG, %ecx
513694
+L(ret_vec_page_cross_cont):
513694
 # ifdef USE_AS_WCSCMP
513694
-	.p2align 4
513694
-L(different):
513694
-	/* Use movl to avoid modifying EFLAGS.  */
513694
-	movl	$0, %eax
513694
+	movl	(%rdi, %rcx), %edx
513694
+	xorl	%eax, %eax
513694
+	cmpl	(%rsi, %rcx), %edx
513694
+	je	L(ret12)
513694
 	setl	%al
513694
 	negl	%eax
513694
-	orl	$1, %eax
513694
-	VZEROUPPER_RETURN
513694
+	xorl	%r8d, %eax
513694
+# else
513694
+	movzbl	(%rdi, %rcx), %eax
513694
+	movzbl	(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret12):
513694
+	VZEROUPPER_RETURN
513694
 
513694
 # ifdef USE_AS_STRNCMP
513694
-	.p2align 4
513694
-L(zero):
513694
+	.p2align 4,, 10
513694
+L(check_ret_vec_page_cross2):
513694
+	incl	%ecx
513694
+L(check_ret_vec_page_cross):
513694
+	tzcntl	%ecx, %ecx
513694
+	addl	%OFFSET_REG, %ecx
513694
+	cmpq	%rcx, %rdx
513694
+	ja	L(ret_vec_page_cross_cont)
513694
+	.p2align 4,, 2
513694
+L(ret_zero_page_cross):
513694
 	xorl	%eax, %eax
513694
 	VZEROUPPER_RETURN
513694
+# endif
513694
 
513694
-	.p2align 4
513694
-L(char0):
513694
-#  ifdef USE_AS_WCSCMP
513694
-	xorl	%eax, %eax
513694
-	movl	(%rdi), %ecx
513694
-	cmpl	(%rsi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rsi), %ecx
513694
-	movzbl	(%rdi), %eax
513694
-	subl	%ecx, %eax
513694
-#  endif
513694
-	VZEROUPPER_RETURN
513694
+	.p2align 4,, 4
513694
+L(page_cross_s2):
513694
+	/* Ensure this is a true page cross.  */
513694
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
513694
+	jbe	L(no_page_cross)
513694
+
513694
+
513694
+	movl	%ecx, %eax
513694
+	movq	%rdi, %rcx
513694
+	movq	%rsi, %rdi
513694
+	movq	%rcx, %rsi
513694
+
513694
+	/* set r8 to negate return value as rdi and rsi swapped.  */
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	$-4, %r8d
513694
+# else
513694
+	movl	$-1, %r8d
513694
 # endif
513694
+	xorl	%OFFSET_REG, %OFFSET_REG
513694
 
513694
-	.p2align 4
513694
-L(last_vector):
513694
-	addq	%rdx, %rdi
513694
-	addq	%rdx, %rsi
513694
+	/* Check if more than 1x VEC till page cross.  */
513694
+	subl	$(VEC_SIZE * 3), %eax
513694
+	jle	L(page_cross_loop)
513694
+
513694
+	.p2align 4,, 6
513694
+L(less_1x_vec_till_page):
513694
+	/* Find largest load size we can use.  */
513694
+	cmpl	$16, %eax
513694
+	ja	L(less_16_till_page)
513694
+
513694
+	VMOVU	(%rdi), %xmm0
513694
+	VPCMPEQ	(%rsi), %xmm0, %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incw	%cx
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+	movl	$16, %OFFSET_REG
513694
 # ifdef USE_AS_STRNCMP
513694
-	subq	%rdx, %r11
513694
+	cmpq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case0)
513694
+	subl	%eax, %OFFSET_REG
513694
+# else
513694
+	/* Explicit check for 16 byte alignment.  */
513694
+	subl	%eax, %OFFSET_REG
513694
+	jz	L(prepare_loop)
513694
 # endif
513694
-	tzcntl	%ecx, %edx
513694
+
513694
+	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
513694
+	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incw	%cx
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+	addl	$16, %OFFSET_REG
513694
+	subq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case0)
513694
+	subq	$-(VEC_SIZE * 4), %rdx
513694
+
513694
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
+# else
513694
+	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
 # endif
513694
-# ifdef USE_AS_WCSCMP
513694
+	jmp	L(prepare_loop_aligned)
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	.p2align 4,, 2
513694
+L(ret_zero_page_cross_slow_case0):
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-# else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	ret
513694
 # endif
513694
-	VZEROUPPER_RETURN
513694
 
513694
-	/* Comparing on page boundary region requires special treatment:
513694
-	   It must done one vector at the time, starting with the wider
513694
-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
513694
-	   (xmm) still passes the boundary, byte comparison must be done.
513694
-	 */
513694
-	.p2align 4
513694
-L(cross_page):
513694
-	/* Try one ymm vector at a time.  */
513694
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
513694
-	jg	L(cross_page_1_vector)
513694
-L(loop_1_vector):
513694
-	vmovdqu	(%rdi, %rdx), %ymm1
513694
-	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
513694
-	VPMINU	%ymm1, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
513694
-	vpmovmskb %ymm0, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(last_vector)
513694
 
513694
-	addl	$VEC_SIZE, %edx
513694
+	.p2align 4,, 10
513694
+L(less_16_till_page):
513694
+	/* Find largest load size we can use.  */
513694
+	cmpl	$24, %eax
513694
+	ja	L(less_8_till_page)
513694
 
513694
-	addl	$VEC_SIZE, %eax
513694
-# ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
513694
-	   (%r11).  */
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-# endif
513694
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
513694
-	jle	L(loop_1_vector)
513694
-L(cross_page_1_vector):
513694
-	/* Less than 32 bytes to check, try one xmm vector.  */
513694
-	cmpl	$(PAGE_SIZE - 16), %eax
513694
-	jg	L(cross_page_1_xmm)
513694
-	vmovdqu	(%rdi, %rdx), %xmm1
513694
-	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
513694
-	VPMINU	%xmm1, %xmm0, %xmm0
513694
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
513694
-	vpmovmskb %xmm0, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(last_vector)
513694
+	vmovq	(%rdi), %xmm0
513694
+	vmovq	(%rsi), %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incb	%cl
513694
+	jnz	L(check_ret_vec_page_cross)
513694
 
513694
-	addl	$16, %edx
513694
-# ifndef USE_AS_WCSCMP
513694
-	addl	$16, %eax
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	cmpq	$8, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case0)
513694
 # endif
513694
+	movl	$24, %OFFSET_REG
513694
+	/* Explicit check for 16 byte alignment.  */
513694
+	subl	%eax, %OFFSET_REG
513694
+
513694
+
513694
+
513694
+	vmovq	(%rdi, %OFFSET_REG64), %xmm0
513694
+	vmovq	(%rsi, %OFFSET_REG64), %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incb	%cl
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
513694
-	   (%r11).  */
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-# endif
513694
-
513694
-L(cross_page_1_xmm):
513694
-# ifndef USE_AS_WCSCMP
513694
-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
513694
-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
513694
-	cmpl	$(PAGE_SIZE - 8), %eax
513694
-	jg	L(cross_page_8bytes)
513694
-	vmovq	(%rdi, %rdx), %xmm1
513694
-	vmovq	(%rsi, %rdx), %xmm0
513694
-	VPCMPEQ	%xmm0, %xmm1, %xmm0
513694
-	VPMINU	%xmm1, %xmm0, %xmm0
513694
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
513694
-	vpmovmskb %xmm0, %ecx
513694
-	/* Only last 8 bits are valid.  */
513694
-	andl	$0xff, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(last_vector)
513694
+	addl	$8, %OFFSET_REG
513694
+	subq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case0)
513694
+	subq	$-(VEC_SIZE * 4), %rdx
513694
 
513694
-	addl	$8, %edx
513694
-	addl	$8, %eax
513694
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
+# else
513694
+	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
+# endif
513694
+	jmp	L(prepare_loop_aligned)
513694
+
513694
+
513694
+	.p2align 4,, 10
513694
+L(less_8_till_page):
513694
+# ifdef USE_AS_WCSCMP
513694
+	/* If using wchar then this is the only check before we reach
513694
+	   the page boundary.  */
513694
+	movl	(%rdi), %eax
513694
+	movl	(%rsi), %ecx
513694
+	cmpl	%ecx, %eax
513694
+	jnz	L(ret_less_8_wcs)
513694
 #  ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
513694
-	   (%r11).  */
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+	addq	%rdi, %rdx
513694
+	/* We already checked for len <= 1 so cannot hit that case here.
513694
+	 */
513694
 #  endif
513694
+	testl	%eax, %eax
513694
+	jnz	L(prepare_loop_no_len)
513694
+	ret
513694
 
513694
-L(cross_page_8bytes):
513694
-	/* Less than 8 bytes to check, try 4 byte vector.  */
513694
-	cmpl	$(PAGE_SIZE - 4), %eax
513694
-	jg	L(cross_page_4bytes)
513694
-	vmovd	(%rdi, %rdx), %xmm1
513694
-	vmovd	(%rsi, %rdx), %xmm0
513694
-	VPCMPEQ	%xmm0, %xmm1, %xmm0
513694
-	VPMINU	%xmm1, %xmm0, %xmm0
513694
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
513694
-	vpmovmskb %xmm0, %ecx
513694
-	/* Only last 4 bits are valid.  */
513694
-	andl	$0xf, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(last_vector)
513694
+	.p2align 4,, 8
513694
+L(ret_less_8_wcs):
513694
+	setl	%OFFSET_REG8
513694
+	negl	%OFFSET_REG
513694
+	movl	%OFFSET_REG, %eax
513694
+	xorl	%r8d, %eax
513694
+	ret
513694
+
513694
+# else
513694
+
513694
+	/* Find largest load size we can use.  */
513694
+	cmpl	$28, %eax
513694
+	ja	L(less_4_till_page)
513694
+
513694
+	vmovd	(%rdi), %xmm0
513694
+	vmovd	(%rsi), %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	subl	$0xf, %ecx
513694
+	jnz	L(check_ret_vec_page_cross)
513694
 
513694
-	addl	$4, %edx
513694
 #  ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
513694
-	   (%r11).  */
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+	cmpq	$4, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case1)
513694
 #  endif
513694
+	movl	$28, %OFFSET_REG
513694
+	/* Explicit check for 16 byte alignment.  */
513694
+	subl	%eax, %OFFSET_REG
513694
 
513694
-L(cross_page_4bytes):
513694
-# endif
513694
-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
513694
-# ifdef USE_AS_STRNCMP
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-# endif
513694
-# ifdef USE_AS_WCSCMP
513694
-	movl	(%rdi, %rdx), %eax
513694
-	movl	(%rsi, %rdx), %ecx
513694
-# else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %ecx
513694
-# endif
513694
-	testl	%eax, %eax
513694
-	jne	L(cross_page_loop)
513694
+
513694
+
513694
+	vmovd	(%rdi, %OFFSET_REG64), %xmm0
513694
+	vmovd	(%rsi, %OFFSET_REG64), %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	subl	$0xf, %ecx
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+
513694
+#  ifdef USE_AS_STRNCMP
513694
+	addl	$4, %OFFSET_REG
513694
+	subq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case1)
513694
+	subq	$-(VEC_SIZE * 4), %rdx
513694
+
513694
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
+#  else
513694
+	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
+#  endif
513694
+	jmp	L(prepare_loop_aligned)
513694
+
513694
+#  ifdef USE_AS_STRNCMP
513694
+	.p2align 4,, 2
513694
+L(ret_zero_page_cross_slow_case1):
513694
+	xorl	%eax, %eax
513694
+	ret
513694
+#  endif
513694
+
513694
+	.p2align 4,, 10
513694
+L(less_4_till_page):
513694
+	subq	%rdi, %rsi
513694
+	/* Extremely slow byte comparison loop.  */
513694
+L(less_4_loop):
513694
+	movzbl	(%rdi), %eax
513694
+	movzbl	(%rsi, %rdi), %ecx
513694
 	subl	%ecx, %eax
513694
-	VZEROUPPER_RETURN
513694
-END (STRCMP)
513694
+	jnz	L(ret_less_4_loop)
513694
+	testl	%ecx, %ecx
513694
+	jz	L(ret_zero_4_loop)
513694
+#  ifdef USE_AS_STRNCMP
513694
+	decq	%rdx
513694
+	jz	L(ret_zero_4_loop)
513694
+#  endif
513694
+	incq	%rdi
513694
+	/* end condition is reach page boundary (rdi is aligned).  */
513694
+	testl	$31, %edi
513694
+	jnz	L(less_4_loop)
513694
+	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
513694
+	addq	$-(VEC_SIZE * 4), %rdi
513694
+#  ifdef USE_AS_STRNCMP
513694
+	subq	$-(VEC_SIZE * 4), %rdx
513694
+#  endif
513694
+	jmp	L(prepare_loop_aligned)
513694
+
513694
+L(ret_zero_4_loop):
513694
+	xorl	%eax, %eax
513694
+	ret
513694
+L(ret_less_4_loop):
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
+	ret
513694
+# endif
513694
+END(STRCMP)
513694
 #endif
-- 
GitLab