From d16c728bff5a92a254d7078d1766a4f3070acd66 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 10 Jan 2022 15:35:39 -0600
Subject: [PATCH] x86: Optimize strcmp-evex.S

Optimizations are primarily to the loop logic and to how the page cross
logic interacts with the loop.

The page cross logic is at times more expensive for short strings near
the end of a page that do not cross the page. This is done to retest
the page cross conditions with a non-faulting check and to improve the
logic for entering the loop afterwards. This only affects particular
cases, however, and is generally made up for by more than 10x
improvements on the transition from the page cross case to the loop.
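
As a rough illustration of the page cross entry check (a minimal C
sketch, not code from this patch; the function name is made up, and it
assumes the 4 KiB pages and 32-byte vectors used in this file), the 4x
VEC path is only entered when neither string can cross a page within
the next VEC_SIZE * 4 bytes:

    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  32

    /* Conservative check: may one of the next 4 vector loads from either
       string touch the following page?  Or'ing the two page offsets can
       produce false positives (they are re-checked precisely later), but
       it can never produce a false negative.  */
    static inline int
    maybe_page_cross (const char *s1, const char *s2)
    {
      uint32_t off = ((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1);
      return off > PAGE_SIZE - (VEC_SIZE * 4);
    }

The assembly performs the same comparison with the low 12 bits shifted
into the top of a 32-bit register (the sall $20 / cmpl pair).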

The non-page cross cases are nearly universally improved as well.

test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
(cherry picked from commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9)
---
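Note (reviewer aside, not part of the commit message): the TESTEQ macro
added below turns the kmovd match mask into a single zero/non-zero test.
A hypothetical C sketch of the idea, with made-up helper names:

    #include <stdint.h>

    /* Bit i of k is set iff CHAR i was non-null and equal in both
       strings.  strcmp/strncmp use all 32 mask bits; wcscmp/wcsncmp only
       use the low 8 (one per dword).  */
    static inline int
    all_equal_bytes (uint32_t k)    /* TESTEQ is `incl' */
    {
      return k + 1 == 0;            /* overflows to 0 only when k == ~0U */
    }

    static inline int
    all_equal_dwords (uint32_t k)   /* TESTEQ is `subl $0xff' */
    {
      return k - 0xff == 0;         /* 0 only when all 8 valid bits are set */
    }

When the adjusted value is non-zero, the carry stops at the first
mismatching CHAR, so tzcnt on it yields that CHAR's index directly.
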
 sysdeps/x86_64/multiarch/strcmp-evex.S | 1712 +++++++++++++-----------
 1 file changed, 919 insertions(+), 793 deletions(-)
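
Note (reviewer aside): the new main loop folds the per-vector compare
results with vpternlogd.  The immediates used below are 0xde for
b | (a ^ c) and 0xfe for a | b | c, where a is the destination, b the
second source and c the third (memory) operand, matching the patch's
own comments.  An illustrative sketch for deriving such an immediate
(the helper is made up, not glibc code):

    #include <stdint.h>
    #include <stdio.h>

    /* Build a vpternlog imm8 from a boolean function of (a, b, c).  */
    static uint8_t
    ternlog_imm (int (*f) (int, int, int))
    {
      uint8_t imm = 0;
      for (int i = 0; i < 8; i++)
        if (f ((i >> 2) & 1, (i >> 1) & 1, i & 1))
          imm |= (uint8_t) (1u << i);
      return imm;
    }

    static int f_xor_or (int a, int b, int c) { return b | (a ^ c); }
    static int f_or3 (int a, int b, int c) { return a | b | c; }

    int
    main (void)
    {
      /* Prints "0xde 0xfe", the immediates used in the loop below.  */
      printf ("0x%x 0x%x\n", ternlog_imm (f_xor_or), ternlog_imm (f_or3));
      return 0;
    }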

diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 6f5c4bf9..99d8409a 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -26,54 +26,69 @@
513694
 
513694
 # define PAGE_SIZE	4096
513694
 
513694
-/* VEC_SIZE = Number of bytes in a ymm register */
513694
+	/* VEC_SIZE = Number of bytes in a ymm register.  */
513694
 # define VEC_SIZE	32
513694
+# define CHAR_PER_VEC	(VEC_SIZE / SIZE_OF_CHAR)
513694
 
513694
-/* Shift for dividing by (VEC_SIZE * 4).  */
513694
-# define DIVIDE_BY_VEC_4_SHIFT	7
513694
-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
513694
-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
513694
-# endif
513694
-
513694
-# define VMOVU		vmovdqu64
513694
-# define VMOVA		vmovdqa64
513694
+# define VMOVU	vmovdqu64
513694
+# define VMOVA	vmovdqa64
513694
 
513694
 # ifdef USE_AS_WCSCMP
513694
-/* Compare packed dwords.  */
513694
-#  define VPCMP		vpcmpd
513694
+#  define TESTEQ	subl	$0xff,
513694
+	/* Compare packed dwords.  */
513694
+#  define VPCMP	vpcmpd
513694
 #  define VPMINU	vpminud
513694
 #  define VPTESTM	vptestmd
513694
-#  define SHIFT_REG32	r8d
513694
-#  define SHIFT_REG64	r8
513694
-/* 1 dword char == 4 bytes.  */
513694
+	/* 1 dword char == 4 bytes.  */
513694
 #  define SIZE_OF_CHAR	4
513694
 # else
513694
-/* Compare packed bytes.  */
513694
-#  define VPCMP		vpcmpb
513694
+#  define TESTEQ	incl
513694
+	/* Compare packed bytes.  */
513694
+#  define VPCMP	vpcmpb
513694
 #  define VPMINU	vpminub
513694
 #  define VPTESTM	vptestmb
513694
-#  define SHIFT_REG32	ecx
513694
-#  define SHIFT_REG64	rcx
513694
-/* 1 byte char == 1 byte.  */
513694
+	/* 1 byte char == 1 byte.  */
513694
 #  define SIZE_OF_CHAR	1
513694
 # endif
513694
 
513694
+# ifdef USE_AS_STRNCMP
513694
+#  define LOOP_REG	r9d
513694
+#  define LOOP_REG64	r9
513694
+
513694
+#  define OFFSET_REG8	r9b
513694
+#  define OFFSET_REG	r9d
513694
+#  define OFFSET_REG64	r9
513694
+# else
513694
+#  define LOOP_REG	edx
513694
+#  define LOOP_REG64	rdx
513694
+
513694
+#  define OFFSET_REG8	dl
513694
+#  define OFFSET_REG	edx
513694
+#  define OFFSET_REG64	rdx
513694
+# endif
513694
+
513694
+# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
513694
+#  define VEC_OFFSET	0
513694
+# else
513694
+#  define VEC_OFFSET	(-VEC_SIZE)
513694
+# endif
513694
+
513694
 # define XMMZERO	xmm16
513694
-# define XMM0		xmm17
513694
-# define XMM1		xmm18
513694
+# define XMM0	xmm17
513694
+# define XMM1	xmm18
513694
 
513694
 # define YMMZERO	ymm16
513694
-# define YMM0		ymm17
513694
-# define YMM1		ymm18
513694
-# define YMM2		ymm19
513694
-# define YMM3		ymm20
513694
-# define YMM4		ymm21
513694
-# define YMM5		ymm22
513694
-# define YMM6		ymm23
513694
-# define YMM7		ymm24
513694
-# define YMM8		ymm25
513694
-# define YMM9		ymm26
513694
-# define YMM10		ymm27
513694
+# define YMM0	ymm17
513694
+# define YMM1	ymm18
513694
+# define YMM2	ymm19
513694
+# define YMM3	ymm20
513694
+# define YMM4	ymm21
513694
+# define YMM5	ymm22
513694
+# define YMM6	ymm23
513694
+# define YMM7	ymm24
513694
+# define YMM8	ymm25
513694
+# define YMM9	ymm26
513694
+# define YMM10	ymm27
513694
 
513694
 /* Warning!
513694
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
513694
@@ -96,985 +111,1096 @@
513694
    the maximum offset is reached before a difference is found, zero is
513694
    returned.  */
513694
 
513694
-	.section .text.evex,"ax",@progbits
513694
-ENTRY (STRCMP)
513694
+	.section .text.evex, "ax", @progbits
513694
+ENTRY(STRCMP)
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Check for simple cases (0 or 1) in offset.  */
513694
-	cmp	$1, %RDX_LP
513694
-	je	L(char0)
513694
-	jb	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-#  ifndef __ILP32__
513694
-	movq	%rdx, %rcx
513694
-	/* Check if length could overflow when multiplied by
513694
-	   sizeof(wchar_t). Checking top 8 bits will cover all potential
513694
-	   overflow cases as well as redirect cases where its impossible to
513694
-	   length to bound a valid memory region. In these cases just use
513694
-	   'wcscmp'.  */
513694
-	shrq	$56, %rcx
513694
-	jnz	__wcscmp_evex
513694
-#  endif
513694
-	/* Convert units: from wide to byte char.  */
513694
-	shl	$2, %RDX_LP
513694
+#  ifdef __ILP32__
513694
+	/* Clear the upper 32 bits.  */
513694
+	movl	%edx, %rdx
513694
 #  endif
513694
-	/* Register %r11 tracks the maximum offset.  */
513694
-	mov	%RDX_LP, %R11_LP
513694
+	cmp	$1, %RDX_LP
513694
+	/* Signed comparison intentional. We use this branch to also
513694
+	   test cases where length >= 2^63. These very large sizes can be
513694
+	   handled with strcmp as there is no way for that length to
513694
+	   actually bound the buffer.  */
513694
+	jle	L(one_or_less)
513694
 # endif
513694
 	movl	%edi, %eax
513694
-	xorl	%edx, %edx
513694
-	/* Make %XMMZERO (%YMMZERO) all zeros in this function.  */
513694
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
513694
 	orl	%esi, %eax
513694
-	andl	$(PAGE_SIZE - 1), %eax
513694
-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
513694
-	jg	L(cross_page)
513694
-	/* Start comparing 4 vectors.  */
513694
+	/* Shift out the bits irrelevant to the page boundary ([63:12]).  */
513694
+	sall	$20, %eax
513694
+	/* Check if s1 or s2 may cross a page in next 4x VEC loads.  */
513694
+	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
513694
+	ja	L(page_cross)
513694
+
513694
+L(no_page_cross):
513694
+	/* Safe to compare 4x vectors.  */
513694
 	VMOVU	(%rdi), %YMM0
513694
-
513694
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
513694
 	VPTESTM	%YMM0, %YMM0, %k2
513694
-
513694
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
513694
 	   in YMM0 and 32 bytes at (%rsi).  */
513694
 	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
513694
-
513694
 	kmovd	%k1, %ecx
513694
-# ifdef USE_AS_WCSCMP
513694
-	subl	$0xff, %ecx
513694
-# else
513694
-	incl	%ecx
513694
-# endif
513694
-	je	L(next_3_vectors)
513694
-	tzcntl	%ecx, %edx
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
513694
-	sall	$2, %edx
513694
-# endif
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx) is after the maximum
513694
-	   offset (%r11).   */
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+	cmpq	$CHAR_PER_VEC, %rdx
513694
+	jbe	L(vec_0_test_len)
513694
 # endif
513694
+
513694
+	/* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
513694
+	   wcscmp/wcsncmp.  */
513694
+
513694
+	/* A mask of all 1s means all CHARs compared equal. TESTEQ will
+	   overflow it to zero in that case. Otherwise the carry stops at
+	   the position of the first mismatch.  */
513694
+	TESTEQ	%ecx
513694
+	jz	L(more_3x_vec)
513694
+
513694
+	.p2align 4,, 4
513694
+L(return_vec_0):
513694
+	tzcntl	%ecx, %ecx
513694
 # ifdef USE_AS_WCSCMP
513694
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	je	L(return)
513694
-L(wcscmp_return):
513694
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
513694
+	je	L(ret0)
513694
 	setl	%al
513694
 	negl	%eax
513694
 	orl	$1, %eax
513694
-L(return):
513694
 # else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	movzbl	(%rdi, %rcx), %eax
513694
+	movzbl	(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
 # endif
513694
+L(ret0):
513694
 	ret
513694
 
513694
-L(return_vec_size):
513694
-	tzcntl	%ecx, %edx
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
513694
-	sall	$2, %edx
513694
-# endif
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
513694
-	   the maximum offset (%r11).  */
513694
-	addq	$VEC_SIZE, %rdx
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
+	.p2align 4,, 4
513694
+L(vec_0_test_len):
513694
+	notl	%ecx
513694
+	bzhil	%edx, %ecx, %eax
513694
+	jnz	L(return_vec_0)
513694
+	/* Align if will cross fetch block.  */
513694
+	.p2align 4,, 2
513694
+L(ret_zero):
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# else
513694
+	ret
513694
+
513694
+	.p2align 4,, 5
513694
+L(one_or_less):
513694
+	jb	L(ret_zero)
513694
 #  ifdef USE_AS_WCSCMP
513694
+	/* 'nbe' covers the case where length is negative (large
513694
+	   unsigned).  */
513694
+	jnbe	__wcscmp_evex
513694
+	movl	(%rdi), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	VEC_SIZE(%rdi, %rdx), %ecx
513694
-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
+	cmpl	(%rsi), %edx
513694
+	je	L(ret1)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
 #  else
513694
-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
513694
-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	/* 'nbe' covers the case where length is negative (large
513694
+	   unsigned).  */
513694
+	jnbe	__strcmp_evex
513694
+	movzbl	(%rdi), %eax
513694
+	movzbl	(%rsi), %ecx
513694
+	subl	%ecx, %eax
513694
 #  endif
513694
-# endif
513694
+L(ret1):
513694
 	ret
513694
+# endif
513694
 
513694
-L(return_2_vec_size):
513694
-	tzcntl	%ecx, %edx
513694
+	.p2align 4,, 10
513694
+L(return_vec_1):
513694
+	tzcntl	%ecx, %ecx
513694
+# ifdef USE_AS_STRNCMP
513694
+	/* rdx must be > CHAR_PER_VEC so it's safe to subtract without
513694
+	   worrying about underflow.  */
513694
+	addq	$-CHAR_PER_VEC, %rdx
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
 # ifdef USE_AS_WCSCMP
513694
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
513694
-	sall	$2, %edx
513694
+	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
513694
+	xorl	%eax, %eax
513694
+	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
513694
+	je	L(ret2)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
+# else
513694
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
513694
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
 # endif
513694
+L(ret2):
513694
+	ret
513694
+
513694
+	.p2align 4,, 10
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
513694
-	   after the maximum offset (%r11).  */
513694
-	addq	$(VEC_SIZE * 2), %rdx
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
+L(return_vec_3):
513694
+#  if CHAR_PER_VEC <= 16
513694
+	sall	$CHAR_PER_VEC, %ecx
513694
 #  else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	salq	$CHAR_PER_VEC, %rcx
513694
 #  endif
513694
+# endif
513694
+L(return_vec_2):
513694
+# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
513694
+	tzcntl	%ecx, %ecx
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
513694
-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
513694
-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	tzcntq	%rcx, %rcx
513694
 # endif
513694
-	ret
513694
 
513694
-L(return_3_vec_size):
513694
-	tzcntl	%ecx, %edx
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
513694
-	sall	$2, %edx
513694
-# endif
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
513694
-	   after the maximum offset (%r11).  */
513694
-	addq	$(VEC_SIZE * 3), %rdx
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
513694
+	je	L(ret3)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
 # else
513694
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
513694
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+# endif
513694
+L(ret3):
513694
+	ret
513694
+
513694
+# ifndef USE_AS_STRNCMP
513694
+	.p2align 4,, 10
513694
+L(return_vec_3):
513694
+	tzcntl	%ecx, %ecx
513694
 #  ifdef USE_AS_WCSCMP
513694
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
513694
-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
513694
+	je	L(ret4)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
 #  else
513694
-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
513694
-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
513694
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
 #  endif
513694
-# endif
513694
+L(ret4):
513694
 	ret
513694
+# endif
513694
 
513694
-	.p2align 4
513694
-L(next_3_vectors):
513694
-	VMOVU	VEC_SIZE(%rdi), %YMM0
513694
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
513694
+	/* 32-byte alignment here ensures the main loop is ideally aligned
+	   for the DSB.  */
513694
+	.p2align 5
513694
+L(more_3x_vec):
513694
+	/* Safe to compare 4x vectors.  */
513694
+	VMOVU	(VEC_SIZE)(%rdi), %YMM0
513694
 	VPTESTM	%YMM0, %YMM0, %k2
513694
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
513694
-	   in YMM0 and 32 bytes at VEC_SIZE(%rsi).  */
513694
-	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
513694
+	VPCMP	$0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
513694
 	kmovd	%k1, %ecx
513694
-# ifdef USE_AS_WCSCMP
513694
-	subl	$0xff, %ecx
513694
-# else
513694
-	incl	%ecx
513694
+	TESTEQ	%ecx
513694
+	jnz	L(return_vec_1)
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	subq	$(CHAR_PER_VEC * 2), %rdx
513694
+	jbe	L(ret_zero)
513694
 # endif
513694
-	jne	L(return_vec_size)
513694
 
513694
 	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
513694
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
513694
 	VPTESTM	%YMM0, %YMM0, %k2
513694
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
513694
-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
513694
 	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
513694
 	kmovd	%k1, %ecx
513694
-# ifdef USE_AS_WCSCMP
513694
-	subl	$0xff, %ecx
513694
-# else
513694
-	incl	%ecx
513694
-# endif
513694
-	jne	L(return_2_vec_size)
513694
+	TESTEQ	%ecx
513694
+	jnz	L(return_vec_2)
513694
 
513694
 	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
513694
-	/* Each bit set in K2 represents a non-null CHAR in YMM0.  */
513694
 	VPTESTM	%YMM0, %YMM0, %k2
513694
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
513694
-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi).  */
513694
 	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
513694
 	kmovd	%k1, %ecx
513694
+	TESTEQ	%ecx
513694
+	jnz	L(return_vec_3)
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+
513694
+
513694
 # ifdef USE_AS_WCSCMP
513694
-	subl	$0xff, %ecx
513694
+	/* Any non-zero positive value that doesn't interfere with 0x1.
+	 */
513694
+	movl	$2, %r8d
513694
+
513694
 # else
513694
-	incl	%ecx
513694
+	xorl	%r8d, %r8d
513694
 # endif
513694
-	jne	L(return_3_vec_size)
513694
-L(main_loop_header):
513694
-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
513694
-	movl	$PAGE_SIZE, %ecx
513694
-	/* Align load via RAX.  */
513694
-	andq	$-(VEC_SIZE * 4), %rdx
513694
-	subq	%rdi, %rdx
513694
-	leaq	(%rdi, %rdx), %rax
513694
+
513694
+	/* The prepare labels are various entry points from the page
513694
+	   cross logic.  */
513694
+L(prepare_loop):
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Starting from this point, the maximum offset, or simply the
513694
-	   'offset', DECREASES by the same amount when base pointers are
513694
-	   moved forward.  Return 0 when:
513694
-	     1) On match: offset <= the matched vector index.
513694
-	     2) On mistmach, offset is before the mistmatched index.
513694
-	 */
513694
-	subq	%rdx, %r11
513694
-	jbe	L(zero)
513694
+#  ifdef USE_AS_WCSCMP
513694
+L(prepare_loop_no_len):
513694
+	movl	%edi, %ecx
513694
+	andl	$(VEC_SIZE * 4 - 1), %ecx
513694
+	shrl	$2, %ecx
513694
+	leaq	(CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
513694
+#  else
513694
+	/* Store N + (VEC_SIZE * 4) and place check at the beginning of
513694
+	   the loop.  */
513694
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
513694
+L(prepare_loop_no_len):
513694
+#  endif
513694
+# else
513694
+L(prepare_loop_no_len):
513694
 # endif
513694
-	addq	%rsi, %rdx
513694
-	movq	%rdx, %rsi
513694
-	andl	$(PAGE_SIZE - 1), %esi
513694
-	/* Number of bytes before page crossing.  */
513694
-	subq	%rsi, %rcx
513694
-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
513694
-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
513694
-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
513694
-	movl	%ecx, %esi
513694
-	jmp	L(loop_start)
513694
 
513694
+	/* Align s1 and adjust s2 accordingly.  */
513694
+	subq	%rdi, %rsi
513694
+	andq	$-(VEC_SIZE * 4), %rdi
513694
+L(prepare_loop_readj):
513694
+	addq	%rdi, %rsi
513694
+# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
513694
+	subq	%rdi, %rdx
513694
+# endif
513694
+
513694
+L(prepare_loop_aligned):
513694
+	/* eax stores distance from rsi to next page cross. These cases
513694
+	   need to be handled specially as the 4x loop could potentially
513694
+	   read memory past the length of s1 or s2 and across a page
513694
+	   boundary.  */
513694
+	movl	$-(VEC_SIZE * 4), %eax
513694
+	subl	%esi, %eax
513694
+	andl	$(PAGE_SIZE - 1), %eax
513694
+
513694
+	vpxorq	%YMMZERO, %YMMZERO, %YMMZERO
513694
+
513694
+	/* Loop 4x comparisons at a time.  */
513694
 	.p2align 4
513694
 L(loop):
513694
+
513694
+	/* End condition for strncmp.  */
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
513694
-	   the maximum offset (%r11) by the same amount.  */
513694
-	subq	$(VEC_SIZE * 4), %r11
513694
-	jbe	L(zero)
513694
+	subq	$(CHAR_PER_VEC * 4), %rdx
513694
+	jbe	L(ret_zero)
513694
 # endif
513694
-	addq	$(VEC_SIZE * 4), %rax
513694
-	addq	$(VEC_SIZE * 4), %rdx
513694
-L(loop_start):
513694
-	testl	%esi, %esi
513694
-	leal	-1(%esi), %esi
513694
-	je	L(loop_cross_page)
513694
-L(back_to_loop):
513694
-	/* Main loop, comparing 4 vectors are a time.  */
513694
-	VMOVA	(%rax), %YMM0
513694
-	VMOVA	VEC_SIZE(%rax), %YMM2
513694
-	VMOVA	(VEC_SIZE * 2)(%rax), %YMM4
513694
-	VMOVA	(VEC_SIZE * 3)(%rax), %YMM6
513694
+
513694
+	subq	$-(VEC_SIZE * 4), %rdi
513694
+	subq	$-(VEC_SIZE * 4), %rsi
513694
+
513694
+	/* Check if rsi loads will cross a page boundary.  */
513694
+	addl	$-(VEC_SIZE * 4), %eax
513694
+	jnb	L(page_cross_during_loop)
513694
+
513694
+	/* Loop entry after handling page cross during loop.  */
513694
+L(loop_skip_page_cross_check):
513694
+	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
513694
+	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
513694
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
513694
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
513694
 
513694
 	VPMINU	%YMM0, %YMM2, %YMM8
513694
 	VPMINU	%YMM4, %YMM6, %YMM9
513694
 
513694
-	/* A zero CHAR in YMM8 means that there is a null CHAR.  */
513694
-	VPMINU	%YMM8, %YMM9, %YMM8
513694
+	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
513694
+	VPMINU	%YMM8, %YMM9, %YMM9
513694
 
513694
 	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
513694
-	VPTESTM	%YMM8, %YMM8, %k1
513694
+	VPTESTM	%YMM9, %YMM9, %k1
513694
 
513694
-	/* (YMM ^ YMM): A non-zero CHAR represents a mismatch.  */
513694
-	vpxorq	(%rdx), %YMM0, %YMM1
513694
-	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM3
513694
-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
513694
-	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
513694
+	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
513694
+	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
513694
+	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
513694
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
513694
+	   oring with YMM1. Result is stored in YMM6.  */
513694
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
513694
 
513694
-	vporq	%YMM1, %YMM3, %YMM9
513694
-	vporq	%YMM5, %YMM7, %YMM10
513694
+	/* Or together YMM3, YMM5, and YMM6.  */
513694
+	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
513694
 
513694
-	/* A non-zero CHAR in YMM9 represents a mismatch.  */
513694
-	vporq	%YMM9, %YMM10, %YMM9
513694
 
513694
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR.  */
513694
-	VPCMP	$0, %YMMZERO, %YMM9, %k0{%k1}
513694
-	kmovd   %k0, %ecx
513694
-# ifdef USE_AS_WCSCMP
513694
-	subl	$0xff, %ecx
513694
-# else
513694
-	incl	%ecx
513694
-# endif
513694
-	je	 L(loop)
513694
+	/* A non-zero CHAR in YMM6 represents a mismatch.  */
513694
+	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
513694
+	kmovd	%k0, %LOOP_REG
513694
 
513694
-	/* Each bit set in K1 represents a non-null CHAR in YMM0.  */
513694
+	TESTEQ	%LOOP_REG
513694
+	jz	L(loop)
513694
+
513694
+
513694
+	/* Find which VEC has the mismatch or end of string.  */
513694
 	VPTESTM	%YMM0, %YMM0, %k1
513694
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
513694
-	   in YMM0 and (%rdx).  */
513694
 	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
513694
 	kmovd	%k0, %ecx
513694
-# ifdef USE_AS_WCSCMP
513694
-	subl	$0xff, %ecx
513694
-# else
513694
-	incl	%ecx
513694
-# endif
513694
-	je	L(test_vec)
513694
-	tzcntl	%ecx, %ecx
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
513694
-	sall	$2, %ecx
513694
-# endif
513694
-# ifdef USE_AS_STRNCMP
513694
-	cmpq	%rcx, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# endif
513694
-	ret
513694
+	TESTEQ	%ecx
513694
+	jnz	L(return_vec_0_end)
513694
 
513694
-	.p2align 4
513694
-L(test_vec):
513694
-# ifdef USE_AS_STRNCMP
513694
-	/* The first vector matched.  Return 0 if the maximum offset
513694
-	   (%r11) <= VEC_SIZE.  */
513694
-	cmpq	$VEC_SIZE, %r11
513694
-	jbe	L(zero)
513694
-# endif
513694
-	/* Each bit set in K1 represents a non-null CHAR in YMM2.  */
513694
 	VPTESTM	%YMM2, %YMM2, %k1
513694
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
513694
-	   in YMM2 and VEC_SIZE(%rdx).  */
513694
 	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
513694
 	kmovd	%k0, %ecx
513694
-# ifdef USE_AS_WCSCMP
513694
-	subl	$0xff, %ecx
513694
-# else
513694
-	incl	%ecx
513694
-# endif
513694
-	je	L(test_2_vec)
513694
-	tzcntl	%ecx, %edi
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
513694
-	sall	$2, %edi
513694
-# endif
513694
-# ifdef USE_AS_STRNCMP
513694
-	addq	$VEC_SIZE, %rdi
513694
-	cmpq	%rdi, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(%rsi, %rdi), %ecx
513694
-	cmpl	(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rdi), %eax
513694
-	movzbl	(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	VEC_SIZE(%rsi, %rdi), %ecx
513694
-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	VEC_SIZE(%rax, %rdi), %eax
513694
-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# endif
513694
-	ret
513694
+	TESTEQ	%ecx
513694
+	jnz	L(return_vec_1_end)
513694
 
513694
-	.p2align 4
513694
-L(test_2_vec):
513694
+
513694
+	/* Handle VEC 2 and 3 without branches.  */
513694
+L(return_vec_2_3_end):
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* The first 2 vectors matched.  Return 0 if the maximum offset
513694
-	   (%r11) <= 2 * VEC_SIZE.  */
513694
-	cmpq	$(VEC_SIZE * 2), %r11
513694
-	jbe	L(zero)
513694
+	subq	$(CHAR_PER_VEC * 2), %rdx
513694
+	jbe	L(ret_zero_end)
513694
 # endif
513694
-	/* Each bit set in K1 represents a non-null CHAR in YMM4.  */
513694
+
513694
 	VPTESTM	%YMM4, %YMM4, %k1
513694
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
513694
-	   in YMM4 and (VEC_SIZE * 2)(%rdx).  */
513694
 	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
513694
 	kmovd	%k0, %ecx
513694
-# ifdef USE_AS_WCSCMP
513694
-	subl	$0xff, %ecx
513694
+	TESTEQ	%ecx
513694
+# if CHAR_PER_VEC <= 16
513694
+	sall	$CHAR_PER_VEC, %LOOP_REG
513694
+	orl	%ecx, %LOOP_REG
513694
 # else
513694
-	incl	%ecx
513694
+	salq	$CHAR_PER_VEC, %LOOP_REG64
513694
+	orq	%rcx, %LOOP_REG64
513694
+# endif
513694
+L(return_vec_3_end):
513694
+	/* LOOP_REG contains matches for null/mismatch from the loop. If
513694
+	   VEC 0, 1, and 2 all have no null and no mismatches then the mismatch
513694
+	   must entirely be from VEC 3 which is fully represented by
513694
+	   LOOP_REG.  */
513694
+# if CHAR_PER_VEC <= 16
513694
+	tzcntl	%LOOP_REG, %LOOP_REG
513694
+# else
513694
+	tzcntq	%LOOP_REG64, %LOOP_REG64
513694
+# endif
513694
+# ifdef USE_AS_STRNCMP
513694
+	cmpq	%LOOP_REG64, %rdx
513694
+	jbe	L(ret_zero_end)
513694
 # endif
513694
-	je	L(test_3_vec)
513694
-	tzcntl	%ecx, %edi
513694
+
513694
 # ifdef USE_AS_WCSCMP
513694
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
513694
-	sall	$2, %edi
513694
+	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
513694
+	xorl	%eax, %eax
513694
+	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
513694
+	je	L(ret5)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
+# else
513694
+	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
513694
+	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret5):
513694
+	ret
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	addq	$(VEC_SIZE * 2), %rdi
513694
-	cmpq	%rdi, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	.p2align 4,, 2
513694
+L(ret_zero_end):
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rdi), %ecx
513694
-	cmpl	(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
+	ret
513694
+# endif
513694
+
513694
+
513694
+	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
513694
+	   they use the value of `r8` to negate the return value. This is
513694
+	   because the page cross logic can swap `rdi` and `rsi`.  */
513694
+	.p2align 4,, 10
513694
+# ifdef USE_AS_STRNCMP
513694
+L(return_vec_1_end):
513694
+#  if CHAR_PER_VEC <= 16
513694
+	sall	$CHAR_PER_VEC, %ecx
513694
 #  else
513694
-	movzbl	(%rax, %rdi), %eax
513694
-	movzbl	(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
+	salq	$CHAR_PER_VEC, %rcx
513694
 #  endif
513694
+# endif
513694
+L(return_vec_0_end):
513694
+# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
513694
+	tzcntl	%ecx, %ecx
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
513694
-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
513694
-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	tzcntq	%rcx, %rcx
513694
 # endif
513694
-	ret
513694
 
513694
-	.p2align 4
513694
-L(test_3_vec):
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* The first 3 vectors matched.  Return 0 if the maximum offset
513694
-	   (%r11) <= 3 * VEC_SIZE.  */
513694
-	cmpq	$(VEC_SIZE * 3), %r11
513694
-	jbe	L(zero)
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero_end)
513694
 # endif
513694
-	/* Each bit set in K1 represents a non-null CHAR in YMM6.  */
513694
-	VPTESTM	%YMM6, %YMM6, %k1
513694
-	/* Each bit cleared in K0 represents a mismatch or a null CHAR
513694
-	   in YMM6 and (VEC_SIZE * 3)(%rdx).  */
513694
-	VPCMP	$0, %YMMZERO, %YMM7, %k0{%k1}
513694
-	kmovd	%k0, %ecx
513694
+
513694
 # ifdef USE_AS_WCSCMP
513694
-	subl	$0xff, %ecx
513694
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
513694
+	xorl	%eax, %eax
513694
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
513694
+	je	L(ret6)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	/* This is the non-zero case for `eax` so just xorl with `r8d` to
+	   flip the sign if `rdi` and `rsi` were swapped.  */
513694
+	xorl	%r8d, %eax
513694
 # else
513694
-	incl	%ecx
513694
+	movzbl	(%rdi, %rcx), %eax
513694
+	movzbl	(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
513694
+	   logic. Subtract `r8d` after xor for zero case.  */
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret6):
513694
+	ret
513694
+
513694
+# ifndef USE_AS_STRNCMP
513694
+	.p2align 4,, 10
513694
+L(return_vec_1_end):
513694
 	tzcntl	%ecx, %ecx
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
513694
-	sall	$2, %ecx
513694
-# endif
513694
-# ifdef USE_AS_STRNCMP
513694
-	addq	$(VEC_SIZE * 3), %rcx
513694
-	cmpq	%rcx, %r11
513694
-	jbe	L(zero)
513694
 #  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %esi
513694
-	cmpl	(%rdx, %rcx), %esi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
513694
-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
513694
-	jne	L(wcscmp_return)
513694
+	cmpl	VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
513694
+	je	L(ret7)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 #  else
513694
-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
513694
-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
513694
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 #  endif
513694
-# endif
513694
+L(ret7):
513694
 	ret
513694
-
513694
-	.p2align 4
513694
-L(loop_cross_page):
513694
-	xorl	%r10d, %r10d
513694
-	movq	%rdx, %rcx
513694
-	/* Align load via RDX.  We load the extra ECX bytes which should
513694
-	   be ignored.  */
513694
-	andl	$((VEC_SIZE * 4) - 1), %ecx
513694
-	/* R10 is -RCX.  */
513694
-	subq	%rcx, %r10
513694
-
513694
-	/* This works only if VEC_SIZE * 2 == 64. */
513694
-# if (VEC_SIZE * 2) != 64
513694
-#  error (VEC_SIZE * 2) != 64
513694
 # endif
513694
 
513694
-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
513694
-	cmpl	$(VEC_SIZE * 2), %ecx
513694
-	jge	L(loop_cross_page_2_vec)
513694
 
513694
-	VMOVU	(%rax, %r10), %YMM2
513694
-	VMOVU	VEC_SIZE(%rax, %r10), %YMM3
513694
+	/* Page cross in rsi in next 4x VEC.  */
513694
 
513694
-	/* Each bit set in K2 represents a non-null CHAR in YMM2.  */
513694
-	VPTESTM	%YMM2, %YMM2, %k2
513694
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
513694
-	   in YMM2 and 32 bytes at (%rdx, %r10).  */
513694
-	VPCMP	$0, (%rdx, %r10), %YMM2, %k1{%k2}
513694
-	kmovd	%k1, %r9d
513694
-	/* Don't use subl since it is the lower 16/32 bits of RDI
513694
-	   below.  */
513694
-	notl	%r9d
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* Only last 8 bits are valid.  */
513694
-	andl	$0xff, %r9d
513694
-# endif
513694
+	/* TODO: Improve logic here.  */
513694
+	.p2align 4,, 10
513694
+L(page_cross_during_loop):
513694
+	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
513694
 
513694
-	/* Each bit set in K4 represents a non-null CHAR in YMM3.  */
513694
-	VPTESTM	%YMM3, %YMM3, %k4
513694
-	/* Each bit cleared in K3 represents a mismatch or a null CHAR
513694
-	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
513694
-	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
513694
-	kmovd	%k3, %edi
513694
-    /* Must use notl %edi here as lower bits are for CHAR
513694
-	   comparisons potentially out of range thus can be 0 without
513694
-	   indicating mismatch.  */
513694
-	notl	%edi
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* Don't use subl since it is the upper 8 bits of EDI below.  */
513694
-	andl	$0xff, %edi
513694
-# endif
513694
+	/* Optimistically rsi and rdi are both aligned, in which case we
+	   don't need any logic here.  */
513694
+	cmpl	$-(VEC_SIZE * 4), %eax
513694
+	/* Don't adjust eax before jumping back to the loop and we will
+	   never hit the page cross case again.  */
513694
+	je	L(loop_skip_page_cross_check)
513694
 
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
513694
-	sall	$8, %edi
513694
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
513694
-	   bytes.  */
513694
-	movl	%ecx, %SHIFT_REG32
513694
-	sarl	$2, %SHIFT_REG32
513694
-
513694
-	/* Each bit in EDI represents a null CHAR or a mismatch.  */
513694
-	orl	%r9d, %edi
513694
-# else
513694
-	salq	$32, %rdi
513694
+	/* Check if we can safely load a VEC.  */
513694
+	cmpl	$-(VEC_SIZE * 3), %eax
513694
+	jle	L(less_1x_vec_till_page_cross)
513694
 
513694
-	/* Each bit in RDI represents a null CHAR or a mismatch.  */
513694
-	orq	%r9, %rdi
513694
-# endif
513694
+	VMOVA	(%rdi), %YMM0
513694
+	VPTESTM	%YMM0, %YMM0, %k2
513694
+	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
513694
+	kmovd	%k1, %ecx
513694
+	TESTEQ	%ecx
513694
+	jnz	L(return_vec_0_end)
513694
+
513694
+	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
513694
+	cmpl	$-(VEC_SIZE * 2), %eax
513694
+	jg	L(more_2x_vec_till_page_cross)
513694
+
513694
+	.p2align 4,, 4
513694
+L(less_1x_vec_till_page_cross):
513694
+	subl	$-(VEC_SIZE * 4), %eax
513694
+	/* Guaranteed safe to read from rdi - VEC_SIZE here. The only
+	   concerning case is first iteration if incoming s1 was near start
+	   of a page and s2 near end. If s1 was near the start of the page
+	   we already aligned up to nearest VEC_SIZE * 4 so guaranteed safe
+	   to read back -VEC_SIZE. If rdi is truly at the start of a page
+	   here, it means the previous page (rdi - VEC_SIZE) has already
+	   been loaded earlier so must be valid.  */
513694
+	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
513694
+	VPTESTM	%YMM0, %YMM0, %k2
513694
+	VPCMP	$0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
513694
+
513694
+	/* Mask of potentially valid bits. The lower bits can be out of
513694
+	   range comparisons (but safe regarding page crosses).  */
513694
 
513694
-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
513694
-	shrxq	%SHIFT_REG64, %rdi, %rdi
513694
-	testq	%rdi, %rdi
513694
-	je	L(loop_cross_page_2_vec)
513694
-	tzcntq	%rdi, %rcx
513694
 # ifdef USE_AS_WCSCMP
513694
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
513694
-	sall	$2, %ecx
513694
+	movl	$-1, %r10d
513694
+	movl	%esi, %ecx
513694
+	andl	$(VEC_SIZE - 1), %ecx
513694
+	shrl	$2, %ecx
513694
+	shlxl	%ecx, %r10d, %ecx
513694
+	movzbl	%cl, %r10d
513694
+# else
513694
+	movl	$-1, %ecx
513694
+	shlxl	%esi, %ecx, %r10d
513694
 # endif
513694
+
513694
+	kmovd	%k1, %ecx
513694
+	notl	%ecx
513694
+
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	cmpq	%rcx, %r11
513694
-	jbe	L(zero)
513694
 #  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
+	movl	%eax, %r11d
513694
+	shrl	$2, %r11d
513694
+	cmpq	%r11, %rdx
513694
 #  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
+	cmpq	%rax, %rdx
513694
 #  endif
513694
+	jbe	L(return_page_cross_end_check)
513694
+# endif
513694
+	movl	%eax, %OFFSET_REG
513694
+
513694
+	/* Readjust eax before potentially returning to the loop.  */
513694
+	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
513694
+
513694
+	andl	%r10d, %ecx
513694
+	jz	L(loop_skip_page_cross_check)
513694
+
513694
+	.p2align 4,, 3
513694
+L(return_page_cross_end):
513694
+	tzcntl	%ecx, %ecx
513694
+
513694
+# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
513694
+	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
513694
+L(return_page_cross_cmp_mem):
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	addl	%OFFSET_REG, %ecx
513694
+# endif
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
513694
+	je	L(ret8)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
+# else
513694
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
513694
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret8):
513694
 	ret
513694
 
513694
-	.p2align 4
513694
-L(loop_cross_page_2_vec):
513694
-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
513694
-	VMOVU	(VEC_SIZE * 2)(%rax, %r10), %YMM0
513694
-	VMOVU	(VEC_SIZE * 3)(%rax, %r10), %YMM1
513694
+# ifdef USE_AS_STRNCMP
513694
+	.p2align 4,, 10
513694
+L(return_page_cross_end_check):
513694
+	tzcntl	%ecx, %ecx
513694
+	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
513694
+#  ifdef USE_AS_WCSCMP
513694
+	sall	$2, %edx
513694
+#  endif
513694
+	cmpl	%ecx, %edx
513694
+	ja	L(return_page_cross_cmp_mem)
513694
+	xorl	%eax, %eax
513694
+	ret
513694
+# endif
513694
+
513694
 
513694
+	.p2align 4,, 10
513694
+L(more_2x_vec_till_page_cross):
513694
+	/* If more than 2x vec till cross we will complete a full loop
513694
+	   iteration here.  */
513694
+
513694
+	VMOVA	VEC_SIZE(%rdi), %YMM0
513694
 	VPTESTM	%YMM0, %YMM0, %k2
513694
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
513694
-	   in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10).  */
513694
-	VPCMP	$0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
513694
-	kmovd	%k1, %r9d
513694
-	/* Don't use subl since it is the lower 16/32 bits of RDI
513694
-	   below.  */
513694
-	notl	%r9d
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* Only last 8 bits are valid.  */
513694
-	andl	$0xff, %r9d
513694
-# endif
513694
+	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
513694
+	kmovd	%k1, %ecx
513694
+	TESTEQ	%ecx
513694
+	jnz	L(return_vec_1_end)
513694
 
513694
-	VPTESTM	%YMM1, %YMM1, %k4
513694
-	/* Each bit cleared in K3 represents a mismatch or a null CHAR
513694
-	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
513694
-	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
513694
-	kmovd	%k3, %edi
513694
-	/* Must use notl %edi here as lower bits are for CHAR
513694
-	   comparisons potentially out of range thus can be 0 without
513694
-	   indicating mismatch.  */
513694
-	notl	%edi
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* Don't use subl since it is the upper 8 bits of EDI below.  */
513694
-	andl	$0xff, %edi
513694
+# ifdef USE_AS_STRNCMP
513694
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
513694
+	jbe	L(ret_zero_in_loop_page_cross)
513694
 # endif
513694
 
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* NB: Each bit in EDI/R9D represents 4-byte element.  */
513694
-	sall	$8, %edi
513694
+	subl	$-(VEC_SIZE * 4), %eax
513694
 
513694
-	/* Each bit in EDI represents a null CHAR or a mismatch.  */
513694
-	orl	%r9d, %edi
513694
-# else
513694
-	salq	$32, %rdi
513694
+	/* Safe to include comparisons from lower bytes.  */
513694
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
513694
+	VPTESTM	%YMM0, %YMM0, %k2
513694
+	VPCMP	$0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
513694
+	kmovd	%k1, %ecx
513694
+	TESTEQ	%ecx
513694
+	jnz	L(return_vec_page_cross_0)
513694
+
513694
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
513694
+	VPTESTM	%YMM0, %YMM0, %k2
513694
+	VPCMP	$0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
513694
+	kmovd	%k1, %ecx
513694
+	TESTEQ	%ecx
513694
+	jnz	L(return_vec_page_cross_1)
513694
 
513694
-	/* Each bit in RDI represents a null CHAR or a mismatch.  */
513694
-	orq	%r9, %rdi
513694
+# ifdef USE_AS_STRNCMP
513694
+	/* Must check length here as length might preclude reading next
513694
+	   page.  */
513694
+#  ifdef USE_AS_WCSCMP
513694
+	movl	%eax, %r11d
513694
+	shrl	$2, %r11d
513694
+	cmpq	%r11, %rdx
513694
+#  else
513694
+	cmpq	%rax, %rdx
513694
+#  endif
513694
+	jbe	L(ret_zero_in_loop_page_cross)
513694
 # endif
513694
 
513694
-	xorl	%r8d, %r8d
513694
-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
513694
-	subl	$(VEC_SIZE * 2), %ecx
513694
-	jle	1f
513694
-	/* R8 has number of bytes skipped.  */
513694
-	movl	%ecx, %r8d
513694
-# ifdef USE_AS_WCSCMP
513694
-	/* NB: Divide shift count by 4 since each bit in RDI represent 4
513694
-	   bytes.  */
513694
-	sarl	$2, %ecx
513694
-	/* Skip ECX bytes.  */
513694
-	shrl	%cl, %edi
513694
+	/* Finish the loop.  */
513694
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
513694
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
513694
+	VPMINU	%YMM4, %YMM6, %YMM9
513694
+	VPTESTM	%YMM9, %YMM9, %k1
513694
+
513694
+	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
513694
+	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
513694
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
513694
+
513694
+	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
513694
+	kmovd	%k0, %LOOP_REG
513694
+	TESTEQ	%LOOP_REG
513694
+	jnz	L(return_vec_2_3_end)
513694
+
513694
+	/* Best for code size to include ucond-jmp here. Would be faster
513694
+	   if this case is hot to duplicate the L(return_vec_2_3_end) code
513694
+	   as fall-through and have jump back to loop on mismatch
513694
+	   comparison.  */
513694
+	subq	$-(VEC_SIZE * 4), %rdi
513694
+	subq	$-(VEC_SIZE * 4), %rsi
513694
+	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
513694
+# ifdef USE_AS_STRNCMP
513694
+	subq	$(CHAR_PER_VEC * 4), %rdx
513694
+	ja	L(loop_skip_page_cross_check)
513694
+L(ret_zero_in_loop_page_cross):
513694
+	xorl	%eax, %eax
513694
+	ret
513694
 # else
513694
-	/* Skip ECX bytes.  */
513694
-	shrq	%cl, %rdi
513694
+	jmp	L(loop_skip_page_cross_check)
513694
 # endif
513694
-1:
513694
-	/* Before jumping back to the loop, set ESI to the number of
513694
-	   VEC_SIZE * 4 blocks before page crossing.  */
513694
-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
513694
 
513694
-	testq	%rdi, %rdi
513694
-# ifdef USE_AS_STRNCMP
513694
-	/* At this point, if %rdi value is 0, it already tested
513694
-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
513694
-	   checks whether strncmp maximum offset reached or not.  */
513694
-	je	L(string_nbyte_offset_check)
513694
+
513694
+	.p2align 4,, 10
513694
+L(return_vec_page_cross_0):
513694
+	addl	$-VEC_SIZE, %eax
513694
+L(return_vec_page_cross_1):
513694
+	tzcntl	%ecx, %ecx
513694
+# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
513694
+	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
513694
+#  ifdef USE_AS_STRNCMP
513694
+#   ifdef USE_AS_WCSCMP
513694
+	/* Must divide ecx instead of multiply rdx due to overflow.  */
513694
+	movl	%ecx, %eax
513694
+	shrl	$2, %eax
513694
+	cmpq	%rax, %rdx
513694
+#   else
513694
+	cmpq	%rcx, %rdx
513694
+#   endif
513694
+	jbe	L(ret_zero_in_loop_page_cross)
513694
+#  endif
513694
 # else
513694
-	je	L(back_to_loop)
513694
+	addl	%eax, %ecx
513694
 # endif
513694
-	tzcntq	%rdi, %rcx
513694
+
513694
 # ifdef USE_AS_WCSCMP
513694
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
513694
-	sall	$2, %ecx
513694
-# endif
513694
-	addq	%r10, %rcx
513694
-	/* Adjust for number of bytes skipped.  */
513694
-	addq	%r8, %rcx
513694
-# ifdef USE_AS_STRNCMP
513694
-	addq	$(VEC_SIZE * 2), %rcx
513694
-	subq	%rcx, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
513694
+	je	L(ret9)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
513694
-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
513694
-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
513694
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret9):
513694
 	ret
513694
 
513694
-# ifdef USE_AS_STRNCMP
513694
-L(string_nbyte_offset_check):
513694
-	leaq	(VEC_SIZE * 4)(%r10), %r10
513694
-	cmpq	%r10, %r11
513694
-	jbe	L(zero)
513694
-	jmp	L(back_to_loop)
513694
+
513694
+	.p2align 4,, 10
513694
+L(page_cross):
513694
+# ifndef USE_AS_STRNCMP
513694
+	/* If both are VEC aligned we don't need any special logic here.
513694
+	   Only valid for strcmp where stop condition is guaranteed to be
513694
+	   reachable by just reading memory.  */
513694
+	testl	$((VEC_SIZE - 1) << 20), %eax
513694
+	jz	L(no_page_cross)
513694
 # endif
513694
 
513694
-	.p2align 4
513694
-L(cross_page_loop):
513694
-	/* Check one byte/dword at a time.  */
513694
+	movl	%edi, %eax
513694
+	movl	%esi, %ecx
513694
+	andl	$(PAGE_SIZE - 1), %eax
513694
+	andl	$(PAGE_SIZE - 1), %ecx
513694
+
513694
+	xorl	%OFFSET_REG, %OFFSET_REG
513694
+
513694
+	/* Check which is closer to page cross, s1 or s2.  */
513694
+	cmpl	%eax, %ecx
513694
+	jg	L(page_cross_s2)
513694
+
513694
+	/* The previous page cross check has false positives. Check for
513694
+	   true positive as page cross logic is very expensive.  */
513694
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
513694
+	jbe	L(no_page_cross)
513694
+
513694
+
513694
+	/* Set r8 to not interfere with normal return value (rdi and rsi
513694
+	   did not swap).  */
513694
 # ifdef USE_AS_WCSCMP
513694
-	cmpl	%ecx, %eax
513694
+	/* Any non-zero positive value that doesn't interfere with 0x1.
+	 */
513694
+	movl	$2, %r8d
513694
 # else
513694
-	subl	%ecx, %eax
513694
+	xorl	%r8d, %r8d
513694
 # endif
513694
-	jne	L(different)
513694
-	addl	$SIZE_OF_CHAR, %edx
513694
-	cmpl	$(VEC_SIZE * 4), %edx
513694
-	je	L(main_loop_header)
513694
+
513694
+	/* Check if less than 1x VEC till page cross.  */
513694
+	subl	$(VEC_SIZE * 3), %eax
513694
+	jg	L(less_1x_vec_till_page)
513694
+
513694
+
513694
+	/* If more than 1x VEC till page cross, loop through safely
513694
+	   loadable memory until within 1x VEC of page cross.  */
513694
+	.p2align 4,, 8
513694
+L(page_cross_loop):
513694
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
513694
+	VPTESTM	%YMM0, %YMM0, %k2
513694
+	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
513694
+	kmovd	%k1, %ecx
513694
+	TESTEQ	%ecx
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+	addl	$CHAR_PER_VEC, %OFFSET_REG
513694
 # ifdef USE_AS_STRNCMP
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+	cmpq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross)
513694
 # endif
513694
+	addl	$VEC_SIZE, %eax
513694
+	jl	L(page_cross_loop)
513694
+
513694
 # ifdef USE_AS_WCSCMP
513694
-	movl	(%rdi, %rdx), %eax
513694
-	movl	(%rsi, %rdx), %ecx
513694
-# else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %ecx
513694
+	shrl	$2, %eax
513694
 # endif
513694
-	/* Check null CHAR.  */
513694
-	testl	%eax, %eax
513694
-	jne	L(cross_page_loop)
513694
-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
513694
-	   comparisons.  */
513694
-	subl	%ecx, %eax
513694
-# ifndef USE_AS_WCSCMP
513694
-L(different):
513694
+
513694
+
513694
+	subl	%eax, %OFFSET_REG
513694
+	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guaranteed
+	   to not cross page so is safe to load. Since we have already
+	   loaded at least 1 VEC from rsi it is also guaranteed to be safe.
513694
+	 */
513694
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
513694
+	VPTESTM	%YMM0, %YMM0, %k2
513694
+	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
513694
+
513694
+	kmovd	%k1, %ecx
513694
+# ifdef USE_AS_STRNCMP
513694
+	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
513694
+	cmpq	%rax, %rdx
513694
+	jbe	L(check_ret_vec_page_cross2)
513694
+#  ifdef USE_AS_WCSCMP
513694
+	addq	$-(CHAR_PER_VEC * 2), %rdx
513694
+#  else
513694
+	addq	%rdi, %rdx
513694
+#  endif
513694
 # endif
513694
-	ret
513694
+	TESTEQ	%ecx
513694
+	jz	L(prepare_loop_no_len)
513694
 
513694
+	.p2align 4,, 4
513694
+L(ret_vec_page_cross):
513694
+# ifndef USE_AS_STRNCMP
513694
+L(check_ret_vec_page_cross):
513694
+# endif
513694
+	tzcntl	%ecx, %ecx
513694
+	addl	%OFFSET_REG, %ecx
513694
+L(ret_vec_page_cross_cont):
513694
 # ifdef USE_AS_WCSCMP
513694
-	.p2align 4
513694
-L(different):
513694
-	/* Use movl to avoid modifying EFLAGS.  */
513694
-	movl	$0, %eax
513694
+	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
513694
+	xorl	%eax, %eax
513694
+	cmpl	(%rsi, %rcx, SIZE_OF_CHAR), %edx
513694
+	je	L(ret12)
513694
 	setl	%al
513694
 	negl	%eax
513694
-	orl	$1, %eax
513694
-	ret
513694
+	xorl	%r8d, %eax
513694
+# else
513694
+	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
513694
+	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret12):
513694
+	ret
513694
+
513694
 
513694
 # ifdef USE_AS_STRNCMP
513694
-	.p2align 4
513694
-L(zero):
513694
+	.p2align 4,, 10
513694
+L(check_ret_vec_page_cross2):
513694
+	TESTEQ	%ecx
513694
+L(check_ret_vec_page_cross):
513694
+	tzcntl	%ecx, %ecx
513694
+	addl	%OFFSET_REG, %ecx
513694
+	cmpq	%rcx, %rdx
513694
+	ja	L(ret_vec_page_cross_cont)
513694
+	.p2align 4,, 2
513694
+L(ret_zero_page_cross):
513694
 	xorl	%eax, %eax
513694
 	ret
513694
+# endif
513694
 
513694
-	.p2align 4
513694
-L(char0):
513694
-#  ifdef USE_AS_WCSCMP
513694
-	xorl	%eax, %eax
513694
-	movl	(%rdi), %ecx
513694
-	cmpl	(%rsi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rsi), %ecx
513694
-	movzbl	(%rdi), %eax
513694
-	subl	%ecx, %eax
513694
-#  endif
513694
-	ret
513694
+	.p2align 4,, 4
513694
+L(page_cross_s2):
513694
+	/* Ensure this is a true page cross.  */
513694
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
513694
+	jbe	L(no_page_cross)
513694
+
513694
+
513694
+	movl	%ecx, %eax
513694
+	movq	%rdi, %rcx
513694
+	movq	%rsi, %rdi
513694
+	movq	%rcx, %rsi
513694
+
513694
+	/* Set r8 to negate return value as rdi and rsi are swapped.  */
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	$-4, %r8d
513694
+# else
513694
+	movl	$-1, %r8d
513694
 # endif
513694
+	xorl	%OFFSET_REG, %OFFSET_REG
513694
 
513694
-	.p2align 4
513694
-L(last_vector):
513694
-	addq	%rdx, %rdi
513694
-	addq	%rdx, %rsi
513694
-# ifdef USE_AS_STRNCMP
513694
-	subq	%rdx, %r11
513694
+	/* Check if more than 1x VEC till page cross.  */
513694
+	subl	$(VEC_SIZE * 3), %eax
513694
+	jle	L(page_cross_loop)
513694
+
513694
+	.p2align 4,, 6
513694
+L(less_1x_vec_till_page):
513694
+# ifdef USE_AS_WCSCMP
513694
+	shrl	$2, %eax
513694
 # endif
513694
-	tzcntl	%ecx, %edx
513694
+	/* Find largest load size we can use.  */
513694
+	cmpl	$(16 / SIZE_OF_CHAR), %eax
513694
+	ja	L(less_16_till_page)
513694
+
513694
+	/* Use 16 byte comparison.  */
513694
+	vmovdqu	(%rdi), %xmm0
513694
+	VPTESTM	%xmm0, %xmm0, %k2
513694
+	VPCMP	$0, (%rsi), %xmm0, %k1{%k2}
513694
+	kmovd	%k1, %ecx
513694
 # ifdef USE_AS_WCSCMP
513694
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
513694
-	sall	$2, %edx
513694
+	subl	$0xf, %ecx
513694
+# else
513694
+	incw	%cx
513694
 # endif
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
513694
 # ifdef USE_AS_STRNCMP
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+	cmpq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case0)
513694
+	subl	%eax, %OFFSET_REG
513694
+# else
513694
+	/* Explicit check for 16 byte alignment.  */
513694
+	subl	%eax, %OFFSET_REG
513694
+	jz	L(prepare_loop)
513694
 # endif
513694
+	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
513694
+	VPTESTM	%xmm0, %xmm0, %k2
513694
+	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
513694
+	kmovd	%k1, %ecx
513694
 # ifdef USE_AS_WCSCMP
513694
-	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
+	subl	$0xf, %ecx
513694
 # else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	incw	%cx
513694
 # endif
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+# ifdef USE_AS_STRNCMP
513694
+	addl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
513694
+	subq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case0)
513694
+	subq	$-(CHAR_PER_VEC * 4), %rdx
513694
+
513694
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
513694
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
513694
+# else
513694
+	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
513694
+	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
513694
+# endif
513694
+	jmp	L(prepare_loop_aligned)
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	.p2align 4,, 2
513694
+L(ret_zero_page_cross_slow_case0):
513694
+	xorl	%eax, %eax
513694
 	ret
513694
+# endif
513694
 
513694
-	/* Comparing on page boundary region requires special treatment:
-	   It must done one vector at the time, starting with the wider
-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
-	   (xmm) still passes the boundary, byte comparison must be done.
-	 */
-	.p2align 4
-L(cross_page):
-	/* Try one ymm vector at a time.  */
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jg	L(cross_page_1_vector)
-L(loop_1_vector):
-	VMOVU	(%rdi, %rdx), %YMM0
 
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in YMM0 and 32 bytes at (%rsi, %rdx).  */
-	VPCMP	$0, (%rsi, %rdx), %YMM0, %k1{%k2}
+	.p2align 4,, 10
+L(less_16_till_page):
+	cmpl	$(24 / SIZE_OF_CHAR), %eax
+	ja	L(less_8_till_page)
+
+	/* Use 8 byte comparison.  */
+	vmovq	(%rdi), %xmm0
+	vmovq	(%rsi), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
-	subl	$0xff, %ecx
+	subl	$0x3, %ecx
 # else
-	incl	%ecx
+	incb	%cl
 # endif
-	jne	L(last_vector)
+	jnz	L(check_ret_vec_page_cross)
 
-	addl	$VEC_SIZE, %edx
 
-	addl	$VEC_SIZE, %eax
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	$(8 / SIZE_OF_CHAR), %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
 # endif
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	jle	L(loop_1_vector)
-L(cross_page_1_vector):
-	/* Less than 32 bytes to check, try one xmm vector.  */
-	cmpl	$(PAGE_SIZE - 16), %eax
-	jg	L(cross_page_1_xmm)
-	VMOVU	(%rdi, %rdx), %XMM0
+	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
+	subl	%eax, %OFFSET_REG
 
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in XMM0 and 16 bytes at (%rsi, %rdx).  */
-	VPCMP	$0, (%rsi, %rdx), %XMM0, %k1{%k2}
+	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
-	subl	$0xf, %ecx
+	subl	$0x3, %ecx
 # else
-	subl	$0xffff, %ecx
+	incb	%cl
 # endif
-	jne	L(last_vector)
+	jnz	L(check_ret_vec_page_cross)
+
 
-	addl	$16, %edx
-# ifndef USE_AS_WCSCMP
-	addl	$16, %eax
-# endif
 # ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	addl	$(8 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case0)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+# else
+	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
 # endif
+	jmp	L(prepare_loop_aligned)
 
-L(cross_page_1_xmm):
-# ifndef USE_AS_WCSCMP
-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
-	cmpl	$(PAGE_SIZE - 8), %eax
-	jg	L(cross_page_8bytes)
-	vmovq	(%rdi, %rdx), %XMM0
-	vmovq	(%rsi, %rdx), %XMM1
 
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in XMM0 and XMM1.  */
-	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
-	kmovb	%k1, %ecx
+
+
+	.p2align 4,, 10
+L(less_8_till_page):
 # ifdef USE_AS_WCSCMP
-	subl	$0x3, %ecx
+	/* If using wchar then this is the only check before we reach
+	   the page boundary.  */
+	movl	(%rdi), %eax
+	movl	(%rsi), %ecx
+	cmpl	%ecx, %eax
+	jnz	L(ret_less_8_wcs)
+#  ifdef USE_AS_STRNCMP
+	addq	$-(CHAR_PER_VEC * 2), %rdx
+	/* We already checked for len <= 1 so cannot hit that case here.
+	 */
+#  endif
+	testl	%eax, %eax
+	jnz	L(prepare_loop)
+	ret
+
+	.p2align 4,, 8
+L(ret_less_8_wcs):
+	setl	%OFFSET_REG8
+	negl	%OFFSET_REG
+	movl	%OFFSET_REG, %eax
+	xorl	%r8d, %eax
+	ret
+
 # else
-	subl	$0xff, %ecx
-# endif
-	jne	L(last_vector)
+	cmpl	$28, %eax
+	ja	L(less_4_till_page)
+
+	vmovd	(%rdi), %xmm0
+	vmovd	(%rsi), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	kmovd	%k1, %ecx
+	subl	$0xf, %ecx
+	jnz	L(check_ret_vec_page_cross)
 
-	addl	$8, %edx
-	addl	$8, %eax
 #  ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	cmpq	$4, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
 #  endif
+	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
+	subl	%eax, %OFFSET_REG
 
-L(cross_page_8bytes):
-	/* Less than 8 bytes to check, try 4 byte vector.  */
-	cmpl	$(PAGE_SIZE - 4), %eax
-	jg	L(cross_page_4bytes)
-	vmovd	(%rdi, %rdx), %XMM0
-	vmovd	(%rsi, %rdx), %XMM1
-
-	VPTESTM	%YMM0, %YMM0, %k2
-	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in XMM0 and XMM1.  */
-	VPCMP	$0, %XMM1, %XMM0, %k1{%k2}
+	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
+	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
+	VPTESTM	%xmm0, %xmm0, %k2
+	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
 	kmovd	%k1, %ecx
-# ifdef USE_AS_WCSCMP
-	subl	$0x1, %ecx
-# else
 	subl	$0xf, %ecx
-# endif
-	jne	L(last_vector)
+	jnz	L(check_ret_vec_page_cross)
+#  ifdef USE_AS_STRNCMP
+	addl	$(4 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case1)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  else
+	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  endif
+	jmp	L(prepare_loop_aligned)
+
 
-	addl	$4, %edx
 #  ifdef USE_AS_STRNCMP
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
-	   (%r11).  */
-	cmpq	%r11, %rdx
-	jae	L(zero)
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case1):
+	xorl	%eax, %eax
+	ret
 #  endif
 
-L(cross_page_4bytes):
-# endif
-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
-# ifdef USE_AS_STRNCMP
-	cmpq	%r11, %rdx
-	jae	L(zero)
-# endif
-# ifdef USE_AS_WCSCMP
-	movl	(%rdi, %rdx), %eax
-	movl	(%rsi, %rdx), %ecx
-# else
-	movzbl	(%rdi, %rdx), %eax
-	movzbl	(%rsi, %rdx), %ecx
-# endif
-	testl	%eax, %eax
-	jne	L(cross_page_loop)
+	.p2align 4,, 10
+L(less_4_till_page):
+	subq	%rdi, %rsi
+	/* Extremely slow byte comparison loop.  */
+L(less_4_loop):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi, %rdi), %ecx
 	subl	%ecx, %eax
+	jnz	L(ret_less_4_loop)
+	testl	%ecx, %ecx
+	jz	L(ret_zero_4_loop)
+#  ifdef USE_AS_STRNCMP
+	decq	%rdx
+	jz	L(ret_zero_4_loop)
+#  endif
+	incq	%rdi
+	/* end condition is reach page boundary (rdi is aligned).  */
+	testl	$31, %edi
+	jnz	L(less_4_loop)
+	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
+	addq	$-(VEC_SIZE * 4), %rdi
+#  ifdef USE_AS_STRNCMP
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+L(ret_zero_4_loop):
+	xorl	%eax, %eax
+	ret
+L(ret_less_4_loop):
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
 	ret
-END (STRCMP)
+# endif
+END(STRCMP)
 #endif
-- 
GitLab
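
Editorial addendum (not part of the patch): the page-cross paths above exist so that no vector load ever touches an unmapped page beyond the end of the string. The standalone C sketch below illustrates the situation those L(less_*_till_page) paths must handle without faulting, by placing strings flush against a PROT_NONE guard page. It is only a hedged example; the helper name alloc_before_guard and the test strings are invented here and are not taken from glibc's test-strcmp/test-strncmp drivers.

#include <assert.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* Return a buffer of LEN bytes whose last byte sits immediately before an
   inaccessible page, so any read past the string terminator would fault.  */
static char *alloc_before_guard(size_t len)
{
	size_t page = (size_t) sysconf(_SC_PAGESIZE);
	/* Map two pages, then revoke access to the second one.  */
	char *p = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	assert(p != MAP_FAILED);
	assert(mprotect(p + page, page, PROT_NONE) == 0);
	return p + page - len;	/* string ends exactly at the page boundary */
}

int main(void)
{
	const char *s = "hello, page cross";
	size_t n = strlen(s) + 1;
	char *a = alloc_before_guard(n);
	char *b = alloc_before_guard(n);
	memcpy(a, s, n);
	memcpy(b, s, n);
	/* Equal strings: a correct strcmp must stop at the null byte right
	   before the guard page instead of reading a full vector past it.  */
	assert(strcmp(a, b) == 0);
	b[3] ^= 1;		/* introduce a mismatch before the boundary */
	assert(strcmp(a, b) != 0);
	assert(strncmp(a, b, 3) == 0);
	return 0;
}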