From 5e9c6a33e767576c063e1fc0077b3a749518e8f0 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 10 Jan 2022 15:35:38 -0600
Subject: [PATCH] x86: Optimize strcmp-avx2.S

Optimizations are primarily to the loop logic and to how the page
cross logic interacts with the loop.

The page cross logic is at times more expensive for short strings near
the end of a page but not crossing the page. This is done to retest
the page cross conditions with a non-faulting check and to improve the
logic for entering the loop afterwards. This affects only particular
cases, however, and is generally made up for by more than 10x
improvements on the transition from the page cross -> loop case.

The non-page cross cases are improved most for smaller sizes [0, 128]
and are about even for (128, 4096]. The loop page cross logic is
improved, so some more significant speedup is seen there as well.

test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
(cherry picked from commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45)
---
 sysdeps/x86_64/multiarch/strcmp-avx2.S | 1592 ++++++++++++++----------
 1 file changed, 940 insertions(+), 652 deletions(-)
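
Note (not part of the upstream commit): a minimal C sketch of the page
cross entry check described in the message above and implemented by the
`sall $20` / `cmpl` / `ja L(page_cross)` sequence in the diff below.
The helper name may_cross_page and this standalone framing are
assumptions for illustration only.  OR-ing the two page offsets can
over-report, which is why the page cross path re-tests for a true
positive before taking the slow path.

  #include <stdint.h>

  #define PAGE_SIZE 4096
  #define VEC_SIZE  32

  /* Return nonzero if loading 4 * VEC_SIZE bytes from s1 or s2 might
     cross a page boundary.  Shifting the OR of both addresses left by
     20 keeps only the low 12 bits (the page offset) in the comparable
     range, so a single unsigned compare covers both strings.  */
  static int
  may_cross_page (const char *s1, const char *s2)
  {
    uint32_t merged = ((uint32_t) (uintptr_t) s1
                       | (uint32_t) (uintptr_t) s2) << 20;
    uint32_t bound = (uint32_t) (PAGE_SIZE - VEC_SIZE * 4) << 20;
    return merged > bound;
  }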
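
Note (also not part of the upstream commit): the per-vector checks fold
the null-CHAR test into the equality mask (vpcmpeq + vpandn +
vpmovmskb) so that a single `incl` decides whether the whole vector
matched.  The helper vec_all_match below is hypothetical and simply
restates the "incl will overflow to zero in all equals case" comment
from the diff.

  /* MASK has one bit per byte of a 32-byte vector: 1 means the bytes
     of s1 and s2 are equal and the s1 byte is not null.  Adding 1
     wraps the all-ones mask to zero, so one flag test distinguishes
     "keep looping" from "mismatch or null found".  */
  static int
  vec_all_match (uint32_t mask)
  {
    return (uint32_t) (mask + 1) == 0;
  }
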
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 70d8499b..554ffe4c 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -26,35 +26,57 @@
513694
 
513694
 # define PAGE_SIZE	4096
513694
 
513694
-/* VEC_SIZE = Number of bytes in a ymm register */
513694
+	/* VEC_SIZE = Number of bytes in a ymm register.  */
513694
 # define VEC_SIZE	32
513694
 
513694
-/* Shift for dividing by (VEC_SIZE * 4).  */
513694
-# define DIVIDE_BY_VEC_4_SHIFT	7
513694
-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
513694
-#  error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
513694
-# endif
513694
+# define VMOVU	vmovdqu
513694
+# define VMOVA	vmovdqa
513694
 
513694
 # ifdef USE_AS_WCSCMP
513694
-/* Compare packed dwords.  */
513694
+	/* Compare packed dwords.  */
513694
 #  define VPCMPEQ	vpcmpeqd
513694
-/* Compare packed dwords and store minimum.  */
513694
+	/* Compare packed dwords and store minimum.  */
513694
 #  define VPMINU	vpminud
513694
-/* 1 dword char == 4 bytes.  */
513694
+	/* 1 dword char == 4 bytes.  */
513694
 #  define SIZE_OF_CHAR	4
513694
 # else
513694
-/* Compare packed bytes.  */
513694
+	/* Compare packed bytes.  */
513694
 #  define VPCMPEQ	vpcmpeqb
513694
-/* Compare packed bytes and store minimum.  */
513694
+	/* Compare packed bytes and store minimum.  */
513694
 #  define VPMINU	vpminub
513694
-/* 1 byte char == 1 byte.  */
513694
+	/* 1 byte char == 1 byte.  */
513694
 #  define SIZE_OF_CHAR	1
513694
 # endif
513694
 
513694
+# ifdef USE_AS_STRNCMP
513694
+#  define LOOP_REG	r9d
513694
+#  define LOOP_REG64	r9
513694
+
513694
+#  define OFFSET_REG8	r9b
513694
+#  define OFFSET_REG	r9d
513694
+#  define OFFSET_REG64	r9
513694
+# else
513694
+#  define LOOP_REG	edx
513694
+#  define LOOP_REG64	rdx
513694
+
513694
+#  define OFFSET_REG8	dl
513694
+#  define OFFSET_REG	edx
513694
+#  define OFFSET_REG64	rdx
513694
+# endif
513694
+
513694
 # ifndef VZEROUPPER
513694
 #  define VZEROUPPER	vzeroupper
513694
 # endif
513694
 
513694
+# if defined USE_AS_STRNCMP
513694
+#  define VEC_OFFSET	0
513694
+# else
513694
+#  define VEC_OFFSET	(-VEC_SIZE)
513694
+# endif
513694
+
513694
+# define xmmZERO	xmm15
513694
+# define ymmZERO	ymm15
513694
+
513694
 # ifndef SECTION
513694
 #  define SECTION(p)	p##.avx
513694
 # endif
513694
@@ -79,783 +101,1049 @@
513694
    the maximum offset is reached before a difference is found, zero is
513694
    returned.  */
513694
 
513694
-	.section SECTION(.text),"ax",@progbits
513694
-ENTRY (STRCMP)
513694
+	.section SECTION(.text), "ax", @progbits
513694
+ENTRY(STRCMP)
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Check for simple cases (0 or 1) in offset.  */
513694
+#  ifdef __ILP32__
513694
+	/* Clear the upper 32 bits.  */
513694
+	movl	%edx, %rdx
513694
+#  endif
513694
 	cmp	$1, %RDX_LP
513694
-	je	L(char0)
513694
-	jb	L(zero)
513694
+	/* Signed comparison intentional. We use this branch to also
513694
+	   test cases where length >= 2^63. These very large sizes can be
513694
+	   handled with strcmp as there is no way for that length to
513694
+	   actually bound the buffer.  */
513694
+	jle	L(one_or_less)
513694
 #  ifdef USE_AS_WCSCMP
513694
-#  ifndef __ILP32__
513694
 	movq	%rdx, %rcx
513694
-	/* Check if length could overflow when multiplied by
513694
-	   sizeof(wchar_t). Checking top 8 bits will cover all potential
513694
-	   overflow cases as well as redirect cases where its impossible to
513694
-	   length to bound a valid memory region. In these cases just use
513694
-	   'wcscmp'.  */
513694
+
513694
+	/* Multiplying length by sizeof(wchar_t) can result in overflow.
513694
+	   Check if that is possible. All cases where overflow are possible
513694
+	   are cases where length is large enough that it can never be a
513694
+	   bound on valid memory so just use wcscmp.  */
513694
 	shrq	$56, %rcx
513694
-	jnz	OVERFLOW_STRCMP
513694
-#  endif
513694
-	/* Convert units: from wide to byte char.  */
513694
-	shl	$2, %RDX_LP
513694
+	jnz	__wcscmp_avx2
513694
+
513694
+	leaq	(, %rdx, 4), %rdx
513694
 #  endif
513694
-	/* Register %r11 tracks the maximum offset.  */
513694
-	mov	%RDX_LP, %R11_LP
513694
 # endif
513694
+	vpxor	%xmmZERO, %xmmZERO, %xmmZERO
513694
 	movl	%edi, %eax
513694
-	xorl	%edx, %edx
513694
-	/* Make %xmm7 (%ymm7) all zeros in this function.  */
513694
-	vpxor	%xmm7, %xmm7, %xmm7
513694
 	orl	%esi, %eax
513694
-	andl	$(PAGE_SIZE - 1), %eax
513694
-	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
513694
-	jg	L(cross_page)
513694
-	/* Start comparing 4 vectors.  */
513694
-	vmovdqu	(%rdi), %ymm1
513694
-	VPCMPEQ	(%rsi), %ymm1, %ymm0
513694
-	VPMINU	%ymm1, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
513694
-	vpmovmskb %ymm0, %ecx
513694
-	testl	%ecx, %ecx
513694
-	je	L(next_3_vectors)
513694
-	tzcntl	%ecx, %edx
513694
+	sall	$20, %eax
513694
+	/* Check if s1 or s2 may cross a page  in next 4x VEC loads.  */
513694
+	cmpl	$((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
513694
+	ja	L(page_cross)
513694
+
513694
+L(no_page_cross):
513694
+	/* Safe to compare 4x vectors.  */
513694
+	VMOVU	(%rdi), %ymm0
513694
+	/* 1s where s1 and s2 equal.  */
513694
+	VPCMPEQ	(%rsi), %ymm0, %ymm1
513694
+	/* 1s at null CHAR.  */
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	/* 1s where s1 and s2 equal AND not null CHAR.  */
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+
513694
+	/* All 1s -> keep going, any 0s -> return.  */
513694
+	vpmovmskb %ymm1, %ecx
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx) is after the maximum
513694
-	   offset (%r11).   */
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+	cmpq	$VEC_SIZE, %rdx
513694
+	jbe	L(vec_0_test_len)
513694
 # endif
513694
+
513694
+	/* All 1s represents all equals. incl will overflow to zero in
513694
+	   all equals case. Otherwise 1s will carry until position of first
513694
+	   mismatch.  */
513694
+	incl	%ecx
513694
+	jz	L(more_3x_vec)
513694
+
513694
+	.p2align 4,, 4
513694
+L(return_vec_0):
513694
+	tzcntl	%ecx, %ecx
513694
 # ifdef USE_AS_WCSCMP
513694
+	movl	(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	je	L(return)
513694
-L(wcscmp_return):
513694
+	cmpl	(%rsi, %rcx), %edx
513694
+	je	L(ret0)
513694
 	setl	%al
513694
 	negl	%eax
513694
 	orl	$1, %eax
513694
-L(return):
513694
 # else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	movzbl	(%rdi, %rcx), %eax
513694
+	movzbl	(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
 # endif
513694
+L(ret0):
513694
 L(return_vzeroupper):
513694
 	ZERO_UPPER_VEC_REGISTERS_RETURN
513694
 
513694
-	.p2align 4
513694
-L(return_vec_size):
513694
-	tzcntl	%ecx, %edx
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
513694
-	   the maximum offset (%r11).  */
513694
-	addq	$VEC_SIZE, %rdx
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
+	.p2align 4,, 8
513694
+L(vec_0_test_len):
513694
+	notl	%ecx
513694
+	bzhil	%edx, %ecx, %eax
513694
+	jnz	L(return_vec_0)
513694
+	/* Align if will cross fetch block.  */
513694
+	.p2align 4,, 2
513694
+L(ret_zero):
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# else
513694
+	VZEROUPPER_RETURN
513694
+
513694
+	.p2align 4,, 5
513694
+L(one_or_less):
513694
+	jb	L(ret_zero)
513694
 #  ifdef USE_AS_WCSCMP
513694
+	/* 'nbe' covers the case where length is negative (large
513694
+	   unsigned).  */
513694
+	jnbe	__wcscmp_avx2
513694
+	movl	(%rdi), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	VEC_SIZE(%rdi, %rdx), %ecx
513694
-	cmpl	VEC_SIZE(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
+	cmpl	(%rsi), %edx
513694
+	je	L(ret1)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
 #  else
513694
-	movzbl	VEC_SIZE(%rdi, %rdx), %eax
513694
-	movzbl	VEC_SIZE(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	/* 'nbe' covers the case where length is negative (large
513694
+	   unsigned).  */
513694
+
513694
+	jnbe	__strcmp_avx2
513694
+	movzbl	(%rdi), %eax
513694
+	movzbl	(%rsi), %ecx
513694
+	subl	%ecx, %eax
513694
 #  endif
513694
+L(ret1):
513694
+	ret
513694
 # endif
513694
-	VZEROUPPER_RETURN
513694
 
513694
-	.p2align 4
513694
-L(return_2_vec_size):
513694
-	tzcntl	%ecx, %edx
513694
+	.p2align 4,, 10
513694
+L(return_vec_1):
513694
+	tzcntl	%ecx, %ecx
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
513694
-	   after the maximum offset (%r11).  */
513694
-	addq	$(VEC_SIZE * 2), %rdx
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
+	/* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of
513694
+	   overflow.  */
513694
+	addq	$-VEC_SIZE, %rdx
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	VEC_SIZE(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
513694
+	je	L(ret2)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 2)(%rdi, %rdx), %ecx
513694
-	cmpl	(VEC_SIZE * 2)(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(VEC_SIZE * 2)(%rdi, %rdx), %eax
513694
-	movzbl	(VEC_SIZE * 2)(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
513694
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
 # endif
513694
+L(ret2):
513694
 	VZEROUPPER_RETURN
513694
 
513694
-	.p2align 4
513694
-L(return_3_vec_size):
513694
-	tzcntl	%ecx, %edx
513694
+	.p2align 4,, 10
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
513694
-	   after the maximum offset (%r11).  */
513694
-	addq	$(VEC_SIZE * 3), %rdx
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
+L(return_vec_3):
513694
+	salq	$32, %rcx
513694
+# endif
513694
+
513694
+L(return_vec_2):
513694
+# ifndef USE_AS_STRNCMP
513694
+	tzcntl	%ecx, %ecx
513694
+# else
513694
+	tzcntq	%rcx, %rcx
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
513694
+	je	L(ret3)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
 # else
513694
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
513694
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+# endif
513694
+L(ret3):
513694
+	VZEROUPPER_RETURN
513694
+
513694
+# ifndef USE_AS_STRNCMP
513694
+	.p2align 4,, 10
513694
+L(return_vec_3):
513694
+	tzcntl	%ecx, %ecx
513694
 #  ifdef USE_AS_WCSCMP
513694
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 3)(%rdi, %rdx), %ecx
513694
-	cmpl	(VEC_SIZE * 3)(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
513694
+	je	L(ret4)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	orl	$1, %eax
513694
 #  else
513694
-	movzbl	(VEC_SIZE * 3)(%rdi, %rdx), %eax
513694
-	movzbl	(VEC_SIZE * 3)(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
513694
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
 #  endif
513694
-# endif
513694
+L(ret4):
513694
 	VZEROUPPER_RETURN
513694
+# endif
513694
+
513694
+	.p2align 4,, 10
513694
+L(more_3x_vec):
513694
+	/* Safe to compare 4x vectors.  */
513694
+	VMOVU	VEC_SIZE(%rdi), %ymm0
513694
+	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_1)
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	subq	$(VEC_SIZE * 2), %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+
513694
+	VMOVU	(VEC_SIZE * 2)(%rdi), %ymm0
513694
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_2)
513694
+
513694
+	VMOVU	(VEC_SIZE * 3)(%rdi), %ymm0
513694
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_3)
513694
 
513694
-	.p2align 4
513694
-L(next_3_vectors):
513694
-	vmovdqu	VEC_SIZE(%rdi), %ymm6
513694
-	VPCMPEQ	VEC_SIZE(%rsi), %ymm6, %ymm3
513694
-	VPMINU	%ymm6, %ymm3, %ymm3
513694
-	VPCMPEQ	%ymm7, %ymm3, %ymm3
513694
-	vpmovmskb %ymm3, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(return_vec_size)
513694
-	vmovdqu	(VEC_SIZE * 2)(%rdi), %ymm5
513694
-	vmovdqu	(VEC_SIZE * 3)(%rdi), %ymm4
513694
-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm0
513694
-	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
513694
-	VPMINU	%ymm5, %ymm2, %ymm2
513694
-	VPCMPEQ	%ymm4, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm2, %ymm2
513694
-	vpmovmskb %ymm2, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(return_2_vec_size)
513694
-	VPMINU	%ymm4, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
513694
-	vpmovmskb %ymm0, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(return_3_vec_size)
513694
-L(main_loop_header):
513694
-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
513694
-	movl	$PAGE_SIZE, %ecx
513694
-	/* Align load via RAX.  */
513694
-	andq	$-(VEC_SIZE * 4), %rdx
513694
-	subq	%rdi, %rdx
513694
-	leaq	(%rdi, %rdx), %rax
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Starting from this point, the maximum offset, or simply the
513694
-	   'offset', DECREASES by the same amount when base pointers are
513694
-	   moved forward.  Return 0 when:
513694
-	     1) On match: offset <= the matched vector index.
513694
-	     2) On mistmach, offset is before the mistmatched index.
513694
+	cmpq	$(VEC_SIZE * 2), %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+
513694
+# ifdef USE_AS_WCSCMP
513694
+	/* any non-zero positive value that doesn't inference with 0x1.
513694
 	 */
513694
-	subq	%rdx, %r11
513694
-	jbe	L(zero)
513694
-# endif
513694
-	addq	%rsi, %rdx
513694
-	movq	%rdx, %rsi
513694
-	andl	$(PAGE_SIZE - 1), %esi
513694
-	/* Number of bytes before page crossing.  */
513694
-	subq	%rsi, %rcx
513694
-	/* Number of VEC_SIZE * 4 blocks before page crossing.  */
513694
-	shrq	$DIVIDE_BY_VEC_4_SHIFT, %rcx
513694
-	/* ESI: Number of VEC_SIZE * 4 blocks before page crossing.   */
513694
-	movl	%ecx, %esi
513694
-	jmp	L(loop_start)
513694
+	movl	$2, %r8d
513694
 
513694
+# else
513694
+	xorl	%r8d, %r8d
513694
+# endif
513694
+
513694
+	/* The prepare labels are various entry points from the page
513694
+	   cross logic.  */
513694
+L(prepare_loop):
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	/* Store N + (VEC_SIZE * 4) and place check at the begining of
513694
+	   the loop.  */
513694
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx), %rdx
513694
+# endif
513694
+L(prepare_loop_no_len):
513694
+
513694
+	/* Align s1 and adjust s2 accordingly.  */
513694
+	subq	%rdi, %rsi
513694
+	andq	$-(VEC_SIZE * 4), %rdi
513694
+	addq	%rdi, %rsi
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	subq	%rdi, %rdx
513694
+# endif
513694
+
513694
+L(prepare_loop_aligned):
513694
+	/* eax stores distance from rsi to next page cross. These cases
513694
+	   need to be handled specially as the 4x loop could potentially
513694
+	   read memory past the length of s1 or s2 and across a page
513694
+	   boundary.  */
513694
+	movl	$-(VEC_SIZE * 4), %eax
513694
+	subl	%esi, %eax
513694
+	andl	$(PAGE_SIZE - 1), %eax
513694
+
513694
+	/* Loop 4x comparisons at a time.  */
513694
 	.p2align 4
513694
 L(loop):
513694
+
513694
+	/* End condition for strncmp.  */
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
513694
-	   the maximum offset (%r11) by the same amount.  */
513694
-	subq	$(VEC_SIZE * 4), %r11
513694
-	jbe	L(zero)
513694
-# endif
513694
-	addq	$(VEC_SIZE * 4), %rax
513694
-	addq	$(VEC_SIZE * 4), %rdx
513694
-L(loop_start):
513694
-	testl	%esi, %esi
513694
-	leal	-1(%esi), %esi
513694
-	je	L(loop_cross_page)
513694
-L(back_to_loop):
513694
-	/* Main loop, comparing 4 vectors are a time.  */
513694
-	vmovdqa	(%rax), %ymm0
513694
-	vmovdqa	VEC_SIZE(%rax), %ymm3
513694
-	VPCMPEQ	(%rdx), %ymm0, %ymm4
513694
-	VPCMPEQ	VEC_SIZE(%rdx), %ymm3, %ymm1
513694
-	VPMINU	%ymm0, %ymm4, %ymm4
513694
-	VPMINU	%ymm3, %ymm1, %ymm1
513694
-	vmovdqa	(VEC_SIZE * 2)(%rax), %ymm2
513694
-	VPMINU	%ymm1, %ymm4, %ymm0
513694
-	vmovdqa	(VEC_SIZE * 3)(%rax), %ymm3
513694
-	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
513694
-	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
513694
-	VPMINU	%ymm2, %ymm5, %ymm5
513694
-	VPMINU	%ymm3, %ymm6, %ymm6
513694
-	VPMINU	%ymm5, %ymm0, %ymm0
513694
-	VPMINU	%ymm6, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
513694
-
513694
-	/* Test each mask (32 bits) individually because for VEC_SIZE
513694
-	   == 32 is not possible to OR the four masks and keep all bits
513694
-	   in a 64-bit integer register, differing from SSE2 strcmp
513694
-	   where ORing is possible.  */
513694
-	vpmovmskb %ymm0, %ecx
513694
+	subq	$(VEC_SIZE * 4), %rdx
513694
+	jbe	L(ret_zero)
513694
+# endif
513694
+
513694
+	subq	$-(VEC_SIZE * 4), %rdi
513694
+	subq	$-(VEC_SIZE * 4), %rsi
513694
+
513694
+	/* Check if rsi loads will cross a page boundary.  */
513694
+	addl	$-(VEC_SIZE * 4), %eax
513694
+	jnb	L(page_cross_during_loop)
513694
+
513694
+	/* Loop entry after handling page cross during loop.  */
513694
+L(loop_skip_page_cross_check):
513694
+	VMOVA	(VEC_SIZE * 0)(%rdi), %ymm0
513694
+	VMOVA	(VEC_SIZE * 1)(%rdi), %ymm2
513694
+	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
513694
+	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
513694
+
513694
+	/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise.  */
513694
+	VPCMPEQ	(VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
513694
+
513694
+	VPCMPEQ	(VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
513694
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
513694
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
513694
+
513694
+
513694
+	/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
513694
+	   zero.  */
513694
+	vpand	%ymm0, %ymm1, %ymm1
513694
+
513694
+
513694
+	vpand	%ymm2, %ymm3, %ymm3
513694
+	vpand	%ymm4, %ymm5, %ymm5
513694
+	vpand	%ymm6, %ymm7, %ymm7
513694
+
513694
+	VPMINU	%ymm1, %ymm3, %ymm3
513694
+	VPMINU	%ymm5, %ymm7, %ymm7
513694
+
513694
+	/* Reduce all 0 CHARs for the 4x VEC into ymm7.  */
513694
+	VPMINU	%ymm3, %ymm7, %ymm7
513694
+
513694
+	/* If any 0 CHAR then done.  */
513694
+	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
513694
+	vpmovmskb %ymm7, %LOOP_REG
513694
+	testl	%LOOP_REG, %LOOP_REG
513694
+	jz	L(loop)
513694
+
513694
+	/* Find which VEC has the mismatch of end of string.  */
513694
+	VPCMPEQ	%ymm1, %ymmZERO, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
 	testl	%ecx, %ecx
513694
-	je	L(loop)
513694
-	VPCMPEQ	%ymm7, %ymm4, %ymm0
513694
-	vpmovmskb %ymm0, %edi
513694
-	testl	%edi, %edi
513694
-	je	L(test_vec)
513694
-	tzcntl	%edi, %ecx
513694
+	jnz	L(return_vec_0_end)
513694
+
513694
+
513694
+	VPCMPEQ	%ymm3, %ymmZERO, %ymm3
513694
+	vpmovmskb %ymm3, %ecx
513694
+	testl	%ecx, %ecx
513694
+	jnz	L(return_vec_1_end)
513694
+
513694
+L(return_vec_2_3_end):
513694
 # ifdef USE_AS_STRNCMP
513694
-	cmpq	%rcx, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	subq	$(VEC_SIZE * 2), %rdx
513694
+	jbe	L(ret_zero_end)
513694
+# endif
513694
+
513694
+	VPCMPEQ	%ymm5, %ymmZERO, %ymm5
513694
+	vpmovmskb %ymm5, %ecx
513694
+	testl	%ecx, %ecx
513694
+	jnz	L(return_vec_2_end)
513694
+
513694
+	/* LOOP_REG contains matches for null/mismatch from the loop. If
513694
+	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
513694
+	   must entirely be from VEC 3 which is fully represented by
513694
+	   LOOP_REG.  */
513694
+	tzcntl	%LOOP_REG, %LOOP_REG
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	subl	$-(VEC_SIZE), %LOOP_REG
513694
+	cmpq	%LOOP_REG64, %rdx
513694
+	jbe	L(ret_zero_end)
513694
+# endif
513694
+
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
513694
+	je	L(ret5)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
513694
+	movzbl	(VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret5):
513694
 	VZEROUPPER_RETURN
513694
 
513694
-	.p2align 4
513694
-L(test_vec):
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* The first vector matched.  Return 0 if the maximum offset
513694
-	   (%r11) <= VEC_SIZE.  */
513694
-	cmpq	$VEC_SIZE, %r11
513694
-	jbe	L(zero)
513694
+	.p2align 4,, 2
513694
+L(ret_zero_end):
513694
+	xorl	%eax, %eax
513694
+	VZEROUPPER_RETURN
513694
 # endif
513694
-	VPCMPEQ	%ymm7, %ymm1, %ymm1
513694
-	vpmovmskb %ymm1, %ecx
513694
-	testl	%ecx, %ecx
513694
-	je	L(test_2_vec)
513694
-	tzcntl	%ecx, %edi
513694
+
513694
+
513694
+	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
513694
+	   they use the value of `r8` to negate the return value. This is
513694
+	   because the page cross logic can swap `rdi` and `rsi`.  */
513694
+	.p2align 4,, 10
513694
 # ifdef USE_AS_STRNCMP
513694
-	addq	$VEC_SIZE, %rdi
513694
-	cmpq	%rdi, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+L(return_vec_1_end):
513694
+	salq	$32, %rcx
513694
+# endif
513694
+L(return_vec_0_end):
513694
+# ifndef USE_AS_STRNCMP
513694
+	tzcntl	%ecx, %ecx
513694
+# else
513694
+	tzcntq	%rcx, %rcx
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero_end)
513694
+# endif
513694
+
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rdi), %ecx
513694
-	cmpl	(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rdi), %eax
513694
-	movzbl	(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	(%rsi, %rcx), %edx
513694
+	je	L(ret6)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 # else
513694
+	movzbl	(%rdi, %rcx), %eax
513694
+	movzbl	(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
+# endif
513694
+L(ret6):
513694
+	VZEROUPPER_RETURN
513694
+
513694
+# ifndef USE_AS_STRNCMP
513694
+	.p2align 4,, 10
513694
+L(return_vec_1_end):
513694
+	tzcntl	%ecx, %ecx
513694
 #  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	movl	VEC_SIZE(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	VEC_SIZE(%rsi, %rdi), %ecx
513694
-	cmpl	VEC_SIZE(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
513694
+	je	L(ret7)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 #  else
513694
-	movzbl	VEC_SIZE(%rax, %rdi), %eax
513694
-	movzbl	VEC_SIZE(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
513694
+	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 #  endif
513694
-# endif
513694
+L(ret7):
513694
 	VZEROUPPER_RETURN
513694
+# endif
513694
 
513694
-	.p2align 4
513694
-L(test_2_vec):
513694
+	.p2align 4,, 10
513694
+L(return_vec_2_end):
513694
+	tzcntl	%ecx, %ecx
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* The first 2 vectors matched.  Return 0 if the maximum offset
513694
-	   (%r11) <= 2 * VEC_SIZE.  */
513694
-	cmpq	$(VEC_SIZE * 2), %r11
513694
-	jbe	L(zero)
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero_page_cross)
513694
 # endif
513694
-	VPCMPEQ	%ymm7, %ymm5, %ymm5
513694
-	vpmovmskb %ymm5, %ecx
513694
-	testl	%ecx, %ecx
513694
-	je	L(test_3_vec)
513694
-	tzcntl	%ecx, %edi
513694
-# ifdef USE_AS_STRNCMP
513694
-	addq	$(VEC_SIZE * 2), %rdi
513694
-	cmpq	%rdi, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rdi), %ecx
513694
-	cmpl	(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rdi), %eax
513694
-	movzbl	(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
513694
+	je	L(ret11)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 2)(%rsi, %rdi), %ecx
513694
-	cmpl	(VEC_SIZE * 2)(%rdx, %rdi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(VEC_SIZE * 2)(%rax, %rdi), %eax
513694
-	movzbl	(VEC_SIZE * 2)(%rdx, %rdi), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
513694
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret11):
513694
 	VZEROUPPER_RETURN
513694
 
513694
-	.p2align 4
513694
-L(test_3_vec):
513694
+
513694
+	/* Page cross in rsi in next 4x VEC.  */
513694
+
513694
+	/* TODO: Improve logic here.  */
513694
+	.p2align 4,, 10
513694
+L(page_cross_during_loop):
513694
+	/* eax contains [distance_from_page - (VEC_SIZE * 4)].  */
513694
+
513694
+	/* Optimistically rsi and rdi and both aligned inwhich case we
513694
+	   don't need any logic here.  */
513694
+	cmpl	$-(VEC_SIZE * 4), %eax
513694
+	/* Don't adjust eax before jumping back to loop and we will
513694
+	   never hit page cross case again.  */
513694
+	je	L(loop_skip_page_cross_check)
513694
+
513694
+	/* Check if we can safely load a VEC.  */
513694
+	cmpl	$-(VEC_SIZE * 3), %eax
513694
+	jle	L(less_1x_vec_till_page_cross)
513694
+
513694
+	VMOVA	(%rdi), %ymm0
513694
+	VPCMPEQ	(%rsi), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_0_end)
513694
+
513694
+	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
513694
+	cmpl	$-(VEC_SIZE * 2), %eax
513694
+	jg	L(more_2x_vec_till_page_cross)
513694
+
513694
+	.p2align 4,, 4
513694
+L(less_1x_vec_till_page_cross):
513694
+	subl	$-(VEC_SIZE * 4), %eax
513694
+	/* Guranteed safe to read from rdi - VEC_SIZE here. The only
513694
+	   concerning case is first iteration if incoming s1 was near start
513694
+	   of a page and s2 near end. If s1 was near the start of the page
513694
+	   we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe
513694
+	   to read back -VEC_SIZE. If rdi is truly at the start of a page
513694
+	   here, it means the previous page (rdi - VEC_SIZE) has already
513694
+	   been loaded earlier so must be valid.  */
513694
+	VMOVU	-VEC_SIZE(%rdi, %rax), %ymm0
513694
+	VPCMPEQ	-VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+
513694
+	/* Mask of potentially valid bits. The lower bits can be out of
513694
+	   range comparisons (but safe regarding page crosses).  */
513694
+	movl	$-1, %r10d
513694
+	shlxl	%esi, %r10d, %r10d
513694
+	notl	%ecx
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* The first 3 vectors matched.  Return 0 if the maximum offset
513694
-	   (%r11) <= 3 * VEC_SIZE.  */
513694
-	cmpq	$(VEC_SIZE * 3), %r11
513694
-	jbe	L(zero)
513694
-# endif
513694
-	VPCMPEQ	%ymm7, %ymm6, %ymm6
513694
-	vpmovmskb %ymm6, %esi
513694
-	tzcntl	%esi, %ecx
513694
+	cmpq	%rax, %rdx
513694
+	jbe	L(return_page_cross_end_check)
513694
+# endif
513694
+	movl	%eax, %OFFSET_REG
513694
+	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
513694
+
513694
+	andl	%r10d, %ecx
513694
+	jz	L(loop_skip_page_cross_check)
513694
+
513694
+	.p2align 4,, 3
513694
+L(return_page_cross_end):
513694
+	tzcntl	%ecx, %ecx
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	addq	$(VEC_SIZE * 3), %rcx
513694
-	cmpq	%rcx, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %esi
513694
-	cmpl	(%rdx, %rcx), %esi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	leal	-VEC_SIZE(%OFFSET_REG64, %rcx), %ecx
513694
+L(return_page_cross_cmp_mem):
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	addl	%OFFSET_REG, %ecx
513694
+# endif
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
513694
 	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 3)(%rsi, %rcx), %esi
513694
-	cmpl	(VEC_SIZE * 3)(%rdx, %rcx), %esi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(VEC_SIZE * 3)(%rax, %rcx), %eax
513694
-	movzbl	(VEC_SIZE * 3)(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
513694
+	je	L(ret8)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
+# else
513694
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
513694
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret8):
513694
 	VZEROUPPER_RETURN
513694
 
513694
-	.p2align 4
513694
-L(loop_cross_page):
513694
-	xorl	%r10d, %r10d
513694
-	movq	%rdx, %rcx
513694
-	/* Align load via RDX.  We load the extra ECX bytes which should
513694
-	   be ignored.  */
513694
-	andl	$((VEC_SIZE * 4) - 1), %ecx
513694
-	/* R10 is -RCX.  */
513694
-	subq	%rcx, %r10
513694
-
513694
-	/* This works only if VEC_SIZE * 2 == 64. */
513694
-# if (VEC_SIZE * 2) != 64
513694
-#  error (VEC_SIZE * 2) != 64
513694
-# endif
513694
-
513694
-	/* Check if the first VEC_SIZE * 2 bytes should be ignored.  */
513694
-	cmpl	$(VEC_SIZE * 2), %ecx
513694
-	jge	L(loop_cross_page_2_vec)
513694
-
513694
-	vmovdqu	(%rax, %r10), %ymm2
513694
-	vmovdqu	VEC_SIZE(%rax, %r10), %ymm3
513694
-	VPCMPEQ	(%rdx, %r10), %ymm2, %ymm0
513694
-	VPCMPEQ	VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
513694
-	VPMINU	%ymm2, %ymm0, %ymm0
513694
-	VPMINU	%ymm3, %ymm1, %ymm1
513694
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm1, %ymm1
513694
-
513694
-	vpmovmskb %ymm0, %edi
513694
-	vpmovmskb %ymm1, %esi
513694
-
513694
-	salq	$32, %rsi
513694
-	xorq	%rsi, %rdi
513694
-
513694
-	/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes.  */
513694
-	shrq	%cl, %rdi
513694
-
513694
-	testq	%rdi, %rdi
513694
-	je	L(loop_cross_page_2_vec)
513694
-	tzcntq	%rdi, %rcx
513694
 # ifdef USE_AS_STRNCMP
513694
-	cmpq	%rcx, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	.p2align 4,, 10
513694
+L(return_page_cross_end_check):
513694
+	tzcntl	%ecx, %ecx
513694
+	leal	-VEC_SIZE(%rax, %rcx), %ecx
513694
+	cmpl	%ecx, %edx
513694
+	ja	L(return_page_cross_cmp_mem)
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
-# endif
513694
 	VZEROUPPER_RETURN
513694
+# endif
513694
 
513694
-	.p2align 4
513694
-L(loop_cross_page_2_vec):
513694
-	/* The first VEC_SIZE * 2 bytes match or are ignored.  */
513694
-	vmovdqu	(VEC_SIZE * 2)(%rax, %r10), %ymm2
513694
-	vmovdqu	(VEC_SIZE * 3)(%rax, %r10), %ymm3
513694
-	VPCMPEQ	(VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
513694
-	VPMINU	%ymm2, %ymm5, %ymm5
513694
-	VPCMPEQ	(VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
513694
-	VPCMPEQ	%ymm7, %ymm5, %ymm5
513694
-	VPMINU	%ymm3, %ymm6, %ymm6
513694
-	VPCMPEQ	%ymm7, %ymm6, %ymm6
513694
-
513694
-	vpmovmskb %ymm5, %edi
513694
-	vpmovmskb %ymm6, %esi
513694
-
513694
-	salq	$32, %rsi
513694
-	xorq	%rsi, %rdi
513694
 
513694
-	xorl	%r8d, %r8d
513694
-	/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes.  */
513694
-	subl	$(VEC_SIZE * 2), %ecx
513694
-	jle	1f
513694
-	/* Skip ECX bytes.  */
513694
-	shrq	%cl, %rdi
513694
-	/* R8 has number of bytes skipped.  */
513694
-	movl	%ecx, %r8d
513694
-1:
513694
-	/* Before jumping back to the loop, set ESI to the number of
513694
-	   VEC_SIZE * 4 blocks before page crossing.  */
513694
-	movl	$(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
513694
-
513694
-	testq	%rdi, %rdi
513694
+	.p2align 4,, 10
513694
+L(more_2x_vec_till_page_cross):
513694
+	/* If more 2x vec till cross we will complete a full loop
513694
+	   iteration here.  */
513694
+
513694
+	VMOVU	VEC_SIZE(%rdi), %ymm0
513694
+	VPCMPEQ	VEC_SIZE(%rsi), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_1_end)
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* At this point, if %rdi value is 0, it already tested
513694
-	   VEC_SIZE*4+%r10 byte starting from %rax. This label
513694
-	   checks whether strncmp maximum offset reached or not.  */
513694
-	je	L(string_nbyte_offset_check)
513694
-# else
513694
-	je	L(back_to_loop)
513694
+	cmpq	$(VEC_SIZE * 2), %rdx
513694
+	jbe	L(ret_zero_in_loop_page_cross)
513694
 # endif
513694
-	tzcntq	%rdi, %rcx
513694
-	addq	%r10, %rcx
513694
-	/* Adjust for number of bytes skipped.  */
513694
-	addq	%r8, %rcx
513694
+
513694
+	subl	$-(VEC_SIZE * 4), %eax
513694
+
513694
+	/* Safe to include comparisons from lower bytes.  */
513694
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %ymm0
513694
+	VPCMPEQ	-(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_page_cross_0)
513694
+
513694
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %ymm0
513694
+	VPCMPEQ	-(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+	jnz	L(return_vec_page_cross_1)
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	addq	$(VEC_SIZE * 2), %rcx
513694
-	subq	%rcx, %r11
513694
-	jbe	L(zero)
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
+	/* Must check length here as length might proclude reading next
513694
+	   page.  */
513694
+	cmpq	%rax, %rdx
513694
+	jbe	L(ret_zero_in_loop_page_cross)
513694
+# endif
513694
+
513694
+	/* Finish the loop.  */
513694
+	VMOVA	(VEC_SIZE * 2)(%rdi), %ymm4
513694
+	VMOVA	(VEC_SIZE * 3)(%rdi), %ymm6
513694
+
513694
+	VPCMPEQ	(VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
513694
+	VPCMPEQ	(VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
513694
+	vpand	%ymm4, %ymm5, %ymm5
513694
+	vpand	%ymm6, %ymm7, %ymm7
513694
+	VPMINU	%ymm5, %ymm7, %ymm7
513694
+	VPCMPEQ	%ymm7, %ymmZERO, %ymm7
513694
+	vpmovmskb %ymm7, %LOOP_REG
513694
+	testl	%LOOP_REG, %LOOP_REG
513694
+	jnz	L(return_vec_2_3_end)
513694
+
513694
+	/* Best for code size to include ucond-jmp here. Would be faster
513694
+	   if this case is hot to duplicate the L(return_vec_2_3_end) code
513694
+	   as fall-through and have jump back to loop on mismatch
513694
+	   comparison.  */
513694
+	subq	$-(VEC_SIZE * 4), %rdi
513694
+	subq	$-(VEC_SIZE * 4), %rsi
513694
+	addl	$(PAGE_SIZE - VEC_SIZE * 8), %eax
513694
+# ifdef USE_AS_STRNCMP
513694
+	subq	$(VEC_SIZE * 4), %rdx
513694
+	ja	L(loop_skip_page_cross_check)
513694
+L(ret_zero_in_loop_page_cross):
513694
 	xorl	%eax, %eax
513694
-	movl	(%rsi, %rcx), %edi
513694
-	cmpl	(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rax, %rcx), %eax
513694
-	movzbl	(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	VZEROUPPER_RETURN
513694
 # else
513694
-#  ifdef USE_AS_WCSCMP
513694
-	movq	%rax, %rsi
513694
-	xorl	%eax, %eax
513694
-	movl	(VEC_SIZE * 2)(%rsi, %rcx), %edi
513694
-	cmpl	(VEC_SIZE * 2)(%rdx, %rcx), %edi
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(VEC_SIZE * 2)(%rax, %rcx), %eax
513694
-	movzbl	(VEC_SIZE * 2)(%rdx, %rcx), %edx
513694
-	subl	%edx, %eax
513694
-#  endif
513694
+	jmp	L(loop_skip_page_cross_check)
513694
 # endif
513694
-	VZEROUPPER_RETURN
513694
 
513694
+
513694
+	.p2align 4,, 10
513694
+L(return_vec_page_cross_0):
513694
+	addl	$-VEC_SIZE, %eax
513694
+L(return_vec_page_cross_1):
513694
+	tzcntl	%ecx, %ecx
513694
 # ifdef USE_AS_STRNCMP
513694
-L(string_nbyte_offset_check):
513694
-	leaq	(VEC_SIZE * 4)(%r10), %r10
513694
-	cmpq	%r10, %r11
513694
-	jbe	L(zero)
513694
-	jmp	L(back_to_loop)
513694
+	leal	-VEC_SIZE(%rax, %rcx), %ecx
513694
+	cmpq	%rcx, %rdx
513694
+	jbe	L(ret_zero_in_loop_page_cross)
513694
+# else
513694
+	addl	%eax, %ecx
513694
 # endif
513694
 
513694
-	.p2align 4
513694
-L(cross_page_loop):
513694
-	/* Check one byte/dword at a time.  */
513694
 # ifdef USE_AS_WCSCMP
513694
-	cmpl	%ecx, %eax
513694
+	movl	VEC_OFFSET(%rdi, %rcx), %edx
513694
+	xorl	%eax, %eax
513694
+	cmpl	VEC_OFFSET(%rsi, %rcx), %edx
513694
+	je	L(ret9)
513694
+	setl	%al
513694
+	negl	%eax
513694
+	xorl	%r8d, %eax
513694
 # else
513694
+	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
513694
+	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
513694
 	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
-	jne	L(different)
513694
-	addl	$SIZE_OF_CHAR, %edx
513694
-	cmpl	$(VEC_SIZE * 4), %edx
513694
-	je	L(main_loop_header)
513694
-# ifdef USE_AS_STRNCMP
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+L(ret9):
513694
+	VZEROUPPER_RETURN
513694
+
513694
+
513694
+	.p2align 4,, 10
513694
+L(page_cross):
513694
+# ifndef USE_AS_STRNCMP
513694
+	/* If both are VEC aligned we don't need any special logic here.
513694
+	   Only valid for strcmp where stop condition is guranteed to be
513694
+	   reachable by just reading memory.  */
513694
+	testl	$((VEC_SIZE - 1) << 20), %eax
513694
+	jz	L(no_page_cross)
513694
 # endif
513694
+
513694
+	movl	%edi, %eax
513694
+	movl	%esi, %ecx
513694
+	andl	$(PAGE_SIZE - 1), %eax
513694
+	andl	$(PAGE_SIZE - 1), %ecx
513694
+
513694
+	xorl	%OFFSET_REG, %OFFSET_REG
513694
+
513694
+	/* Check which is closer to page cross, s1 or s2.  */
513694
+	cmpl	%eax, %ecx
513694
+	jg	L(page_cross_s2)
513694
+
513694
+	/* The previous page cross check has false positives. Check for
513694
+	   true positive as page cross logic is very expensive.  */
513694
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
513694
+	jbe	L(no_page_cross)
513694
+
513694
+	/* Set r8 to not interfere with normal return value (rdi and rsi
513694
+	   did not swap).  */
513694
 # ifdef USE_AS_WCSCMP
513694
-	movl	(%rdi, %rdx), %eax
513694
-	movl	(%rsi, %rdx), %ecx
513694
+	/* any non-zero positive value that doesn't inference with 0x1.
513694
+	 */
513694
+	movl	$2, %r8d
513694
 # else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %ecx
513694
+	xorl	%r8d, %r8d
513694
 # endif
513694
-	/* Check null char.  */
513694
-	testl	%eax, %eax
513694
-	jne	L(cross_page_loop)
513694
-	/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
513694
-	   comparisons.  */
513694
-	subl	%ecx, %eax
513694
-# ifndef USE_AS_WCSCMP
513694
-L(different):
513694
+
513694
+	/* Check if less than 1x VEC till page cross.  */
513694
+	subl	$(VEC_SIZE * 3), %eax
513694
+	jg	L(less_1x_vec_till_page)
513694
+
513694
+	/* If more than 1x VEC till page cross, loop throuh safely
513694
+	   loadable memory until within 1x VEC of page cross.  */
513694
+
513694
+	.p2align 4,, 10
513694
+L(page_cross_loop):
513694
+
513694
+	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
513694
+	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incl	%ecx
513694
+
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+	addl	$VEC_SIZE, %OFFSET_REG
513694
+# ifdef USE_AS_STRNCMP
513694
+	cmpq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross)
513694
 # endif
513694
-	VZEROUPPER_RETURN
513694
+	addl	$VEC_SIZE, %eax
513694
+	jl	L(page_cross_loop)
513694
+
513694
+	subl	%eax, %OFFSET_REG
513694
+	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
513694
+	   to not cross page so is safe to load. Since we have already
513694
+	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
513694
+	 */
513694
+
513694
+	VMOVU	(%rdi, %OFFSET_REG64), %ymm0
513694
+	VPCMPEQ	(%rsi, %OFFSET_REG64), %ymm0, %ymm1
513694
+	VPCMPEQ	%ymm0, %ymmZERO, %ymm2
513694
+	vpandn	%ymm1, %ymm2, %ymm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	leal	VEC_SIZE(%OFFSET_REG64), %eax
513694
+	cmpq	%rax, %rdx
513694
+	jbe	L(check_ret_vec_page_cross2)
513694
+	addq	%rdi, %rdx
513694
+# endif
513694
+	incl	%ecx
513694
+	jz	L(prepare_loop_no_len)
513694
 
513694
+	.p2align 4,, 4
513694
+L(ret_vec_page_cross):
513694
+# ifndef USE_AS_STRNCMP
513694
+L(check_ret_vec_page_cross):
513694
+# endif
513694
+	tzcntl	%ecx, %ecx
513694
+	addl	%OFFSET_REG, %ecx
513694
+L(ret_vec_page_cross_cont):
513694
 # ifdef USE_AS_WCSCMP
513694
-	.p2align 4
513694
-L(different):
513694
-	/* Use movl to avoid modifying EFLAGS.  */
513694
-	movl	$0, %eax
513694
+	movl	(%rdi, %rcx), %edx
513694
+	xorl	%eax, %eax
513694
+	cmpl	(%rsi, %rcx), %edx
513694
+	je	L(ret12)
513694
 	setl	%al
513694
 	negl	%eax
513694
-	orl	$1, %eax
513694
-	VZEROUPPER_RETURN
513694
+	xorl	%r8d, %eax
513694
+# else
513694
+	movzbl	(%rdi, %rcx), %eax
513694
+	movzbl	(%rsi, %rcx), %ecx
513694
+	subl	%ecx, %eax
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
 # endif
513694
+L(ret12):
513694
+	VZEROUPPER_RETURN
513694
 
513694
 # ifdef USE_AS_STRNCMP
513694
-	.p2align 4
513694
-L(zero):
513694
+	.p2align 4,, 10
513694
+L(check_ret_vec_page_cross2):
513694
+	incl	%ecx
513694
+L(check_ret_vec_page_cross):
513694
+	tzcntl	%ecx, %ecx
513694
+	addl	%OFFSET_REG, %ecx
513694
+	cmpq	%rcx, %rdx
513694
+	ja	L(ret_vec_page_cross_cont)
513694
+	.p2align 4,, 2
513694
+L(ret_zero_page_cross):
513694
 	xorl	%eax, %eax
513694
 	VZEROUPPER_RETURN
513694
+# endif
513694
 
513694
-	.p2align 4
513694
-L(char0):
513694
-#  ifdef USE_AS_WCSCMP
513694
-	xorl	%eax, %eax
513694
-	movl	(%rdi), %ecx
513694
-	cmpl	(%rsi), %ecx
513694
-	jne	L(wcscmp_return)
513694
-#  else
513694
-	movzbl	(%rsi), %ecx
513694
-	movzbl	(%rdi), %eax
513694
-	subl	%ecx, %eax
513694
-#  endif
513694
-	VZEROUPPER_RETURN
513694
+	.p2align 4,, 4
513694
+L(page_cross_s2):
513694
+	/* Ensure this is a true page cross.  */
513694
+	subl	$(PAGE_SIZE - VEC_SIZE * 4), %ecx
513694
+	jbe	L(no_page_cross)
513694
+
513694
+
513694
+	movl	%ecx, %eax
513694
+	movq	%rdi, %rcx
513694
+	movq	%rsi, %rdi
513694
+	movq	%rcx, %rsi
513694
+
513694
+	/* set r8 to negate return value as rdi and rsi swapped.  */
513694
+# ifdef USE_AS_WCSCMP
513694
+	movl	$-4, %r8d
513694
+# else
513694
+	movl	$-1, %r8d
513694
 # endif
513694
+	xorl	%OFFSET_REG, %OFFSET_REG
513694
 
513694
-	.p2align 4
513694
-L(last_vector):
513694
-	addq	%rdx, %rdi
513694
-	addq	%rdx, %rsi
513694
+	/* Check if more than 1x VEC till page cross.  */
513694
+	subl	$(VEC_SIZE * 3), %eax
513694
+	jle	L(page_cross_loop)
513694
+
513694
+	.p2align 4,, 6
513694
+L(less_1x_vec_till_page):
513694
+	/* Find largest load size we can use.  */
513694
+	cmpl	$16, %eax
513694
+	ja	L(less_16_till_page)
513694
+
513694
+	VMOVU	(%rdi), %xmm0
513694
+	VPCMPEQ	(%rsi), %xmm0, %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incw	%cx
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+	movl	$16, %OFFSET_REG
513694
 # ifdef USE_AS_STRNCMP
513694
-	subq	%rdx, %r11
513694
+	cmpq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case0)
513694
+	subl	%eax, %OFFSET_REG
513694
+# else
513694
+	/* Explicit check for 16 byte alignment.  */
513694
+	subl	%eax, %OFFSET_REG
513694
+	jz	L(prepare_loop)
513694
 # endif
513694
-	tzcntl	%ecx, %edx
513694
+
513694
+	VMOVU	(%rdi, %OFFSET_REG64), %xmm0
513694
+	VPCMPEQ	(%rsi, %OFFSET_REG64), %xmm0, %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incw	%cx
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+	addl	$16, %OFFSET_REG
513694
+	subq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case0)
513694
+	subq	$-(VEC_SIZE * 4), %rdx
513694
+
513694
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
+# else
513694
+	leaq	(16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	(16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
 # endif
513694
-# ifdef USE_AS_WCSCMP
513694
+	jmp	L(prepare_loop_aligned)
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	.p2align 4,, 2
513694
+L(ret_zero_page_cross_slow_case0):
513694
 	xorl	%eax, %eax
513694
-	movl	(%rdi, %rdx), %ecx
513694
-	cmpl	(%rsi, %rdx), %ecx
513694
-	jne	L(wcscmp_return)
513694
-# else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %edx
513694
-	subl	%edx, %eax
513694
+	ret
513694
 # endif
513694
-	VZEROUPPER_RETURN
513694
 
513694
-	/* Comparing on page boundary region requires special treatment:
513694
-	   It must done one vector at the time, starting with the wider
513694
-	   ymm vector if possible, if not, with xmm. If fetching 16 bytes
513694
-	   (xmm) still passes the boundary, byte comparison must be done.
513694
-	 */
513694
-	.p2align 4
513694
-L(cross_page):
513694
-	/* Try one ymm vector at a time.  */
513694
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
513694
-	jg	L(cross_page_1_vector)
513694
-L(loop_1_vector):
513694
-	vmovdqu	(%rdi, %rdx), %ymm1
513694
-	VPCMPEQ	(%rsi, %rdx), %ymm1, %ymm0
513694
-	VPMINU	%ymm1, %ymm0, %ymm0
513694
-	VPCMPEQ	%ymm7, %ymm0, %ymm0
513694
-	vpmovmskb %ymm0, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(last_vector)
513694
 
513694
-	addl	$VEC_SIZE, %edx
513694
+	.p2align 4,, 10
513694
+L(less_16_till_page):
513694
+	/* Find largest load size we can use.  */
513694
+	cmpl	$24, %eax
513694
+	ja	L(less_8_till_page)
513694
 
513694
-	addl	$VEC_SIZE, %eax
513694
-# ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
513694
-	   (%r11).  */
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-# endif
513694
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
513694
-	jle	L(loop_1_vector)
513694
-L(cross_page_1_vector):
513694
-	/* Less than 32 bytes to check, try one xmm vector.  */
513694
-	cmpl	$(PAGE_SIZE - 16), %eax
513694
-	jg	L(cross_page_1_xmm)
513694
-	vmovdqu	(%rdi, %rdx), %xmm1
513694
-	VPCMPEQ	(%rsi, %rdx), %xmm1, %xmm0
513694
-	VPMINU	%xmm1, %xmm0, %xmm0
513694
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
513694
-	vpmovmskb %xmm0, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(last_vector)
513694
+	vmovq	(%rdi), %xmm0
513694
+	vmovq	(%rsi), %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incb	%cl
513694
+	jnz	L(check_ret_vec_page_cross)
513694
 
513694
-	addl	$16, %edx
513694
-# ifndef USE_AS_WCSCMP
513694
-	addl	$16, %eax
513694
+
513694
+# ifdef USE_AS_STRNCMP
513694
+	cmpq	$8, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case0)
513694
 # endif
513694
+	movl	$24, %OFFSET_REG
513694
+	/* Explicit check for 16 byte alignment.  */
513694
+	subl	%eax, %OFFSET_REG
513694
+
513694
+
513694
+
513694
+	vmovq	(%rdi, %OFFSET_REG64), %xmm0
513694
+	vmovq	(%rsi, %OFFSET_REG64), %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	incb	%cl
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+
513694
 # ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
513694
-	   (%r11).  */
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-# endif
513694
-
513694
-L(cross_page_1_xmm):
513694
-# ifndef USE_AS_WCSCMP
513694
-	/* Less than 16 bytes to check, try 8 byte vector.  NB: No need
513694
-	   for wcscmp nor wcsncmp since wide char is 4 bytes.   */
513694
-	cmpl	$(PAGE_SIZE - 8), %eax
513694
-	jg	L(cross_page_8bytes)
513694
-	vmovq	(%rdi, %rdx), %xmm1
513694
-	vmovq	(%rsi, %rdx), %xmm0
513694
-	VPCMPEQ	%xmm0, %xmm1, %xmm0
513694
-	VPMINU	%xmm1, %xmm0, %xmm0
513694
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
513694
-	vpmovmskb %xmm0, %ecx
513694
-	/* Only last 8 bits are valid.  */
513694
-	andl	$0xff, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(last_vector)
513694
+	addl	$8, %OFFSET_REG
513694
+	subq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case0)
513694
+	subq	$-(VEC_SIZE * 4), %rdx
513694
 
513694
-	addl	$8, %edx
513694
-	addl	$8, %eax
513694
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
+# else
513694
+	leaq	(8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	(8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
+# endif
513694
+	jmp	L(prepare_loop_aligned)
513694
+
513694
+
513694
+	.p2align 4,, 10
513694
+L(less_8_till_page):
513694
+# ifdef USE_AS_WCSCMP
513694
+	/* If using wchar then this is the only check before we reach
513694
+	   the page boundary.  */
513694
+	movl	(%rdi), %eax
513694
+	movl	(%rsi), %ecx
513694
+	cmpl	%ecx, %eax
513694
+	jnz	L(ret_less_8_wcs)
513694
 #  ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
513694
-	   (%r11).  */
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+	addq	%rdi, %rdx
513694
+	/* We already checked for len <= 1 so cannot hit that case here.
513694
+	 */
513694
 #  endif
513694
+	testl	%eax, %eax
513694
+	jnz	L(prepare_loop_no_len)
513694
+	ret
513694
 
513694
-L(cross_page_8bytes):
513694
-	/* Less than 8 bytes to check, try 4 byte vector.  */
513694
-	cmpl	$(PAGE_SIZE - 4), %eax
513694
-	jg	L(cross_page_4bytes)
513694
-	vmovd	(%rdi, %rdx), %xmm1
513694
-	vmovd	(%rsi, %rdx), %xmm0
513694
-	VPCMPEQ	%xmm0, %xmm1, %xmm0
513694
-	VPMINU	%xmm1, %xmm0, %xmm0
513694
-	VPCMPEQ	%xmm7, %xmm0, %xmm0
513694
-	vpmovmskb %xmm0, %ecx
513694
-	/* Only last 4 bits are valid.  */
513694
-	andl	$0xf, %ecx
513694
-	testl	%ecx, %ecx
513694
-	jne	L(last_vector)
513694
+	.p2align 4,, 8
513694
+L(ret_less_8_wcs):
513694
+	setl	%OFFSET_REG8
513694
+	negl	%OFFSET_REG
513694
+	movl	%OFFSET_REG, %eax
513694
+	xorl	%r8d, %eax
513694
+	ret
513694
+
513694
+# else
513694
+
513694
+	/* Find largest load size we can use.  */
513694
+	cmpl	$28, %eax
513694
+	ja	L(less_4_till_page)
513694
+
513694
+	vmovd	(%rdi), %xmm0
513694
+	vmovd	(%rsi), %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	subl	$0xf, %ecx
513694
+	jnz	L(check_ret_vec_page_cross)
513694
 
513694
-	addl	$4, %edx
513694
 #  ifdef USE_AS_STRNCMP
513694
-	/* Return 0 if the current offset (%rdx) >= the maximum offset
513694
-	   (%r11).  */
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
+	cmpq	$4, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case1)
513694
 #  endif
513694
+	movl	$28, %OFFSET_REG
513694
+	/* Explicit check for 16 byte alignment.  */
513694
+	subl	%eax, %OFFSET_REG
513694
 
513694
-L(cross_page_4bytes):
513694
-# endif
513694
-	/* Less than 4 bytes to check, try one byte/dword at a time.  */
513694
-# ifdef USE_AS_STRNCMP
513694
-	cmpq	%r11, %rdx
513694
-	jae	L(zero)
513694
-# endif
513694
-# ifdef USE_AS_WCSCMP
513694
-	movl	(%rdi, %rdx), %eax
513694
-	movl	(%rsi, %rdx), %ecx
513694
-# else
513694
-	movzbl	(%rdi, %rdx), %eax
513694
-	movzbl	(%rsi, %rdx), %ecx
513694
-# endif
513694
-	testl	%eax, %eax
513694
-	jne	L(cross_page_loop)
513694
+
513694
+
513694
+	vmovd	(%rdi, %OFFSET_REG64), %xmm0
513694
+	vmovd	(%rsi, %OFFSET_REG64), %xmm1
513694
+	VPCMPEQ	%xmm0, %xmmZERO, %xmm2
513694
+	VPCMPEQ	%xmm1, %xmm0, %xmm1
513694
+	vpandn	%xmm1, %xmm2, %xmm1
513694
+	vpmovmskb %ymm1, %ecx
513694
+	subl	$0xf, %ecx
513694
+	jnz	L(check_ret_vec_page_cross)
513694
+
513694
+#  ifdef USE_AS_STRNCMP
513694
+	addl	$4, %OFFSET_REG
513694
+	subq	%OFFSET_REG64, %rdx
513694
+	jbe	L(ret_zero_page_cross_slow_case1)
513694
+	subq	$-(VEC_SIZE * 4), %rdx
513694
+
513694
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
+#  else
513694
+	leaq	(4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi
513694
+	leaq	(4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi
513694
+#  endif
513694
+	jmp	L(prepare_loop_aligned)
513694
+
513694
+#  ifdef USE_AS_STRNCMP
513694
+	.p2align 4,, 2
513694
+L(ret_zero_page_cross_slow_case1):
513694
+	xorl	%eax, %eax
513694
+	ret
513694
+#  endif
513694
+
513694
+	.p2align 4,, 10
513694
+L(less_4_till_page):
513694
+	subq	%rdi, %rsi
513694
+	/* Extremely slow byte comparison loop.  */
513694
+L(less_4_loop):
513694
+	movzbl	(%rdi), %eax
513694
+	movzbl	(%rsi, %rdi), %ecx
513694
 	subl	%ecx, %eax
513694
-	VZEROUPPER_RETURN
513694
-END (STRCMP)
513694
+	jnz	L(ret_less_4_loop)
513694
+	testl	%ecx, %ecx
513694
+	jz	L(ret_zero_4_loop)
513694
+#  ifdef USE_AS_STRNCMP
513694
+	decq	%rdx
513694
+	jz	L(ret_zero_4_loop)
513694
+#  endif
513694
+	incq	%rdi
513694
+	/* end condition is reach page boundary (rdi is aligned).  */
513694
+	testl	$31, %edi
513694
+	jnz	L(less_4_loop)
513694
+	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
513694
+	addq	$-(VEC_SIZE * 4), %rdi
513694
+#  ifdef USE_AS_STRNCMP
513694
+	subq	$-(VEC_SIZE * 4), %rdx
513694
+#  endif
513694
+	jmp	L(prepare_loop_aligned)
513694
+
513694
+L(ret_zero_4_loop):
513694
+	xorl	%eax, %eax
513694
+	ret
513694
+L(ret_less_4_loop):
513694
+	xorl	%r8d, %eax
513694
+	subl	%r8d, %eax
513694
+	ret
513694
+# endif
513694
+END(STRCMP)
513694
 #endif
-- 
GitLab