Tree - rpms/glibc - CentOS Git server

rpms / glibc

Blame SOURCES/ia-opt-memcmp-avx2-movbe.patch

Blob History Raw

		190885	`From 65438851072f6131049a0ae471dcab90870e51f3 Mon Sep 17 00:00:00 2001`
		190885	`From: Noah Goldstein <goldstein.w.n@gmail.com>`
		190885	`Date: Mon, 17 May 2021 13:56:52 -0400`
		190885	`Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S`
		190885
		190885	`No bug. This commit optimizes memcmp-avx2-movbe.S. The optimizations`
		190885	`include adding a new vec compare path for small sizes, reorganizing the`
		190885	`entry control flow, and removing some unnecessary ALU instructions from`
		190885	`the main loop. test-memcmp and test-wmemcmp are both passing.`
		190885
		190885	`Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>`
		190885	`Reviewed-by: H.J. Lu <hjl.tools@gmail.com>`
		190885	`(cherry picked from commit 16d12015c57701b08d7bbed6ec536641bcafb428)`
		190885	`---`
		190885	`sysdeps/x86_64/multiarch/ifunc-impl-list.c \| 6 +`
		190885	`sysdeps/x86_64/multiarch/ifunc-memcmp.h \| 1 +`
		190885	`sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S \| 676 +++++++++++--------`
		190885	`3 files changed, 402 insertions(+), 281 deletions(-)`
		190885
		190885	`diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c`
		190885	`index ac097e8d..8be0d78a 100644`
		190885	`--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c`
		190885	`+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c`
		190885	`@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char name, struct libc_ifunc_impl array,`
		190885	`IFUNC_IMPL (i, name, memcmp,`
		190885	`IFUNC_IMPL_ADD (array, i, memcmp,`
		190885	`(CPU_FEATURE_USABLE (AVX2)`
		190885	`+ && CPU_FEATURE_USABLE (BMI2)`
		190885	`&& CPU_FEATURE_USABLE (MOVBE)),`
		190885	`__memcmp_avx2_movbe)`
		190885	`IFUNC_IMPL_ADD (array, i, memcmp,`
		190885	`(CPU_FEATURE_USABLE (AVX2)`
		190885	`+ && CPU_FEATURE_USABLE (BMI2)`
		190885	`&& CPU_FEATURE_USABLE (MOVBE)`
		190885	`&& CPU_FEATURE_USABLE (RTM)),`
		190885	`__memcmp_avx2_movbe_rtm)`
		190885	`IFUNC_IMPL_ADD (array, i, memcmp,`
		190885	`(CPU_FEATURE_USABLE (AVX512VL)`
		190885	`&& CPU_FEATURE_USABLE (AVX512BW)`
		190885	`+ && CPU_FEATURE_USABLE (BMI2)`
		190885	`&& CPU_FEATURE_USABLE (MOVBE)),`
		190885	`__memcmp_evex_movbe)`
		190885	`IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),`
		190885	`@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char name, struct libc_ifunc_impl array,`
		190885	`IFUNC_IMPL (i, name, wmemcmp,`
		190885	`IFUNC_IMPL_ADD (array, i, wmemcmp,`
		190885	`(CPU_FEATURE_USABLE (AVX2)`
		190885	`+ && CPU_FEATURE_USABLE (BMI2)`
		190885	`&& CPU_FEATURE_USABLE (MOVBE)),`
		190885	`__wmemcmp_avx2_movbe)`
		190885	`IFUNC_IMPL_ADD (array, i, wmemcmp,`
		190885	`(CPU_FEATURE_USABLE (AVX2)`
		190885	`+ && CPU_FEATURE_USABLE (BMI2)`
		190885	`&& CPU_FEATURE_USABLE (MOVBE)`
		190885	`&& CPU_FEATURE_USABLE (RTM)),`
		190885	`__wmemcmp_avx2_movbe_rtm)`
		190885	`IFUNC_IMPL_ADD (array, i, wmemcmp,`
		190885	`(CPU_FEATURE_USABLE (AVX512VL)`
		190885	`&& CPU_FEATURE_USABLE (AVX512BW)`
		190885	`+ && CPU_FEATURE_USABLE (BMI2)`
		190885	`&& CPU_FEATURE_USABLE (MOVBE)),`
		190885	`__wmemcmp_evex_movbe)`
		190885	`IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),`
		190885	`diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h`
		190885	`index 8043c635..690dffe8 100644`
		190885	`--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h`
		190885	`+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h`
		190885	`@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void)`
		190885
		190885	`if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)`
		190885	`&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)`
		190885	`+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)`
		190885	`&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))`
		190885	`{`
		190885	`if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)`
		190885	`diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S`
		190885	`index 9d5c9c72..16fc673e 100644`
		190885	`--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S`
		190885	`+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S`
		190885	`@@ -19,17 +19,23 @@`
		190885	`#if IS_IN (libc)`
		190885
		190885	`/* memcmp/wmemcmp is implemented as:`
		190885	`- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap`
		190885	`- to avoid branches.`
		190885	`- 2. Use overlapping compare to avoid branch.`
		190885	`- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8`
		190885	`- bytes for wmemcmp.`
		190885	`- 4. If size is 8 * VEC_SIZE or less, unroll the loop.`
		190885	`- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory`
		190885	`+ 1. Use ymm vector compares when possible. The only case where`
		190885	`+ vector compares are not possible is when size < VEC_SIZE`
		190885	`+ and loading from either s1 or s2 would cause a page cross.`
		190885	`+ 2. For size from 2 to 7 bytes on page cross, load as big endian`
		190885	`+ with movbe and bswap to avoid branches.`
		190885	`+ 3. Use xmm vector compare when size >= 4 bytes for memcmp or`
		190885	`+ size >= 8 bytes for wmemcmp.`
		190885	`+ 4. Optimistically compare up to first 4 * VEC_SIZE one at a`
		190885	`+ time to check for early mismatches. Only do this if it's`
		190885	`+ guaranteed the work is not wasted.`
		190885	`+ 5. If size is 8 * VEC_SIZE or less, unroll the loop.`
		190885	`+ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory`
		190885	`area.`
		190885	`- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.`
		190885	`- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.`
		190885	`- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */`
		190885	`+ 7. Use 2 vector compares when size is 2 * VEC_SIZE or less.`
		190885	`+ 8. Use 4 vector compares when size is 4 * VEC_SIZE or less.`
		190885	`+ 9. Use 8 vector compares when size is 8 * VEC_SIZE or less. */`
		190885	`+`
		190885
		190885	`# include <sysdep.h>`
		190885
		190885	`@@ -38,8 +44,10 @@`
		190885	`# endif`
		190885
		190885	`# ifdef USE_AS_WMEMCMP`
		190885	`+# define CHAR_SIZE 4`
		190885	`# define VPCMPEQ vpcmpeqd`
		190885	`# else`
		190885	`+# define CHAR_SIZE 1`
		190885	`# define VPCMPEQ vpcmpeqb`
		190885	`# endif`
		190885
		190885	`@@ -52,7 +60,7 @@`
		190885	`# endif`
		190885
		190885	`# define VEC_SIZE 32`
		190885	`-# define VEC_MASK ((1 << VEC_SIZE) - 1)`
		190885	`+# define PAGE_SIZE 4096`
		190885
		190885	`/* Warning!`
		190885	`wmemcmp has to use SIGNED comparison for elements.`
		190885	`@@ -71,136 +79,359 @@ ENTRY (MEMCMP)`
		190885	`jb L(less_vec)`
		190885
		190885	`/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */`
		190885	`- vmovdqu (%rsi), %ymm2`
		190885	`- VPCMPEQ (%rdi), %ymm2, %ymm2`
		190885	`- vpmovmskb %ymm2, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- jnz L(first_vec)`
		190885	`+ vmovdqu (%rsi), %ymm1`
		190885	`+ VPCMPEQ (%rdi), %ymm1, %ymm1`
		190885	`+ vpmovmskb %ymm1, %eax`
		190885	`+ /* NB: eax must be destination register if going to`
		190885	`+ L(return_vec_[0,2]). For L(return_vec_3) destination register`
		190885	`+ must be ecx. */`
		190885	`+ incl %eax`
		190885	`+ jnz L(return_vec_0)`
		190885
		190885	`cmpq $(VEC_SIZE * 2), %rdx`
		190885	`- jbe L(last_vec)`
		190885	`-`
		190885	`- VPCMPEQ %ymm0, %ymm0, %ymm0`
		190885	`- /* More than 2 * VEC. */`
		190885	`- cmpq $(VEC_SIZE * 8), %rdx`
		190885	`- ja L(more_8x_vec)`
		190885	`- cmpq $(VEC_SIZE * 4), %rdx`
		190885	`- jb L(last_4x_vec)`
		190885	`-`
		190885	`- /* From 4 * VEC to 8 * VEC, inclusively. */`
		190885	`- vmovdqu (%rsi), %ymm1`
		190885	`- VPCMPEQ (%rdi), %ymm1, %ymm1`
		190885	`+ jbe L(last_1x_vec)`
		190885
		190885	`+ /* Check second VEC no matter what. */`
		190885	`vmovdqu VEC_SIZE(%rsi), %ymm2`
		190885	`- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2`
		190885	`+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2`
		190885	`+ vpmovmskb %ymm2, %eax`
		190885	`+ /* If this VEC compared equal eax will be all 1s so incl will`
		190885	`+ overflow and set zero flag. */`
		190885	`+ incl %eax`
		190885	`+ jnz L(return_vec_1)`
		190885
		190885	`- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3`
		190885	`- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3`
		190885	`+ /* Less than 4 * VEC. */`
		190885	`+ cmpq $(VEC_SIZE * 4), %rdx`
		190885	`+ jbe L(last_2x_vec)`
		190885
		190885	`+ /* Check third and fourth VEC no matter what. */`
		190885	`+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3`
		190885	`+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3`
		190885	`+ vpmovmskb %ymm3, %eax`
		190885	`+ incl %eax`
		190885	`+ jnz L(return_vec_2)`
		190885	`vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4`
		190885	`- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4`
		190885	`+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4`
		190885	`+ vpmovmskb %ymm4, %ecx`
		190885	`+ incl %ecx`
		190885	`+ jnz L(return_vec_3)`
		190885
		190885	`- vpand %ymm1, %ymm2, %ymm5`
		190885	`- vpand %ymm3, %ymm4, %ymm6`
		190885	`- vpand %ymm5, %ymm6, %ymm5`
		190885	`+ /* Go to 4x VEC loop. */`
		190885	`+ cmpq $(VEC_SIZE * 8), %rdx`
		190885	`+ ja L(more_8x_vec)`
		190885
		190885	`- vptest %ymm0, %ymm5`
		190885	`- jnc L(4x_vec_end)`
		190885	`+ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any`
		190885	`+ branches. */`
		190885
		190885	`+ /* Load first two VEC from s2 before adjusting addresses. */`
		190885	`+ vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1`
		190885	`+ vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2`
		190885	`leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi`
		190885	`leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi`
		190885	`- vmovdqu (%rsi), %ymm1`
		190885	`- VPCMPEQ (%rdi), %ymm1, %ymm1`
		190885
		190885	`- vmovdqu VEC_SIZE(%rsi), %ymm2`
		190885	`- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2`
		190885	`- vpand %ymm2, %ymm1, %ymm5`
		190885	`+ /* Wait to load from s1 until addresses are adjusted, due to`
		190885	`+ unlamination of microfusion with complex address mode. */`
		190885	`+ VPCMPEQ (%rdi), %ymm1, %ymm1`
		190885	`+ VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2`
		190885
		190885	`vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3`
		190885	`- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3`
		190885	`- vpand %ymm3, %ymm5, %ymm5`
		190885	`-`
		190885	`+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3`
		190885	`vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4`
		190885	`- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4`
		190885	`- vpand %ymm4, %ymm5, %ymm5`
		190885	`+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4`
		190885
		190885	`- vptest %ymm0, %ymm5`
		190885	`- jnc L(4x_vec_end)`
		190885	`- xorl %eax, %eax`
		190885	`+ /* Reduce VEC0 - VEC3. */`
		190885	`+ vpand %ymm1, %ymm2, %ymm5`
		190885	`+ vpand %ymm3, %ymm4, %ymm6`
		190885	`+ vpand %ymm5, %ymm6, %ymm7`
		190885	`+ vpmovmskb %ymm7, %ecx`
		190885	`+ incl %ecx`
		190885	`+ jnz L(return_vec_0_1_2_3)`
		190885	`+ /* NB: eax must be zero to reach here. */`
		190885	`+ VZEROUPPER_RETURN`
		190885	`+`
		190885	`+ .p2align 4`
		190885	`+L(return_vec_0):`
		190885	`+ tzcntl %eax, %eax`
		190885	`+# ifdef USE_AS_WMEMCMP`
		190885	`+ movl (%rdi, %rax), %ecx`
		190885	`+ xorl %edx, %edx`
		190885	`+ cmpl (%rsi, %rax), %ecx`
		190885	`+ /* NB: no partial register stall here because xorl zero idiom`
		190885	`+ above. */`
		190885	`+ setg %dl`
		190885	`+ leal -1(%rdx, %rdx), %eax`
		190885	`+# else`
		190885	`+ movzbl (%rsi, %rax), %ecx`
		190885	`+ movzbl (%rdi, %rax), %eax`
		190885	`+ subl %ecx, %eax`
		190885	`+# endif`
		190885	`L(return_vzeroupper):`
		190885	`ZERO_UPPER_VEC_REGISTERS_RETURN`
		190885
		190885	`.p2align 4`
		190885	`-L(last_2x_vec):`
		190885	`- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */`
		190885	`- vmovdqu (%rsi), %ymm2`
		190885	`- VPCMPEQ (%rdi), %ymm2, %ymm2`
		190885	`- vpmovmskb %ymm2, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- jnz L(first_vec)`
		190885	`+L(return_vec_1):`
		190885	`+ tzcntl %eax, %eax`
		190885	`+# ifdef USE_AS_WMEMCMP`
		190885	`+ movl VEC_SIZE(%rdi, %rax), %ecx`
		190885	`+ xorl %edx, %edx`
		190885	`+ cmpl VEC_SIZE(%rsi, %rax), %ecx`
		190885	`+ setg %dl`
		190885	`+ leal -1(%rdx, %rdx), %eax`
		190885	`+# else`
		190885	`+ movzbl VEC_SIZE(%rsi, %rax), %ecx`
		190885	`+ movzbl VEC_SIZE(%rdi, %rax), %eax`
		190885	`+ subl %ecx, %eax`
		190885	`+# endif`
		190885	`+ VZEROUPPER_RETURN`
		190885	`+`
		190885	`+ .p2align 4`
		190885	`+L(return_vec_2):`
		190885	`+ tzcntl %eax, %eax`
		190885	`+# ifdef USE_AS_WMEMCMP`
		190885	`+ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx`
		190885	`+ xorl %edx, %edx`
		190885	`+ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx`
		190885	`+ setg %dl`
		190885	`+ leal -1(%rdx, %rdx), %eax`
		190885	`+# else`
		190885	`+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx`
		190885	`+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax`
		190885	`+ subl %ecx, %eax`
		190885	`+# endif`
		190885	`+ VZEROUPPER_RETURN`
		190885	`+`
		190885	`+ /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned. */`
		190885	`+ .p2align 5`
		190885	`+L(8x_return_vec_0_1_2_3):`
		190885	`+ /* Returning from L(more_8x_vec) requires restoring rsi. */`
		190885	`+ addq %rdi, %rsi`
		190885	`+L(return_vec_0_1_2_3):`
		190885	`+ vpmovmskb %ymm1, %eax`
		190885	`+ incl %eax`
		190885	`+ jnz L(return_vec_0)`
		190885
		190885	`-L(last_vec):`
		190885	`- /* Use overlapping loads to avoid branches. */`
		190885	`- leaq -VEC_SIZE(%rdi, %rdx), %rdi`
		190885	`- leaq -VEC_SIZE(%rsi, %rdx), %rsi`
		190885	`- vmovdqu (%rsi), %ymm2`
		190885	`- VPCMPEQ (%rdi), %ymm2, %ymm2`
		190885	`vpmovmskb %ymm2, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- jnz L(first_vec)`
		190885	`+ incl %eax`
		190885	`+ jnz L(return_vec_1)`
		190885	`+`
		190885	`+ vpmovmskb %ymm3, %eax`
		190885	`+ incl %eax`
		190885	`+ jnz L(return_vec_2)`
		190885	`+L(return_vec_3):`
		190885	`+ tzcntl %ecx, %ecx`
		190885	`+# ifdef USE_AS_WMEMCMP`
		190885	`+ movl (VEC_SIZE * 3)(%rdi, %rcx), %eax`
		190885	`+ xorl %edx, %edx`
		190885	`+ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax`
		190885	`+ setg %dl`
		190885	`+ leal -1(%rdx, %rdx), %eax`
		190885	`+# else`
		190885	`+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax`
		190885	`+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx`
		190885	`+ subl %ecx, %eax`
		190885	`+# endif`
		190885	`+ VZEROUPPER_RETURN`
		190885	`+`
		190885	`+ .p2align 4`
		190885	`+L(more_8x_vec):`
		190885	`+ /* Set end of s1 in rdx. */`
		190885	`+ leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx`
		190885	`+ /* rsi stores s2 - s1. This allows loop to only update one`
		190885	`+ pointer. */`
		190885	`+ subq %rdi, %rsi`
		190885	`+ /* Align s1 pointer. */`
		190885	`+ andq $-VEC_SIZE, %rdi`
		190885	`+ /* Adjust because first 4x vec were checked already. */`
		190885	`+ subq $-(VEC_SIZE * 4), %rdi`
		190885	`+ .p2align 4`
		190885	`+L(loop_4x_vec):`
		190885	`+ /* rsi has s2 - s1 so get correct address by adding s1 (in rdi).`
		190885	`+ */`
		190885	`+ vmovdqu (%rsi, %rdi), %ymm1`
		190885	`+ VPCMPEQ (%rdi), %ymm1, %ymm1`
		190885	`+`
		190885	`+ vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2`
		190885	`+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2`
		190885	`+`
		190885	`+ vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3`
		190885	`+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3`
		190885	`+`
		190885	`+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4`
		190885	`+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4`
		190885	`+`
		190885	`+ vpand %ymm1, %ymm2, %ymm5`
		190885	`+ vpand %ymm3, %ymm4, %ymm6`
		190885	`+ vpand %ymm5, %ymm6, %ymm7`
		190885	`+ vpmovmskb %ymm7, %ecx`
		190885	`+ incl %ecx`
		190885	`+ jnz L(8x_return_vec_0_1_2_3)`
		190885	`+ subq $-(VEC_SIZE * 4), %rdi`
		190885	`+ /* Check if s1 pointer at end. */`
		190885	`+ cmpq %rdx, %rdi`
		190885	`+ jb L(loop_4x_vec)`
		190885	`+`
		190885	`+ subq %rdx, %rdi`
		190885	`+ /* rdi has 4 * VEC_SIZE - remaining length. */`
		190885	`+ cmpl $(VEC_SIZE * 3), %edi`
		190885	`+ jae L(8x_last_1x_vec)`
		190885	`+ /* Load regardless of branch. */`
		190885	`+ vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3`
		190885	`+ cmpl $(VEC_SIZE * 2), %edi`
		190885	`+ jae L(8x_last_2x_vec)`
		190885	`+`
		190885	`+ /* Check last 4 VEC. */`
		190885	`+ vmovdqu (%rsi, %rdx), %ymm1`
		190885	`+ VPCMPEQ (%rdx), %ymm1, %ymm1`
		190885	`+`
		190885	`+ vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2`
		190885	`+ VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2`
		190885	`+`
		190885	`+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3`
		190885	`+`
		190885	`+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4`
		190885	`+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4`
		190885	`+`
		190885	`+ vpand %ymm1, %ymm2, %ymm5`
		190885	`+ vpand %ymm3, %ymm4, %ymm6`
		190885	`+ vpand %ymm5, %ymm6, %ymm7`
		190885	`+ vpmovmskb %ymm7, %ecx`
		190885	`+ /* Restore s1 pointer to rdi. */`
		190885	`+ movq %rdx, %rdi`
		190885	`+ incl %ecx`
		190885	`+ jnz L(8x_return_vec_0_1_2_3)`
		190885	`+ /* NB: eax must be zero to reach here. */`
		190885	`+ VZEROUPPER_RETURN`
		190885	`+`
		190885	`+ /* Only entry is from L(more_8x_vec). */`
		190885	`+ .p2align 4`
		190885	`+L(8x_last_2x_vec):`
		190885	`+ /* Check second to last VEC. rdx stores end pointer of s1 and`
		190885	`+ ymm3 has already been loaded with second to last VEC from s2.`
		190885	`+ */`
		190885	`+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3`
		190885	`+ vpmovmskb %ymm3, %eax`
		190885	`+ incl %eax`
		190885	`+ jnz L(8x_return_vec_2)`
		190885	`+ /* Check last VEC. */`
		190885	`+ .p2align 4`
		190885	`+L(8x_last_1x_vec):`
		190885	`+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4`
		190885	`+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4`
		190885	`+ vpmovmskb %ymm4, %eax`
		190885	`+ incl %eax`
		190885	`+ jnz L(8x_return_vec_3)`
		190885	`VZEROUPPER_RETURN`
		190885
		190885	`.p2align 4`
		190885	`-L(first_vec):`
		190885	`- /* A byte or int32 is different within 16 or 32 bytes. */`
		190885	`- tzcntl %eax, %ecx`
		190885	`+L(last_2x_vec):`
		190885	`+ /* Check second to last VEC. */`
		190885	`+ vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1`
		190885	`+ VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1`
		190885	`+ vpmovmskb %ymm1, %eax`
		190885	`+ incl %eax`
		190885	`+ jnz L(return_vec_1_end)`
		190885	`+ /* Check last VEC. */`
		190885	`+L(last_1x_vec):`
		190885	`+ vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1`
		190885	`+ VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1`
		190885	`+ vpmovmskb %ymm1, %eax`
		190885	`+ incl %eax`
		190885	`+ jnz L(return_vec_0_end)`
		190885	`+ VZEROUPPER_RETURN`
		190885	`+`
		190885	`+ .p2align 4`
		190885	`+L(8x_return_vec_2):`
		190885	`+ subq $VEC_SIZE, %rdx`
		190885	`+L(8x_return_vec_3):`
		190885	`+ tzcntl %eax, %eax`
		190885	`+ addq %rdx, %rax`
		190885	`# ifdef USE_AS_WMEMCMP`
		190885	`- xorl %eax, %eax`
		190885	`- movl (%rdi, %rcx), %edx`
		190885	`- cmpl (%rsi, %rcx), %edx`
		190885	`-L(wmemcmp_return):`
		190885	`- setl %al`
		190885	`- negl %eax`
		190885	`- orl $1, %eax`
		190885	`+ movl (VEC_SIZE * 3)(%rax), %ecx`
		190885	`+ xorl %edx, %edx`
		190885	`+ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx`
		190885	`+ setg %dl`
		190885	`+ leal -1(%rdx, %rdx), %eax`
		190885	`# else`
		190885	`- movzbl (%rdi, %rcx), %eax`
		190885	`- movzbl (%rsi, %rcx), %edx`
		190885	`- sub %edx, %eax`
		190885	`+ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx`
		190885	`+ movzbl (VEC_SIZE * 3)(%rax), %eax`
		190885	`+ subl %ecx, %eax`
		190885	`# endif`
		190885	`VZEROUPPER_RETURN`
		190885
		190885	`-# ifdef USE_AS_WMEMCMP`
		190885	`.p2align 4`
		190885	`-L(4):`
		190885	`- xorl %eax, %eax`
		190885	`- movl (%rdi), %edx`
		190885	`- cmpl (%rsi), %edx`
		190885	`- jne L(wmemcmp_return)`
		190885	`- ret`
		190885	`+L(return_vec_1_end):`
		190885	`+ tzcntl %eax, %eax`
		190885	`+ addl %edx, %eax`
		190885	`+# ifdef USE_AS_WMEMCMP`
		190885	`+ movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx`
		190885	`+ xorl %edx, %edx`
		190885	`+ cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx`
		190885	`+ setg %dl`
		190885	`+ leal -1(%rdx, %rdx), %eax`
		190885	`# else`
		190885	`+ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx`
		190885	`+ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax`
		190885	`+ subl %ecx, %eax`
		190885	`+# endif`
		190885	`+ VZEROUPPER_RETURN`
		190885	`+`
		190885	`.p2align 4`
		190885	`-L(between_4_7):`
		190885	`- /* Load as big endian with overlapping movbe to avoid branches. */`
		190885	`- movbe (%rdi), %eax`
		190885	`- movbe (%rsi), %ecx`
		190885	`- shlq $32, %rax`
		190885	`- shlq $32, %rcx`
		190885	`- movbe -4(%rdi, %rdx), %edi`
		190885	`- movbe -4(%rsi, %rdx), %esi`
		190885	`- orq %rdi, %rax`
		190885	`- orq %rsi, %rcx`
		190885	`- subq %rcx, %rax`
		190885	`- je L(exit)`
		190885	`- sbbl %eax, %eax`
		190885	`- orl $1, %eax`
		190885	`- ret`
		190885	`+L(return_vec_0_end):`
		190885	`+ tzcntl %eax, %eax`
		190885	`+ addl %edx, %eax`
		190885	`+# ifdef USE_AS_WMEMCMP`
		190885	`+ movl -VEC_SIZE(%rdi, %rax), %ecx`
		190885	`+ xorl %edx, %edx`
		190885	`+ cmpl -VEC_SIZE(%rsi, %rax), %ecx`
		190885	`+ setg %dl`
		190885	`+ leal -1(%rdx, %rdx), %eax`
		190885	`+# else`
		190885	`+ movzbl -VEC_SIZE(%rsi, %rax), %ecx`
		190885	`+ movzbl -VEC_SIZE(%rdi, %rax), %eax`
		190885	`+ subl %ecx, %eax`
		190885	`+# endif`
		190885	`+ VZEROUPPER_RETURN`
		190885
		190885	`.p2align 4`
		190885	`-L(exit):`
		190885	`- ret`
		190885	`+L(less_vec):`
		190885	`+ /* Check if one or less CHAR. This is necessary for size = 0 but`
		190885	`+ is also faster for size = CHAR_SIZE. */`
		190885	`+ cmpl $CHAR_SIZE, %edx`
		190885	`+ jbe L(one_or_less)`
		190885	`+`
		190885	`+ /* Check if loading one VEC from either s1 or s2 could cause a`
		190885	`+ page cross. This can have false positives but is by far the`
		190885	`+ fastest method. */`
		190885	`+ movl %edi, %eax`
		190885	`+ orl %esi, %eax`
		190885	`+ andl $(PAGE_SIZE - 1), %eax`
		190885	`+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax`
		190885	`+ jg L(page_cross_less_vec)`
		190885	`+`
		190885	`+ /* No page cross possible. */`
		190885	`+ vmovdqu (%rsi), %ymm2`
		190885	`+ VPCMPEQ (%rdi), %ymm2, %ymm2`
		190885	`+ vpmovmskb %ymm2, %eax`
		190885	`+ incl %eax`
		190885	`+ /* Result will be zero if s1 and s2 match. Otherwise first set`
		190885	`+ bit will be first mismatch. */`
		190885	`+ bzhil %edx, %eax, %edx`
		190885	`+ jnz L(return_vec_0)`
		190885	`+ xorl %eax, %eax`
		190885	`+ VZEROUPPER_RETURN`
		190885
		190885	`.p2align 4`
		190885	`-L(between_2_3):`
		190885	`+L(page_cross_less_vec):`
		190885	`+ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28`
		190885	`+ bytes. */`
		190885	`+ cmpl $16, %edx`
		190885	`+ jae L(between_16_31)`
		190885	`+# ifndef USE_AS_WMEMCMP`
		190885	`+ cmpl $8, %edx`
		190885	`+ jae L(between_8_15)`
		190885	`+ cmpl $4, %edx`
		190885	`+ jae L(between_4_7)`
		190885	`+`
		190885	`/* Load as big endian to avoid branches. */`
		190885	`movzwl (%rdi), %eax`
		190885	`movzwl (%rsi), %ecx`
		190885	`@@ -208,223 +439,106 @@ L(between_2_3):`
		190885	`shll $8, %ecx`
		190885	`bswap %eax`
		190885	`bswap %ecx`
		190885	`- movb -1(%rdi, %rdx), %al`
		190885	`- movb -1(%rsi, %rdx), %cl`
		190885	`+ movzbl -1(%rdi, %rdx), %edi`
		190885	`+ movzbl -1(%rsi, %rdx), %esi`
		190885	`+ orl %edi, %eax`
		190885	`+ orl %esi, %ecx`
		190885	`/* Subtraction is okay because the upper 8 bits are zero. */`
		190885	`subl %ecx, %eax`
		190885	`+ /* No ymm register was touched. */`
		190885	`ret`
		190885
		190885	`.p2align 4`
		190885	`-L(1):`
		190885	`- movzbl (%rdi), %eax`
		190885	`+L(one_or_less):`
		190885	`+ jb L(zero)`
		190885	`movzbl (%rsi), %ecx`
		190885	`+ movzbl (%rdi), %eax`
		190885	`subl %ecx, %eax`
		190885	`- ret`
		190885	`-# endif`
		190885	`-`
		190885	`- .p2align 4`
		190885	`-L(zero):`
		190885	`- xorl %eax, %eax`
		190885	`+ /* No ymm register was touched. */`
		190885	`ret`
		190885
		190885	`.p2align 4`
		190885	`-L(less_vec):`
		190885	`-# ifdef USE_AS_WMEMCMP`
		190885	`- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */`
		190885	`- cmpb $4, %dl`
		190885	`- je L(4)`
		190885	`- jb L(zero)`
		190885	`-# else`
		190885	`- cmpb $1, %dl`
		190885	`- je L(1)`
		190885	`- jb L(zero)`
		190885	`- cmpb $4, %dl`
		190885	`- jb L(between_2_3)`
		190885	`- cmpb $8, %dl`
		190885	`- jb L(between_4_7)`
		190885	`+L(between_8_15):`
		190885	`# endif`
		190885	`- cmpb $16, %dl`
		190885	`- jae L(between_16_31)`
		190885	`- /* It is between 8 and 15 bytes. */`
		190885	`+ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */`
		190885	`vmovq (%rdi), %xmm1`
		190885	`vmovq (%rsi), %xmm2`
		190885	`- VPCMPEQ %xmm1, %xmm2, %xmm2`
		190885	`+ VPCMPEQ %xmm1, %xmm2, %xmm2`
		190885	`vpmovmskb %xmm2, %eax`
		190885	`- subl $0xffff, %eax`
		190885	`- jnz L(first_vec)`
		190885	`+ subl $0xffff, %eax`
		190885	`+ jnz L(return_vec_0)`
		190885	`/* Use overlapping loads to avoid branches. */`
		190885	`leaq -8(%rdi, %rdx), %rdi`
		190885	`leaq -8(%rsi, %rdx), %rsi`
		190885	`vmovq (%rdi), %xmm1`
		190885	`vmovq (%rsi), %xmm2`
		190885	`- VPCMPEQ %xmm1, %xmm2, %xmm2`
		190885	`+ VPCMPEQ %xmm1, %xmm2, %xmm2`
		190885	`vpmovmskb %xmm2, %eax`
		190885	`- subl $0xffff, %eax`
		190885	`- jnz L(first_vec)`
		190885	`+ subl $0xffff, %eax`
		190885	`+ jnz L(return_vec_0)`
		190885	`+ /* No ymm register was touched. */`
		190885	`+ ret`
		190885	`+`
		190885	`+ .p2align 4`
		190885	`+L(zero):`
		190885	`+ xorl %eax, %eax`
		190885	`ret`
		190885
		190885	`.p2align 4`
		190885	`L(between_16_31):`
		190885	`/* From 16 to 31 bytes. No branch when size == 16. */`
		190885	`vmovdqu (%rsi), %xmm2`
		190885	`- VPCMPEQ (%rdi), %xmm2, %xmm2`
		190885	`+ VPCMPEQ (%rdi), %xmm2, %xmm2`
		190885	`vpmovmskb %xmm2, %eax`
		190885	`- subl $0xffff, %eax`
		190885	`- jnz L(first_vec)`
		190885	`+ subl $0xffff, %eax`
		190885	`+ jnz L(return_vec_0)`
		190885
		190885	`/* Use overlapping loads to avoid branches. */`
		190885	`+`
		190885	`+ vmovdqu -16(%rsi, %rdx), %xmm2`
		190885	`leaq -16(%rdi, %rdx), %rdi`
		190885	`leaq -16(%rsi, %rdx), %rsi`
		190885	`- vmovdqu (%rsi), %xmm2`
		190885	`- VPCMPEQ (%rdi), %xmm2, %xmm2`
		190885	`+ VPCMPEQ (%rdi), %xmm2, %xmm2`
		190885	`vpmovmskb %xmm2, %eax`
		190885	`- subl $0xffff, %eax`
		190885	`- jnz L(first_vec)`
		190885	`+ subl $0xffff, %eax`
		190885	`+ jnz L(return_vec_0)`
		190885	`+ /* No ymm register was touched. */`
		190885	`ret`
		190885
		190885	`- .p2align 4`
		190885	`-L(more_8x_vec):`
		190885	`- /* More than 8 * VEC. Check the first VEC. */`
		190885	`- vmovdqu (%rsi), %ymm2`
		190885	`- VPCMPEQ (%rdi), %ymm2, %ymm2`
		190885	`- vpmovmskb %ymm2, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- jnz L(first_vec)`
		190885	`-`
		190885	`- /* Align the first memory area for aligned loads in the loop.`
		190885	`- Compute how much the first memory area is misaligned. */`
		190885	`- movq %rdi, %rcx`
		190885	`- andl $(VEC_SIZE - 1), %ecx`
		190885	`- /* Get the negative of offset for alignment. */`
		190885	`- subq $VEC_SIZE, %rcx`
		190885	`- /* Adjust the second memory area. */`
		190885	`- subq %rcx, %rsi`
		190885	`- /* Adjust the first memory area which should be aligned now. */`
		190885	`- subq %rcx, %rdi`
		190885	`- /* Adjust length. */`
		190885	`- addq %rcx, %rdx`
		190885	`-`
		190885	`-L(loop_4x_vec):`
		190885	`- /* Compare 4 * VEC at a time forward. */`
		190885	`- vmovdqu (%rsi), %ymm1`
		190885	`- VPCMPEQ (%rdi), %ymm1, %ymm1`
		190885	`-`
		190885	`- vmovdqu VEC_SIZE(%rsi), %ymm2`
		190885	`- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2`
		190885	`- vpand %ymm2, %ymm1, %ymm5`
		190885	`-`
		190885	`- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3`
		190885	`- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3`
		190885	`- vpand %ymm3, %ymm5, %ymm5`
		190885	`-`
		190885	`- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4`
		190885	`- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4`
		190885	`- vpand %ymm4, %ymm5, %ymm5`
		190885	`-`
		190885	`- vptest %ymm0, %ymm5`
		190885	`- jnc L(4x_vec_end)`
		190885	`-`
		190885	`- addq $(VEC_SIZE * 4), %rdi`
		190885	`- addq $(VEC_SIZE * 4), %rsi`
		190885	`-`
		190885	`- subq $(VEC_SIZE * 4), %rdx`
		190885	`- cmpq $(VEC_SIZE * 4), %rdx`
		190885	`- jae L(loop_4x_vec)`
		190885	`-`
		190885	`- /* Less than 4 * VEC. */`
		190885	`- cmpq $VEC_SIZE, %rdx`
		190885	`- jbe L(last_vec)`
		190885	`- cmpq $(VEC_SIZE * 2), %rdx`
		190885	`- jbe L(last_2x_vec)`
		190885	`-`
		190885	`-L(last_4x_vec):`
		190885	`- /* From 2 * VEC to 4 * VEC. */`
		190885	`- vmovdqu (%rsi), %ymm2`
		190885	`- VPCMPEQ (%rdi), %ymm2, %ymm2`
		190885	`- vpmovmskb %ymm2, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- jnz L(first_vec)`
		190885	`-`
		190885	`- addq $VEC_SIZE, %rdi`
		190885	`- addq $VEC_SIZE, %rsi`
		190885	`- vmovdqu (%rsi), %ymm2`
		190885	`- VPCMPEQ (%rdi), %ymm2, %ymm2`
		190885	`- vpmovmskb %ymm2, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- jnz L(first_vec)`
		190885	`-`
		190885	`- /* Use overlapping loads to avoid branches. */`
		190885	`- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi`
		190885	`- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi`
		190885	`- vmovdqu (%rsi), %ymm2`
		190885	`- VPCMPEQ (%rdi), %ymm2, %ymm2`
		190885	`- vpmovmskb %ymm2, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- jnz L(first_vec)`
		190885	`-`
		190885	`- addq $VEC_SIZE, %rdi`
		190885	`- addq $VEC_SIZE, %rsi`
		190885	`- vmovdqu (%rsi), %ymm2`
		190885	`- VPCMPEQ (%rdi), %ymm2, %ymm2`
		190885	`- vpmovmskb %ymm2, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- jnz L(first_vec)`
		190885	`- VZEROUPPER_RETURN`
		190885	`-`
		190885	`- .p2align 4`
		190885	`-L(4x_vec_end):`
		190885	`- vpmovmskb %ymm1, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- jnz L(first_vec)`
		190885	`- vpmovmskb %ymm2, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- jnz L(first_vec_x1)`
		190885	`- vpmovmskb %ymm3, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- jnz L(first_vec_x2)`
		190885	`- vpmovmskb %ymm4, %eax`
		190885	`- subl $VEC_MASK, %eax`
		190885	`- tzcntl %eax, %ecx`
		190885	`# ifdef USE_AS_WMEMCMP`
		190885	`- xorl %eax, %eax`
		190885	`- movl (VEC_SIZE * 3)(%rdi, %rcx), %edx`
		190885	`- cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx`
		190885	`- jmp L(wmemcmp_return)`
		190885	`-# else`
		190885	`- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax`
		190885	`- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx`
		190885	`- sub %edx, %eax`
		190885	`-# endif`
		190885	`- VZEROUPPER_RETURN`
		190885	`-`
		190885	`.p2align 4`
		190885	`-L(first_vec_x1):`
		190885	`- tzcntl %eax, %ecx`
		190885	`-# ifdef USE_AS_WMEMCMP`
		190885	`- xorl %eax, %eax`
		190885	`- movl VEC_SIZE(%rdi, %rcx), %edx`
		190885	`- cmpl VEC_SIZE(%rsi, %rcx), %edx`
		190885	`- jmp L(wmemcmp_return)`
		190885	`+L(one_or_less):`
		190885	`+ jb L(zero)`
		190885	`+ movl (%rdi), %ecx`
		190885	`+ xorl %edx, %edx`
		190885	`+ cmpl (%rsi), %ecx`
		190885	`+ je L(zero)`
		190885	`+ setg %dl`
		190885	`+ leal -1(%rdx, %rdx), %eax`
		190885	`+ /* No ymm register was touched. */`
		190885	`+ ret`
		190885	`# else`
		190885	`- movzbl VEC_SIZE(%rdi, %rcx), %eax`
		190885	`- movzbl VEC_SIZE(%rsi, %rcx), %edx`
		190885	`- sub %edx, %eax`
		190885	`-# endif`
		190885	`- VZEROUPPER_RETURN`
		190885
		190885	`.p2align 4`
		190885	`-L(first_vec_x2):`
		190885	`- tzcntl %eax, %ecx`
		190885	`-# ifdef USE_AS_WMEMCMP`
		190885	`- xorl %eax, %eax`
		190885	`- movl (VEC_SIZE * 2)(%rdi, %rcx), %edx`
		190885	`- cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx`
		190885	`- jmp L(wmemcmp_return)`
		190885	`-# else`
		190885	`- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax`
		190885	`- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx`
		190885	`- sub %edx, %eax`
		190885	`+L(between_4_7):`
		190885	`+ /* Load as big endian with overlapping movbe to avoid branches.`
		190885	`+ */`
		190885	`+ movbe (%rdi), %eax`
		190885	`+ movbe (%rsi), %ecx`
		190885	`+ shlq $32, %rax`
		190885	`+ shlq $32, %rcx`
		190885	`+ movbe -4(%rdi, %rdx), %edi`
		190885	`+ movbe -4(%rsi, %rdx), %esi`
		190885	`+ orq %rdi, %rax`
		190885	`+ orq %rsi, %rcx`
		190885	`+ subq %rcx, %rax`
		190885	`+ jz L(zero_4_7)`
		190885	`+ sbbl %eax, %eax`
		190885	`+ orl $1, %eax`
		190885	`+L(zero_4_7):`
		190885	`+ /* No ymm register was touched. */`
		190885	`+ ret`
		190885	`# endif`
		190885	`- VZEROUPPER_RETURN`
		190885	`+`
		190885	`END (MEMCMP)`
		190885	`#endif`
		190885	`--`
		190885	`GitLab`
		190885

rpms / glibc

Source Code

Blame SOURCES/ia-opt-memcmp-avx2-movbe.patch