Blame SOURCES/ia-refacto-imp-prf-strchr-avx2.patch

190885
From 43847e49c2dab633146c9b6c682ed5768ccda7cd Mon Sep 17 00:00:00 2001
190885
From: noah <goldstein.w.n@gmail.com>
190885
Date: Wed, 3 Feb 2021 00:38:59 -0500
190885
Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S
190885
190885
No bug. Just seemed the performance could be improved a bit. Observed
190885
and expected behavior are unchanged. Optimized body of main
190885
loop. Updated page cross logic and optimized accordingly. Made a few
190885
minor instruction selection modifications. No regressions in test
190885
suite. Both test-strchrnul and test-strchr passed.
190885
190885
(cherry picked from commit 1f745ecc2109890886b161d4791e1406fdfc29b8)
190885
---
190885
 sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
190885
 sysdeps/x86_64/multiarch/strchr.c      |   4 +-
190885
 2 files changed, 114 insertions(+), 115 deletions(-)
190885
190885
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
190885
index da7d2620..919d256c 100644
190885
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
190885
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
190885
@@ -27,10 +27,12 @@
190885
 # ifdef USE_AS_WCSCHR
190885
 #  define VPBROADCAST	vpbroadcastd
190885
 #  define VPCMPEQ	vpcmpeqd
190885
+#  define VPMINU	vpminud
190885
 #  define CHAR_REG	esi
190885
 # else
190885
 #  define VPBROADCAST	vpbroadcastb
190885
 #  define VPCMPEQ	vpcmpeqb
190885
+#  define VPMINU	vpminub
190885
 #  define CHAR_REG	sil
190885
 # endif
190885
 
190885
@@ -43,71 +45,54 @@
190885
 # endif
190885
 
190885
 # define VEC_SIZE 32
190885
+# define PAGE_SIZE 4096
190885
 
190885
 	.section SECTION(.text),"ax",@progbits
190885
 ENTRY (STRCHR)
190885
 	movl	%edi, %ecx
190885
-	/* Broadcast CHAR to YMM0.  */
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	xorl	%edx, %edx
190885
+# endif
190885
+
190885
+	/* Broadcast CHAR to YMM0.	*/
190885
 	vmovd	%esi, %xmm0
190885
 	vpxor	%xmm9, %xmm9, %xmm9
190885
 	VPBROADCAST %xmm0, %ymm0
190885
-	/* Check if we may cross page boundary with one vector load.  */
190885
-	andl	$(2 * VEC_SIZE - 1), %ecx
190885
-	cmpl	$VEC_SIZE, %ecx
190885
-	ja	L(cros_page_boundary)
190885
 
190885
-	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
190885
-	   null byte.  */
190885
-	vmovdqu	(%rdi), %ymm8
190885
-	VPCMPEQ %ymm8, %ymm0, %ymm1
190885
-	VPCMPEQ %ymm8, %ymm9, %ymm2
190885
-	vpor	%ymm1, %ymm2, %ymm1
190885
-	vpmovmskb %ymm1, %eax
190885
-	testl	%eax, %eax
190885
-	jnz	L(first_vec_x0)
190885
+	/* Check if we cross page boundary with one vector load.  */
190885
+	andl	$(PAGE_SIZE - 1), %ecx
190885
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
190885
+	ja  L(cross_page_boundary)
190885
 
190885
-	/* Align data for aligned loads in the loop.  */
190885
-	addq	$VEC_SIZE, %rdi
190885
-	andl	$(VEC_SIZE - 1), %ecx
190885
-	andq	$-VEC_SIZE, %rdi
190885
-
190885
-	jmp	L(more_4x_vec)
190885
-
190885
-	.p2align 4
190885
-L(cros_page_boundary):
190885
-	andl	$(VEC_SIZE - 1), %ecx
190885
-	andq	$-VEC_SIZE, %rdi
190885
+	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
190885
+	   null byte.  */
190885
 	vmovdqu	(%rdi), %ymm8
190885
 	VPCMPEQ %ymm8, %ymm0, %ymm1
190885
 	VPCMPEQ %ymm8, %ymm9, %ymm2
190885
 	vpor	%ymm1, %ymm2, %ymm1
190885
 	vpmovmskb %ymm1, %eax
190885
-	/* Remove the leading bytes.  */
190885
-	sarl	%cl, %eax
190885
 	testl	%eax, %eax
190885
-	jz	L(aligned_more)
190885
-	/* Found CHAR or the null byte.  */
190885
+	jz	L(more_vecs)
190885
 	tzcntl	%eax, %eax
190885
-	addq	%rcx, %rax
190885
-# ifdef USE_AS_STRCHRNUL
190885
+	/* Found CHAR or the null byte.	 */
190885
 	addq	%rdi, %rax
190885
-# else
190885
-	xorl	%edx, %edx
190885
-	leaq	(%rdi, %rax), %rax
190885
-	cmp	(%rax), %CHAR_REG
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
 	cmovne	%rdx, %rax
190885
 # endif
190885
 L(return_vzeroupper):
190885
 	ZERO_UPPER_VEC_REGISTERS_RETURN
190885
 
190885
 	.p2align 4
190885
+L(more_vecs):
190885
+	/* Align data for aligned loads in the loop.  */
190885
+	andq	$-VEC_SIZE, %rdi
190885
 L(aligned_more):
190885
-	addq	$VEC_SIZE, %rdi
190885
 
190885
-L(more_4x_vec):
190885
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
190885
-	   since data is only aligned to VEC_SIZE.  */
190885
-	vmovdqa	(%rdi), %ymm8
190885
+	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
190885
+	   since data is only aligned to VEC_SIZE.	*/
190885
+	vmovdqa	VEC_SIZE(%rdi), %ymm8
190885
+	addq	$VEC_SIZE, %rdi
190885
 	VPCMPEQ %ymm8, %ymm0, %ymm1
190885
 	VPCMPEQ %ymm8, %ymm9, %ymm2
190885
 	vpor	%ymm1, %ymm2, %ymm1
190885
@@ -137,61 +122,24 @@ L(more_4x_vec):
190885
 	vpor	%ymm1, %ymm2, %ymm1
190885
 	vpmovmskb %ymm1, %eax
190885
 	testl	%eax, %eax
190885
-	jnz	L(first_vec_x3)
190885
-
190885
-	addq	$(VEC_SIZE * 4), %rdi
190885
-
190885
-	/* Align data to 4 * VEC_SIZE.  */
190885
-	movq	%rdi, %rcx
190885
-	andl	$(4 * VEC_SIZE - 1), %ecx
190885
-	andq	$-(4 * VEC_SIZE), %rdi
190885
-
190885
-	.p2align 4
190885
-L(loop_4x_vec):
190885
-	/* Compare 4 * VEC at a time forward.  */
190885
-	vmovdqa	(%rdi), %ymm5
190885
-	vmovdqa	VEC_SIZE(%rdi), %ymm6
190885
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
190885
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
190885
-
190885
-	VPCMPEQ %ymm5, %ymm0, %ymm1
190885
-	VPCMPEQ %ymm6, %ymm0, %ymm2
190885
-	VPCMPEQ %ymm7, %ymm0, %ymm3
190885
-	VPCMPEQ %ymm8, %ymm0, %ymm4
190885
-
190885
-	VPCMPEQ %ymm5, %ymm9, %ymm5
190885
-	VPCMPEQ %ymm6, %ymm9, %ymm6
190885
-	VPCMPEQ %ymm7, %ymm9, %ymm7
190885
-	VPCMPEQ %ymm8, %ymm9, %ymm8
190885
-
190885
-	vpor	%ymm1, %ymm5, %ymm1
190885
-	vpor	%ymm2, %ymm6, %ymm2
190885
-	vpor	%ymm3, %ymm7, %ymm3
190885
-	vpor	%ymm4, %ymm8, %ymm4
190885
-
190885
-	vpor	%ymm1, %ymm2, %ymm5
190885
-	vpor	%ymm3, %ymm4, %ymm6
190885
-
190885
-	vpor	%ymm5, %ymm6, %ymm5
190885
-
190885
-	vpmovmskb %ymm5, %eax
190885
-	testl	%eax, %eax
190885
-	jnz	L(4x_vec_end)
190885
-
190885
-	addq	$(VEC_SIZE * 4), %rdi
190885
+	jz	L(prep_loop_4x)
190885
 
190885
-	jmp	L(loop_4x_vec)
190885
+	tzcntl	%eax, %eax
190885
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
+	cmovne	%rdx, %rax
190885
+# endif
190885
+	VZEROUPPER
190885
+	ret
190885
 
190885
 	.p2align 4
190885
 L(first_vec_x0):
190885
-	/* Found CHAR or the null byte.  */
190885
 	tzcntl	%eax, %eax
190885
-# ifdef USE_AS_STRCHRNUL
190885
+	/* Found CHAR or the null byte.	 */
190885
 	addq	%rdi, %rax
190885
-# else
190885
-	xorl	%edx, %edx
190885
-	leaq	(%rdi, %rax), %rax
190885
-	cmp	(%rax), %CHAR_REG
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
 	cmovne	%rdx, %rax
190885
 # endif
190885
 	VZEROUPPER_RETURN
190885
@@ -199,13 +147,9 @@ L(first_vec_x0):
190885
 	.p2align 4
190885
 L(first_vec_x1):
190885
 	tzcntl	%eax, %eax
190885
-# ifdef USE_AS_STRCHRNUL
190885
-	addq	$VEC_SIZE, %rax
190885
-	addq	%rdi, %rax
190885
-# else
190885
-	xorl	%edx, %edx
190885
 	leaq	VEC_SIZE(%rdi, %rax), %rax
190885
-	cmp	(%rax), %CHAR_REG
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
 	cmovne	%rdx, %rax
190885
 # endif
190885
 	VZEROUPPER_RETURN
190885
@@ -213,42 +157,97 @@ L(first_vec_x1):
190885
 	.p2align 4
190885
 L(first_vec_x2):
190885
 	tzcntl	%eax, %eax
190885
-# ifdef USE_AS_STRCHRNUL
190885
-	addq	$(VEC_SIZE * 2), %rax
190885
-	addq	%rdi, %rax
190885
-# else
190885
-	xorl	%edx, %edx
190885
+	/* Found CHAR or the null byte.	 */
190885
 	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
190885
-	cmp	(%rax), %CHAR_REG
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
 	cmovne	%rdx, %rax
190885
 # endif
190885
 	VZEROUPPER_RETURN
190885
 
190885
+L(prep_loop_4x):
190885
+	/* Align data to 4 * VEC_SIZE.	*/
190885
+	andq	$-(VEC_SIZE * 4), %rdi
190885
+
190885
 	.p2align 4
190885
-L(4x_vec_end):
190885
+L(loop_4x_vec):
190885
+	/* Compare 4 * VEC at a time forward.  */
190885
+	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
190885
+	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
190885
+	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
190885
+	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
190885
+
190885
+	/* Leaves only CHARS matching esi as 0.	 */
190885
+	vpxor	%ymm5, %ymm0, %ymm1
190885
+	vpxor	%ymm6, %ymm0, %ymm2
190885
+	vpxor	%ymm7, %ymm0, %ymm3
190885
+	vpxor	%ymm8, %ymm0, %ymm4
190885
+
190885
+	VPMINU	%ymm1, %ymm5, %ymm1
190885
+	VPMINU	%ymm2, %ymm6, %ymm2
190885
+	VPMINU	%ymm3, %ymm7, %ymm3
190885
+	VPMINU	%ymm4, %ymm8, %ymm4
190885
+
190885
+	VPMINU	%ymm1, %ymm2, %ymm5
190885
+	VPMINU	%ymm3, %ymm4, %ymm6
190885
+
190885
+	VPMINU	%ymm5, %ymm6, %ymm5
190885
+
190885
+	VPCMPEQ %ymm5, %ymm9, %ymm5
190885
+	vpmovmskb %ymm5, %eax
190885
+
190885
+	addq	$(VEC_SIZE * 4), %rdi
190885
+	testl	%eax, %eax
190885
+	jz  L(loop_4x_vec)
190885
+
190885
+	VPCMPEQ %ymm1, %ymm9, %ymm1
190885
 	vpmovmskb %ymm1, %eax
190885
 	testl	%eax, %eax
190885
 	jnz	L(first_vec_x0)
190885
+
190885
+	VPCMPEQ %ymm2, %ymm9, %ymm2
190885
 	vpmovmskb %ymm2, %eax
190885
 	testl	%eax, %eax
190885
 	jnz	L(first_vec_x1)
190885
-	vpmovmskb %ymm3, %eax
190885
-	testl	%eax, %eax
190885
-	jnz	L(first_vec_x2)
190885
+
190885
+	VPCMPEQ %ymm3, %ymm9, %ymm3
190885
+	VPCMPEQ %ymm4, %ymm9, %ymm4
190885
+	vpmovmskb %ymm3, %ecx
190885
 	vpmovmskb %ymm4, %eax
190885
+	salq	$32, %rax
190885
+	orq %rcx, %rax
190885
+	tzcntq  %rax, %rax
190885
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
+	cmovne	%rdx, %rax
190885
+# endif
190885
+	VZEROUPPER
190885
+	ret
190885
+
190885
+	/* Cold case for crossing page with first load.	 */
190885
+	.p2align 4
190885
+L(cross_page_boundary):
190885
+	andq	$-VEC_SIZE, %rdi
190885
+	andl	$(VEC_SIZE - 1), %ecx
190885
+
190885
+	vmovdqa	(%rdi), %ymm8
190885
+	VPCMPEQ %ymm8, %ymm0, %ymm1
190885
+	VPCMPEQ %ymm8, %ymm9, %ymm2
190885
+	vpor	%ymm1, %ymm2, %ymm1
190885
+	vpmovmskb %ymm1, %eax
190885
+	/* Remove the leading bits.	 */
190885
+	sarxl	%ecx, %eax, %eax
190885
 	testl	%eax, %eax
190885
-L(first_vec_x3):
190885
+	jz	L(aligned_more)
190885
 	tzcntl	%eax, %eax
190885
-# ifdef USE_AS_STRCHRNUL
190885
-	addq	$(VEC_SIZE * 3), %rax
190885
+	addq	%rcx, %rdi
190885
 	addq	%rdi, %rax
190885
-# else
190885
-	xorl	%edx, %edx
190885
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
190885
-	cmp	(%rax), %CHAR_REG
190885
+# ifndef USE_AS_STRCHRNUL
190885
+	cmp (%rax), %CHAR_REG
190885
 	cmovne	%rdx, %rax
190885
 # endif
190885
 	VZEROUPPER_RETURN
190885
 
190885
 END (STRCHR)
190885
-#endif
190885
+# endif
190885
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
190885
index 7e582f02..5225bd4f 100644
190885
--- a/sysdeps/x86_64/multiarch/strchr.c
190885
+++ b/sysdeps/x86_64/multiarch/strchr.c
190885
@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void)
190885
   const struct cpu_features* cpu_features = __get_cpu_features ();
190885
 
190885
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
190885
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
190885
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
190885
     {
190885
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
190885
-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
190885
-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
190885
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
190885
 	return OPTIMIZE (evex);
190885
 
190885
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
190885
-- 
190885
GitLab
190885