Blame SOURCES/ia-opt-strxcasecmp-srtcmp-sse42.patch

513694
From 371154789e234ff53a97adfc92355a3871f66847 Mon Sep 17 00:00:00 2001
513694
From: Noah Goldstein <goldstein.w.n@gmail.com>
513694
Date: Wed, 23 Mar 2022 16:57:38 -0500
513694
Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
513694
513694
Slightly faster method of doing TOLOWER that saves an
513694
instruction.
513694
513694
Also replace the hard-coded 5-byte nop with .p2align 4. On builds with
513694
CET enabled, this misaligned the entry to strcasecmp.
513694
513694
geometric_mean(N=40) of all benchmarks New / Original: .920
513694
513694
All string/memory tests pass.
513694
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
513694
513694
(cherry picked from commit d154758e618ec9324f5d339c46db0aa27e8b1226)
513694
---
513694
 sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
513694
 1 file changed, 35 insertions(+), 48 deletions(-)
513694
513694
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
513694
index d8fdeb3a..59e8ddfc 100644
513694
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
513694
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
513694
@@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp))
513694
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
513694
 	mov	%fs:(%rax),%RDX_LP
513694
 
513694
-	// XXX 5 byte should be before the function
513694
-	/* 5-byte NOP.  */
513694
-	.byte	0x0f,0x1f,0x44,0x00,0x00
513694
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
513694
+	.p2align 4
513694
 END (GLABEL(__strcasecmp))
513694
 	/* FALLTHROUGH to strcasecmp_l.  */
513694
 #endif
513694
@@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp))
513694
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
513694
 	mov	%fs:(%rax),%RCX_LP
513694
 
513694
-	// XXX 5 byte should be before the function
513694
-	/* 5-byte NOP.  */
513694
-	.byte	0x0f,0x1f,0x44,0x00,0x00
513694
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
513694
+	.p2align 4
513694
 END (GLABEL(__strncasecmp))
513694
 	/* FALLTHROUGH to strncasecmp_l.  */
513694
 #endif
513694
@@ -170,27 +168,22 @@ STRCMP_SSE42:
513694
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
513694
 	.section .rodata.cst16,"aM",@progbits,16
513694
 	.align 16
513694
-LABEL(belowupper):
513694
-	.quad	0x4040404040404040
513694
-	.quad	0x4040404040404040
513694
-LABEL(topupper):
513694
-# ifdef USE_AVX
513694
-	.quad	0x5a5a5a5a5a5a5a5a
513694
-	.quad	0x5a5a5a5a5a5a5a5a
513694
-# else
513694
-	.quad	0x5b5b5b5b5b5b5b5b
513694
-	.quad	0x5b5b5b5b5b5b5b5b
513694
-# endif
513694
-LABEL(touppermask):
513694
+LABEL(lcase_min):
513694
+	.quad	0x3f3f3f3f3f3f3f3f
513694
+	.quad	0x3f3f3f3f3f3f3f3f
513694
+LABEL(lcase_max):
513694
+	.quad	0x9999999999999999
513694
+	.quad	0x9999999999999999
513694
+LABEL(case_add):
513694
 	.quad	0x2020202020202020
513694
 	.quad	0x2020202020202020
513694
 	.previous
513694
-	movdqa	LABEL(belowupper)(%rip), %xmm4
513694
-# define UCLOW_reg %xmm4
513694
-	movdqa	LABEL(topupper)(%rip), %xmm5
513694
-# define UCHIGH_reg %xmm5
513694
-	movdqa	LABEL(touppermask)(%rip), %xmm6
513694
-# define LCQWORD_reg %xmm6
513694
+	movdqa	LABEL(lcase_min)(%rip), %xmm4
513694
+# define LCASE_MIN_reg %xmm4
513694
+	movdqa	LABEL(lcase_max)(%rip), %xmm5
513694
+# define LCASE_MAX_reg %xmm5
513694
+	movdqa	LABEL(case_add)(%rip), %xmm6
513694
+# define CASE_ADD_reg %xmm6
513694
 #endif
513694
 	cmp	$0x30, %ecx
513694
 	ja	LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
513694
@@ -201,32 +194,26 @@ LABEL(touppermask):
513694
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
513694
 # ifdef USE_AVX
513694
 #  define TOLOWER(reg1, reg2) \
513694
-	vpcmpgtb UCLOW_reg, reg1, %xmm7;			\
513694
-	vpcmpgtb UCHIGH_reg, reg1, %xmm8;			\
513694
-	vpcmpgtb UCLOW_reg, reg2, %xmm9;			\
513694
-	vpcmpgtb UCHIGH_reg, reg2, %xmm10;			\
513694
-	vpandn	%xmm7, %xmm8, %xmm8;					\
513694
-	vpandn	%xmm9, %xmm10, %xmm10;					\
513694
-	vpand	LCQWORD_reg, %xmm8, %xmm8;				\
513694
-	vpand	LCQWORD_reg, %xmm10, %xmm10;				\
513694
-	vpor	reg1, %xmm8, reg1;					\
513694
-	vpor	reg2, %xmm10, reg2
513694
+	vpaddb	LCASE_MIN_reg, reg1, %xmm7;					\
513694
+	vpaddb	LCASE_MIN_reg, reg2, %xmm8;					\
513694
+	vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;					\
513694
+	vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;					\
513694
+	vpandn	CASE_ADD_reg, %xmm7, %xmm7;					\
513694
+	vpandn	CASE_ADD_reg, %xmm8, %xmm8;					\
513694
+	vpaddb	%xmm7, reg1, reg1;					\
513694
+	vpaddb	%xmm8, reg2, reg2
513694
 # else
513694
 #  define TOLOWER(reg1, reg2) \
513694
-	movdqa	reg1, %xmm7;					\
513694
-	movdqa	UCHIGH_reg, %xmm8;				\
513694
-	movdqa	reg2, %xmm9;					\
513694
-	movdqa	UCHIGH_reg, %xmm10;				\
513694
-	pcmpgtb	UCLOW_reg, %xmm7;				\
513694
-	pcmpgtb	reg1, %xmm8;					\
513694
-	pcmpgtb	UCLOW_reg, %xmm9;				\
513694
-	pcmpgtb	reg2, %xmm10;					\
513694
-	pand	%xmm8, %xmm7;					\
513694
-	pand	%xmm10, %xmm9;					\
513694
-	pand	LCQWORD_reg, %xmm7;				\
513694
-	pand	LCQWORD_reg, %xmm9;				\
513694
-	por	%xmm7, reg1;					\
513694
-	por	%xmm9, reg2
513694
+	movdqa	LCASE_MIN_reg, %xmm7;					\
513694
+	movdqa	LCASE_MIN_reg, %xmm8;					\
513694
+	paddb	reg1, %xmm7;					\
513694
+	paddb	reg2, %xmm8;					\
513694
+	pcmpgtb	LCASE_MAX_reg, %xmm7;				\
513694
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
513694
+	pandn	CASE_ADD_reg, %xmm7;					\
513694
+	pandn	CASE_ADD_reg, %xmm8;					\
513694
+	paddb	%xmm7, reg1;					\
513694
+	paddb	%xmm8, reg2
513694
 # endif
513694
 	TOLOWER (%xmm1, %xmm2)
513694
 #else
513694
-- 
513694
GitLab
513694