08c3a6
commit 5997011826b7bbb7015f56bf143a6e4fd0f5a7df
08c3a6
Author: Noah Goldstein <goldstein.w.n@gmail.com>
08c3a6
Date:   Wed Mar 23 16:57:36 2022 -0500
08c3a6
08c3a6
    x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
08c3a6
    
08c3a6
    Slightly faster method of doing TOLOWER that saves an
08c3a6
    instruction.
08c3a6
    
08c3a6
    Also replace the hard coded 5-byte NOP with .p2align 4. On builds with
08c3a6
    CET enabled this misaligned the entry to strcasecmp.
08c3a6
    
08c3a6
    geometric_mean(N=40) of all benchmarks New / Original: .894
08c3a6
    
08c3a6
    All string/memory tests pass.
08c3a6
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
08c3a6
    
08c3a6
    (cherry picked from commit 670b54bc585ea4a94f3b2e9272ba44aa6b730b73)
08c3a6
08c3a6
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
08c3a6
index 7f8a1bc756f86aee..ca70b540eb2dd190 100644
08c3a6
--- a/sysdeps/x86_64/strcmp.S
08c3a6
+++ b/sysdeps/x86_64/strcmp.S
08c3a6
@@ -78,9 +78,8 @@ ENTRY2 (__strcasecmp)
08c3a6
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
08c3a6
 	mov	%fs:(%rax),%RDX_LP
08c3a6
 
08c3a6
-	// XXX 5 byte should be before the function
08c3a6
-	/* 5-byte NOP.  */
08c3a6
-	.byte	0x0f,0x1f,0x44,0x00,0x00
08c3a6
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
08c3a6
+	.p2align 4
08c3a6
 END2 (__strcasecmp)
08c3a6
 # ifndef NO_NOLOCALE_ALIAS
08c3a6
 weak_alias (__strcasecmp, strcasecmp)
08c3a6
@@ -97,9 +96,8 @@ ENTRY2 (__strncasecmp)
08c3a6
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
08c3a6
 	mov	%fs:(%rax),%RCX_LP
08c3a6
 
08c3a6
-	// XXX 5 byte should be before the function
08c3a6
-	/* 5-byte NOP.  */
08c3a6
-	.byte	0x0f,0x1f,0x44,0x00,0x00
08c3a6
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
08c3a6
+	.p2align 4
08c3a6
 END2 (__strncasecmp)
08c3a6
 # ifndef NO_NOLOCALE_ALIAS
08c3a6
 weak_alias (__strncasecmp, strncasecmp)
08c3a6
@@ -149,22 +147,22 @@ ENTRY (STRCMP)
08c3a6
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
08c3a6
 	.section .rodata.cst16,"aM",@progbits,16
08c3a6
 	.align 16
08c3a6
-.Lbelowupper:
08c3a6
-	.quad	0x4040404040404040
08c3a6
-	.quad	0x4040404040404040
08c3a6
-.Ltopupper:
08c3a6
-	.quad	0x5b5b5b5b5b5b5b5b
08c3a6
-	.quad	0x5b5b5b5b5b5b5b5b
08c3a6
-.Ltouppermask:
08c3a6
+.Llcase_min:
08c3a6
+	.quad	0x3f3f3f3f3f3f3f3f
08c3a6
+	.quad	0x3f3f3f3f3f3f3f3f
08c3a6
+.Llcase_max:
08c3a6
+	.quad	0x9999999999999999
08c3a6
+	.quad	0x9999999999999999
08c3a6
+.Lcase_add:
08c3a6
 	.quad	0x2020202020202020
08c3a6
 	.quad	0x2020202020202020
08c3a6
 	.previous
08c3a6
-	movdqa	.Lbelowupper(%rip), %xmm5
08c3a6
-# define UCLOW_reg %xmm5
08c3a6
-	movdqa	.Ltopupper(%rip), %xmm6
08c3a6
-# define UCHIGH_reg %xmm6
08c3a6
-	movdqa	.Ltouppermask(%rip), %xmm7
08c3a6
-# define LCQWORD_reg %xmm7
08c3a6
+	movdqa	.Llcase_min(%rip), %xmm5
08c3a6
+# define LCASE_MIN_reg %xmm5
08c3a6
+	movdqa	.Llcase_max(%rip), %xmm6
08c3a6
+# define LCASE_MAX_reg %xmm6
08c3a6
+	movdqa	.Lcase_add(%rip), %xmm7
08c3a6
+# define CASE_ADD_reg %xmm7
08c3a6
 #endif
08c3a6
 	cmp	$0x30, %ecx
08c3a6
 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
08c3a6
@@ -175,22 +173,18 @@ ENTRY (STRCMP)
08c3a6
 	movhpd	8(%rdi), %xmm1
08c3a6
 	movhpd	8(%rsi), %xmm2
08c3a6
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
08c3a6
-# define TOLOWER(reg1, reg2) \
08c3a6
-	movdqa	reg1, %xmm8;					\
08c3a6
-	movdqa	UCHIGH_reg, %xmm9;				\
08c3a6
-	movdqa	reg2, %xmm10;					\
08c3a6
-	movdqa	UCHIGH_reg, %xmm11;				\
08c3a6
-	pcmpgtb	UCLOW_reg, %xmm8;				\
08c3a6
-	pcmpgtb	reg1, %xmm9;					\
08c3a6
-	pcmpgtb	UCLOW_reg, %xmm10;				\
08c3a6
-	pcmpgtb	reg2, %xmm11;					\
08c3a6
-	pand	%xmm9, %xmm8;					\
08c3a6
-	pand	%xmm11, %xmm10;					\
08c3a6
-	pand	LCQWORD_reg, %xmm8;				\
08c3a6
-	pand	LCQWORD_reg, %xmm10;				\
08c3a6
-	por	%xmm8, reg1;					\
08c3a6
-	por	%xmm10, reg2
08c3a6
-	TOLOWER (%xmm1, %xmm2)
08c3a6
+#  define TOLOWER(reg1, reg2) \
08c3a6
+	movdqa	LCASE_MIN_reg, %xmm8;					\
08c3a6
+	movdqa	LCASE_MIN_reg, %xmm9;					\
08c3a6
+	paddb	reg1, %xmm8;					\
08c3a6
+	paddb	reg2, %xmm9;					\
08c3a6
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
08c3a6
+	pcmpgtb	LCASE_MAX_reg, %xmm9;				\
08c3a6
+	pandn	CASE_ADD_reg, %xmm8;					\
08c3a6
+	pandn	CASE_ADD_reg, %xmm9;					\
08c3a6
+	paddb	%xmm8, reg1;					\
08c3a6
+	paddb	%xmm9, reg2
08c3a6
+	TOLOWER	(%xmm1, %xmm2)
08c3a6
 #else
08c3a6
 # define TOLOWER(reg1, reg2)
08c3a6
 #endif