|
|
513694 |
From 017773f93b0e41f3b164e5db86d0c7b7f75675e9 Mon Sep 17 00:00:00 2001
|
|
|
513694 |
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
|
513694 |
Date: Wed, 23 Mar 2022 16:57:36 -0500
|
|
|
513694 |
Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
|
|
|
513694 |
|
|
|
513694 |
Slightly faster method of doing TOLOWER that saves an
|
|
|
513694 |
instruction.
|
|
|
513694 |
|
|
|
513694 |
Also replace the hard-coded 5-byte nop with .p2align 4. On builds with
|
|
|
513694 |
CET enabled, this misaligned the entry to strcasecmp.
|
|
|
513694 |
|
|
|
513694 |
geometric_mean(N=40) of all benchmarks New / Original: .894
|
|
|
513694 |
|
|
|
513694 |
All string/memory tests pass.
|
|
|
513694 |
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
513694 |
|
|
|
513694 |
(cherry picked from commit 670b54bc585ea4a94f3b2e9272ba44aa6b730b73)
|
|
|
513694 |
---
|
|
|
513694 |
sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
|
|
|
513694 |
1 file changed, 29 insertions(+), 35 deletions(-)
|
|
|
513694 |
|
|
|
513694 |
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
|
|
|
513694 |
index aa6df898..f454ce5b 100644
|
|
|
513694 |
--- a/sysdeps/x86_64/strcmp.S
|
|
|
513694 |
+++ b/sysdeps/x86_64/strcmp.S
|
|
|
513694 |
@@ -78,9 +78,8 @@ ENTRY2 (__strcasecmp)
|
|
|
513694 |
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
|
|
|
513694 |
mov %fs:(%rax),%RDX_LP
|
|
|
513694 |
|
|
|
513694 |
- // XXX 5 byte should be before the function
|
|
|
513694 |
- /* 5-byte NOP. */
|
|
|
513694 |
- .byte 0x0f,0x1f,0x44,0x00,0x00
|
|
|
513694 |
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
|
|
|
513694 |
+ .p2align 4
|
|
|
513694 |
END2 (__strcasecmp)
|
|
|
513694 |
# ifndef NO_NOLOCALE_ALIAS
|
|
|
513694 |
weak_alias (__strcasecmp, strcasecmp)
|
|
|
513694 |
@@ -97,9 +96,8 @@ ENTRY2 (__strncasecmp)
|
|
|
513694 |
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
|
|
|
513694 |
mov %fs:(%rax),%RCX_LP
|
|
|
513694 |
|
|
|
513694 |
- // XXX 5 byte should be before the function
|
|
|
513694 |
- /* 5-byte NOP. */
|
|
|
513694 |
- .byte 0x0f,0x1f,0x44,0x00,0x00
|
|
|
513694 |
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
|
|
|
513694 |
+ .p2align 4
|
|
|
513694 |
END2 (__strncasecmp)
|
|
|
513694 |
# ifndef NO_NOLOCALE_ALIAS
|
|
|
513694 |
weak_alias (__strncasecmp, strncasecmp)
|
|
|
513694 |
@@ -149,22 +147,22 @@ ENTRY (STRCMP)
|
|
|
513694 |
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
|
|
|
513694 |
.section .rodata.cst16,"aM",@progbits,16
|
|
|
513694 |
.align 16
|
|
|
513694 |
-.Lbelowupper:
|
|
|
513694 |
- .quad 0x4040404040404040
|
|
|
513694 |
- .quad 0x4040404040404040
|
|
|
513694 |
-.Ltopupper:
|
|
|
513694 |
- .quad 0x5b5b5b5b5b5b5b5b
|
|
|
513694 |
- .quad 0x5b5b5b5b5b5b5b5b
|
|
|
513694 |
-.Ltouppermask:
|
|
|
513694 |
+.Llcase_min:
|
|
|
513694 |
+ .quad 0x3f3f3f3f3f3f3f3f
|
|
|
513694 |
+ .quad 0x3f3f3f3f3f3f3f3f
|
|
|
513694 |
+.Llcase_max:
|
|
|
513694 |
+ .quad 0x9999999999999999
|
|
|
513694 |
+ .quad 0x9999999999999999
|
|
|
513694 |
+.Lcase_add:
|
|
|
513694 |
.quad 0x2020202020202020
|
|
|
513694 |
.quad 0x2020202020202020
|
|
|
513694 |
.previous
|
|
|
513694 |
- movdqa .Lbelowupper(%rip), %xmm5
|
|
|
513694 |
-# define UCLOW_reg %xmm5
|
|
|
513694 |
- movdqa .Ltopupper(%rip), %xmm6
|
|
|
513694 |
-# define UCHIGH_reg %xmm6
|
|
|
513694 |
- movdqa .Ltouppermask(%rip), %xmm7
|
|
|
513694 |
-# define LCQWORD_reg %xmm7
|
|
|
513694 |
+ movdqa .Llcase_min(%rip), %xmm5
|
|
|
513694 |
+# define LCASE_MIN_reg %xmm5
|
|
|
513694 |
+ movdqa .Llcase_max(%rip), %xmm6
|
|
|
513694 |
+# define LCASE_MAX_reg %xmm6
|
|
|
513694 |
+ movdqa .Lcase_add(%rip), %xmm7
|
|
|
513694 |
+# define CASE_ADD_reg %xmm7
|
|
|
513694 |
#endif
|
|
|
513694 |
cmp $0x30, %ecx
|
|
|
513694 |
ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
|
|
|
513694 |
@@ -175,22 +173,18 @@ ENTRY (STRCMP)
|
|
|
513694 |
movhpd 8(%rdi), %xmm1
|
|
|
513694 |
movhpd 8(%rsi), %xmm2
|
|
|
513694 |
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
|
|
|
513694 |
-# define TOLOWER(reg1, reg2) \
|
|
|
513694 |
- movdqa reg1, %xmm8; \
|
|
|
513694 |
- movdqa UCHIGH_reg, %xmm9; \
|
|
|
513694 |
- movdqa reg2, %xmm10; \
|
|
|
513694 |
- movdqa UCHIGH_reg, %xmm11; \
|
|
|
513694 |
- pcmpgtb UCLOW_reg, %xmm8; \
|
|
|
513694 |
- pcmpgtb reg1, %xmm9; \
|
|
|
513694 |
- pcmpgtb UCLOW_reg, %xmm10; \
|
|
|
513694 |
- pcmpgtb reg2, %xmm11; \
|
|
|
513694 |
- pand %xmm9, %xmm8; \
|
|
|
513694 |
- pand %xmm11, %xmm10; \
|
|
|
513694 |
- pand LCQWORD_reg, %xmm8; \
|
|
|
513694 |
- pand LCQWORD_reg, %xmm10; \
|
|
|
513694 |
- por %xmm8, reg1; \
|
|
|
513694 |
- por %xmm10, reg2
|
|
|
513694 |
- TOLOWER (%xmm1, %xmm2)
|
|
|
513694 |
+# define TOLOWER(reg1, reg2) \
|
|
|
513694 |
+ movdqa LCASE_MIN_reg, %xmm8; \
|
|
|
513694 |
+ movdqa LCASE_MIN_reg, %xmm9; \
|
|
|
513694 |
+ paddb reg1, %xmm8; \
|
|
|
513694 |
+ paddb reg2, %xmm9; \
|
|
|
513694 |
+ pcmpgtb LCASE_MAX_reg, %xmm8; \
|
|
|
513694 |
+ pcmpgtb LCASE_MAX_reg, %xmm9; \
|
|
|
513694 |
+ pandn CASE_ADD_reg, %xmm8; \
|
|
|
513694 |
+ pandn CASE_ADD_reg, %xmm9; \
|
|
|
513694 |
+ paddb %xmm8, reg1; \
|
|
|
513694 |
+ paddb %xmm9, reg2
|
|
|
513694 |
+ TOLOWER (%xmm1, %xmm2)
|
|
|
513694 |
#else
|
|
|
513694 |
# define TOLOWER(reg1, reg2)
|
|
|
513694 |
#endif
|
|
|
513694 |
--
|
|
|
513694 |
GitLab
|
|
|
513694 |
|