commit 3605c744078bb048d876298aaf12a2869e8071b8 Author: Noah Goldstein Date: Wed Mar 23 16:57:38 2022 -0500 x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Slightly faster method of doing TOLOWER that saves an instruction. Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with CET enabled, the hard-coded NOP misaligned the entry to strcasecmp. geometric_mean(N=40) of all benchmarks New / Original: .920 All string/memory tests pass. Reviewed-by: H.J. Lu (cherry picked from commit d154758e618ec9324f5d339c46db0aa27e8b1226) diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S index 6197a723b9e0606e..a6825de8195ad8c6 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp)) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RDX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 END (GLABEL(__strcasecmp)) /* FALLTHROUGH to strcasecmp_l. */ #endif @@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp)) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RCX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 END (GLABEL(__strncasecmp)) /* FALLTHROUGH to strncasecmp_l. 
*/ #endif @@ -170,27 +168,22 @@ STRCMP_SSE42: #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L .section .rodata.cst16,"aM",@progbits,16 .align 16 -LABEL(belowupper): - .quad 0x4040404040404040 - .quad 0x4040404040404040 -LABEL(topupper): -# ifdef USE_AVX - .quad 0x5a5a5a5a5a5a5a5a - .quad 0x5a5a5a5a5a5a5a5a -# else - .quad 0x5b5b5b5b5b5b5b5b - .quad 0x5b5b5b5b5b5b5b5b -# endif -LABEL(touppermask): +LABEL(lcase_min): + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f +LABEL(lcase_max): + .quad 0x9999999999999999 + .quad 0x9999999999999999 +LABEL(case_add): .quad 0x2020202020202020 .quad 0x2020202020202020 .previous - movdqa LABEL(belowupper)(%rip), %xmm4 -# define UCLOW_reg %xmm4 - movdqa LABEL(topupper)(%rip), %xmm5 -# define UCHIGH_reg %xmm5 - movdqa LABEL(touppermask)(%rip), %xmm6 -# define LCQWORD_reg %xmm6 + movdqa LABEL(lcase_min)(%rip), %xmm4 +# define LCASE_MIN_reg %xmm4 + movdqa LABEL(lcase_max)(%rip), %xmm5 +# define LCASE_MAX_reg %xmm5 + movdqa LABEL(case_add)(%rip), %xmm6 +# define CASE_ADD_reg %xmm6 #endif cmp $0x30, %ecx ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ @@ -201,32 +194,26 @@ LABEL(touppermask): #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L # ifdef USE_AVX # define TOLOWER(reg1, reg2) \ - vpcmpgtb UCLOW_reg, reg1, %xmm7; \ - vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ - vpcmpgtb UCLOW_reg, reg2, %xmm9; \ - vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ - vpandn %xmm7, %xmm8, %xmm8; \ - vpandn %xmm9, %xmm10, %xmm10; \ - vpand LCQWORD_reg, %xmm8, %xmm8; \ - vpand LCQWORD_reg, %xmm10, %xmm10; \ - vpor reg1, %xmm8, reg1; \ - vpor reg2, %xmm10, reg2 + vpaddb LCASE_MIN_reg, reg1, %xmm7; \ + vpaddb LCASE_MIN_reg, reg2, %xmm8; \ + vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ + vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ + vpandn CASE_ADD_reg, %xmm7, %xmm7; \ + vpandn CASE_ADD_reg, %xmm8, %xmm8; \ + vpaddb %xmm7, reg1, reg1; \ + vpaddb %xmm8, reg2, reg2 # else # define TOLOWER(reg1, reg2) \ - movdqa reg1, %xmm7; \ - 
movdqa UCHIGH_reg, %xmm8; \ - movdqa reg2, %xmm9; \ - movdqa UCHIGH_reg, %xmm10; \ - pcmpgtb UCLOW_reg, %xmm7; \ - pcmpgtb reg1, %xmm8; \ - pcmpgtb UCLOW_reg, %xmm9; \ - pcmpgtb reg2, %xmm10; \ - pand %xmm8, %xmm7; \ - pand %xmm10, %xmm9; \ - pand LCQWORD_reg, %xmm7; \ - pand LCQWORD_reg, %xmm9; \ - por %xmm7, reg1; \ - por %xmm9, reg2 + movdqa LCASE_MIN_reg, %xmm7; \ + movdqa LCASE_MIN_reg, %xmm8; \ + paddb reg1, %xmm7; \ + paddb reg2, %xmm8; \ + pcmpgtb LCASE_MAX_reg, %xmm7; \ + pcmpgtb LCASE_MAX_reg, %xmm8; \ + pandn CASE_ADD_reg, %xmm7; \ + pandn CASE_ADD_reg, %xmm8; \ + paddb %xmm7, reg1; \ + paddb %xmm8, reg2 # endif TOLOWER (%xmm1, %xmm2) #else