| From 017773f93b0e41f3b164e5db86d0c7b7f75675e9 Mon Sep 17 00:00:00 2001 |
| From: Noah Goldstein <goldstein.w.n@gmail.com> |
| Date: Wed, 23 Mar 2022 16:57:36 -0500 |
| Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S |
| |
| Slightly faster method of doing TOLOWER that saves an |
| instruction. |
| |
| Also replace the hard coded 5-byte no with .p2align 4. On builds with |
| CET enabled this misaligned entry to strcasecmp. |
| |
| geometric_mean(N=40) of all benchmarks New / Original: .894 |
| |
| All string/memory tests pass. |
| Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
| |
| (cherry picked from commit 670b54bc585ea4a94f3b2e9272ba44aa6b730b73) |
| |
| sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++---------------------- |
| 1 file changed, 29 insertions(+), 35 deletions(-) |
| |
| diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S |
| index aa6df898..f454ce5b 100644 |
| |
| |
| @@ -78,9 +78,8 @@ ENTRY2 (__strcasecmp) |
| movq __libc_tsd_LOCALE@gottpoff(%rip),%rax |
| mov %fs:(%rax),%RDX_LP |
| |
| - // XXX 5 byte should be before the function |
| - /* 5-byte NOP. */ |
| - .byte 0x0f,0x1f,0x44,0x00,0x00 |
| + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ |
| + .p2align 4 |
| END2 (__strcasecmp) |
| # ifndef NO_NOLOCALE_ALIAS |
| weak_alias (__strcasecmp, strcasecmp) |
| @@ -97,9 +96,8 @@ ENTRY2 (__strncasecmp) |
| movq __libc_tsd_LOCALE@gottpoff(%rip),%rax |
| mov %fs:(%rax),%RCX_LP |
| |
| - // XXX 5 byte should be before the function |
| - /* 5-byte NOP. */ |
| - .byte 0x0f,0x1f,0x44,0x00,0x00 |
| + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ |
| + .p2align 4 |
| END2 (__strncasecmp) |
| # ifndef NO_NOLOCALE_ALIAS |
| weak_alias (__strncasecmp, strncasecmp) |
| @@ -149,22 +147,22 @@ ENTRY (STRCMP) |
| #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
| .section .rodata.cst16,"aM",@progbits,16 |
| .align 16 |
| -.Lbelowupper: |
| - .quad 0x4040404040404040 |
| - .quad 0x4040404040404040 |
| -.Ltopupper: |
| - .quad 0x5b5b5b5b5b5b5b5b |
| - .quad 0x5b5b5b5b5b5b5b5b |
| -.Ltouppermask: |
| +.Llcase_min: |
| + .quad 0x3f3f3f3f3f3f3f3f |
| + .quad 0x3f3f3f3f3f3f3f3f |
| +.Llcase_max: |
| + .quad 0x9999999999999999 |
| + .quad 0x9999999999999999 |
| +.Lcase_add: |
| .quad 0x2020202020202020 |
| .quad 0x2020202020202020 |
| .previous |
| - movdqa .Lbelowupper(%rip), %xmm5 |
| -# define UCLOW_reg %xmm5 |
| - movdqa .Ltopupper(%rip), %xmm6 |
| -# define UCHIGH_reg %xmm6 |
| - movdqa .Ltouppermask(%rip), %xmm7 |
| -# define LCQWORD_reg %xmm7 |
| + movdqa .Llcase_min(%rip), %xmm5 |
| +# define LCASE_MIN_reg %xmm5 |
| + movdqa .Llcase_max(%rip), %xmm6 |
| +# define LCASE_MAX_reg %xmm6 |
| + movdqa .Lcase_add(%rip), %xmm7 |
| +# define CASE_ADD_reg %xmm7 |
| #endif |
| cmp $0x30, %ecx |
| ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ |
| @@ -175,22 +173,18 @@ ENTRY (STRCMP) |
| movhpd 8(%rdi), %xmm1 |
| movhpd 8(%rsi), %xmm2 |
| #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L |
| -# define TOLOWER(reg1, reg2) \ |
| - movdqa reg1, %xmm8; \ |
| - movdqa UCHIGH_reg, %xmm9; \ |
| - movdqa reg2, %xmm10; \ |
| - movdqa UCHIGH_reg, %xmm11; \ |
| - pcmpgtb UCLOW_reg, %xmm8; \ |
| - pcmpgtb reg1, %xmm9; \ |
| - pcmpgtb UCLOW_reg, %xmm10; \ |
| - pcmpgtb reg2, %xmm11; \ |
| - pand %xmm9, %xmm8; \ |
| - pand %xmm11, %xmm10; \ |
| - pand LCQWORD_reg, %xmm8; \ |
| - pand LCQWORD_reg, %xmm10; \ |
| - por %xmm8, reg1; \ |
| - por %xmm10, reg2 |
| - TOLOWER (%xmm1, %xmm2) |
| +# define TOLOWER(reg1, reg2) \ |
| + movdqa LCASE_MIN_reg, %xmm8; \ |
| + movdqa LCASE_MIN_reg, %xmm9; \ |
| + paddb reg1, %xmm8; \ |
| + paddb reg2, %xmm9; \ |
| + pcmpgtb LCASE_MAX_reg, %xmm8; \ |
| + pcmpgtb LCASE_MAX_reg, %xmm9; \ |
| + pandn CASE_ADD_reg, %xmm8; \ |
| + pandn CASE_ADD_reg, %xmm9; \ |
| + paddb %xmm8, reg1; \ |
| + paddb %xmm9, reg2 |
| + TOLOWER (%xmm1, %xmm2) |
| #else |
| # define TOLOWER(reg1, reg2) |
| #endif |
| -- |
| GitLab |
| |