From 3848cd2cab96c673c98ea339aeefd5a27837f587 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 22 Jun 2021 20:42:10 -0700
Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S

strlen.S contains the SSE2 version of strlen/strnlen and the SSE4.1
version of wcslen/wcsnlen.  Move strlen.S to multiarch/strlen-vec.S
and include multiarch/strlen-vec.S from the SSE2 and SSE4.1 variants.
This also removes the unused symbols __GI___strlen_sse2 and
__GI___wcsnlen_sse4_1.

(cherry picked from commit a0db678071c60b6c47c468d231dd0b3694ba7a98)
---
 sysdeps/x86_64/multiarch/strlen-sse2.S    |   2 +-
 sysdeps/x86_64/multiarch/strlen-vec.S     | 257 ++++++++++++++++++++++
 sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S |   2 +-
 sysdeps/x86_64/strlen.S                   | 243 +-------------------
 4 files changed, 262 insertions(+), 242 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S

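Background on the pattern used by the wrappers below: each variant file
sets configuration macros, renames the entry point, and then includes
the shared body, so one source assembles into strlen, strnlen, wcslen
and wcsnlen flavors.  A minimal C sketch of the same technique (file
and function names here are hypothetical, not glibc's; the real files
are assembly):

    /* len-generic.inc -- shared implementation body.  The including
       file must define LEN_NAME first.  */
    #include <stddef.h>

    size_t LEN_NAME (const char *s)
    {
      const char *p = s;
      while (*p != '\0')   /* Byte scan; strlen-vec.S vectorizes this.  */
        p++;
      return (size_t) (p - s);
    }

    /* strlen-variant.c -- per-variant wrapper, analogous to
       strlen-sse2.S defining strlen to __strlen_sse2 and then
       including strlen-vec.S.  */
    #define LEN_NAME __strlen_variant
    #include "len-generic.inc"

wcsnlen-sse4_1.S configures the same shared body differently: its
AS_WCSLEN and AS_STRNLEN defines switch the source to 4-byte units
(pminud/pcmpeqd) and to a bounded scan before the include.
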
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
index 7bc57b8d..449c8a7f 100644
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
@@ -20,4 +20,4 @@
 # define strlen __strlen_sse2
 #endif
 
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
new file mode 100644
index 00000000..8f660bb9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -0,0 +1,257 @@
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
+   Copyright (C) 2012-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef AS_WCSLEN
+# define PMINU pminud
+# define PCMPEQ pcmpeqd
+# define SHIFT_RETURN shrq $2, %rax
+#else
+# define PMINU pminub
+# define PCMPEQ pcmpeqb
+# define SHIFT_RETURN
+#endif
+
+/* Long lived register in strlen(s), strnlen(s, n) are:
+
+	%xmm3 - zero
+	%rdi - s
+	%r10 (s+n) & (~(64-1))
+	%r11 s+n
+*/
+
+
+.text
+ENTRY(strlen)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
+#define FIND_ZERO \
+	PCMPEQ (%rax), %xmm0; \
+	PCMPEQ 16(%rax), %xmm1; \
+	PCMPEQ 32(%rax), %xmm2; \
+	PCMPEQ 48(%rax), %xmm3; \
+	pmovmskb %xmm0, %esi; \
+	pmovmskb %xmm1, %edx; \
+	pmovmskb %xmm2, %r8d; \
+	pmovmskb %xmm3, %ecx; \
+	salq $16, %rdx; \
+	salq $16, %rcx; \
+	orq %rsi, %rdx; \
+	orq %r8, %rcx; \
+	salq $32, %rcx; \
+	orq %rcx, %rdx;
+
+#ifdef AS_STRNLEN
+/* Do not read anything when n==0. */
+	test %RSI_LP, %RSI_LP
+	jne L(n_nonzero)
+	xor %rax, %rax
+	ret
+L(n_nonzero):
+# ifdef AS_WCSLEN
+	shl $2, %RSI_LP
+# endif
+
+/* Initialize long lived registers. */
+
+	add %RDI_LP, %RSI_LP
+	mov %RSI_LP, %R10_LP
+	and $-64, %R10_LP
+	mov %RSI_LP, %R11_LP
+#endif
+
+	pxor %xmm0, %xmm0
+	pxor %xmm1, %xmm1
+	pxor %xmm2, %xmm2
+	pxor %xmm3, %xmm3
+	movq %rdi, %rax
+	movq %rdi, %rcx
+	andq $4095, %rcx
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
+	cmpq $4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower. */
+	ja L(cross_page)
+
+#ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes. */
+# define STRNLEN_PROLOG \
+	mov %r11, %rsi; \
+	subq %rax, %rsi; \
+	andq $-64, %rax; \
+	testq $-64, %rsi; \
+	je L(strnlen_ret)
+#else
+# define STRNLEN_PROLOG andq $-64, %rax;
+#endif
+
+/* Ignore bits in mask that come before start of string. */
+#define PROLOG(lab) \
+	movq %rdi, %rcx; \
+	xorq %rax, %rcx; \
+	STRNLEN_PROLOG; \
+	sarq %cl, %rdx; \
+	test %rdx, %rdx; \
+	je L(lab); \
+	bsfq %rdx, %rax; \
+	SHIFT_RETURN; \
+	ret
+
+#ifdef AS_STRNLEN
+	andq $-16, %rax
+	FIND_ZERO
+#else
+	/* Test first 16 bytes unaligned. */
+	movdqu (%rax), %xmm4
+	PCMPEQ %xmm0, %xmm4
+	pmovmskb %xmm4, %edx
+	test %edx, %edx
+	je L(next48_bytes)
+	bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
+	SHIFT_RETURN
+	ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes. */
+	andq $-16, %rax
+	PCMPEQ 16(%rax), %xmm1
+	PCMPEQ 32(%rax), %xmm2
+	PCMPEQ 48(%rax), %xmm3
+	pmovmskb %xmm1, %edx
+	pmovmskb %xmm2, %r8d
+	pmovmskb %xmm3, %ecx
+	salq $16, %rdx
+	salq $16, %rcx
+	orq %r8, %rcx
+	salq $32, %rcx
+	orq %rcx, %rdx
+#endif
+
+	/* When no zero byte is found xmm1-3 are zero so we do not have to
+	   zero them. */
+	PROLOG(loop)
+
+	.p2align 4
+L(cross_page):
+	andq $-64, %rax
+	FIND_ZERO
+	PROLOG(loop_init)
+
+#ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1). */
+L(strnlen_ret):
+	bts %rsi, %rdx
+	sarq %cl, %rdx
+	test %rdx, %rdx
+	je L(loop_init)
+	bsfq %rdx, %rax
+	SHIFT_RETURN
+	ret
+#endif
+	.p2align 4
+L(loop_init):
+	pxor %xmm1, %xmm1
+	pxor %xmm2, %xmm2
+	pxor %xmm3, %xmm3
+#ifdef AS_STRNLEN
+	.p2align 4
+L(loop):
+
+	addq $64, %rax
+	cmpq %rax, %r10
+	je L(exit_end)
+
+	movdqa (%rax), %xmm0
+	PMINU 16(%rax), %xmm0
+	PMINU 32(%rax), %xmm0
+	PMINU 48(%rax), %xmm0
+	PCMPEQ %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	testl %edx, %edx
+	jne L(exit)
+	jmp L(loop)
+
+	.p2align 4
+L(exit_end):
+	cmp %rax, %r11
+	je L(first) /* Do not read when end is at page boundary. */
+	pxor %xmm0, %xmm0
+	FIND_ZERO
+
+L(first):
+	bts %r11, %rdx
+	bsfq %rdx, %rdx
+	addq %rdx, %rax
+	subq %rdi, %rax
+	SHIFT_RETURN
+	ret
+
+	.p2align 4
+L(exit):
+	pxor %xmm0, %xmm0
+	FIND_ZERO
+
+	bsfq %rdx, %rdx
+	addq %rdx, %rax
+	subq %rdi, %rax
+	SHIFT_RETURN
+	ret
+
+#else
+
+	/* Main loop. Unrolled twice to improve L2 cache performance on core2. */
+	.p2align 4
+L(loop):
+
+	movdqa 64(%rax), %xmm0
+	PMINU 80(%rax), %xmm0
+	PMINU 96(%rax), %xmm0
+	PMINU 112(%rax), %xmm0
+	PCMPEQ %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	testl %edx, %edx
+	jne L(exit64)
+
+	subq $-128, %rax
+
+	movdqa (%rax), %xmm0
+	PMINU 16(%rax), %xmm0
+	PMINU 32(%rax), %xmm0
+	PMINU 48(%rax), %xmm0
+	PCMPEQ %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	testl %edx, %edx
+	jne L(exit0)
+	jmp L(loop)
+
+	.p2align 4
+L(exit64):
+	addq $64, %rax
+L(exit0):
+	pxor %xmm0, %xmm0
+	FIND_ZERO
+
+	bsfq %rdx, %rdx
+	addq %rdx, %rax
+	subq %rdi, %rax
+	SHIFT_RETURN
+	ret
+
+#endif
+
+END(strlen)
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index a8cab0cb..5fa51fe0 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -2,4 +2,4 @@
 #define AS_STRNLEN
 #define strlen __wcsnlen_sse4_1
 
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index f845f3d4..ad047d84 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
-/* SSE2 version of strlen/wcslen.
-   Copyright (C) 2012-2018 Free Software Foundation, Inc.
+/* SSE2 version of strlen.
+   Copyright (C) 2021 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,243 +16,6 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>. */
 
-#include <sysdep.h>
+#include "multiarch/strlen-vec.S"
 
-#ifdef AS_WCSLEN
-# define PMINU pminud
-# define PCMPEQ pcmpeqd
-# define SHIFT_RETURN shrq $2, %rax
-#else
-# define PMINU pminub
-# define PCMPEQ pcmpeqb
-# define SHIFT_RETURN
-#endif
-
-/* Long lived register in strlen(s), strnlen(s, n) are:
-
-	%xmm3 - zero
-	%rdi - s
-	%r10 (s+n) & (~(64-1))
-	%r11 s+n
-*/
-
-
-.text
-ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
-#define FIND_ZERO \
-	PCMPEQ (%rax), %xmm0; \
-	PCMPEQ 16(%rax), %xmm1; \
-	PCMPEQ 32(%rax), %xmm2; \
-	PCMPEQ 48(%rax), %xmm3; \
-	pmovmskb %xmm0, %esi; \
-	pmovmskb %xmm1, %edx; \
-	pmovmskb %xmm2, %r8d; \
-	pmovmskb %xmm3, %ecx; \
-	salq $16, %rdx; \
-	salq $16, %rcx; \
-	orq %rsi, %rdx; \
-	orq %r8, %rcx; \
-	salq $32, %rcx; \
-	orq %rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0. */
-	test %RSI_LP, %RSI_LP
-	jne L(n_nonzero)
-	xor %rax, %rax
-	ret
-L(n_nonzero):
-# ifdef AS_WCSLEN
-	shl $2, %RSI_LP
-# endif
-
-/* Initialize long lived registers. */
-
-	add %RDI_LP, %RSI_LP
-	mov %RSI_LP, %R10_LP
-	and $-64, %R10_LP
-	mov %RSI_LP, %R11_LP
-#endif
-
-	pxor %xmm0, %xmm0
-	pxor %xmm1, %xmm1
-	pxor %xmm2, %xmm2
-	pxor %xmm3, %xmm3
-	movq %rdi, %rax
-	movq %rdi, %rcx
-	andq $4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
-	cmpq $4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower. */
-	ja L(cross_page)
-
-#ifdef AS_STRNLEN
-/* Test if end is among first 64 bytes. */
-# define STRNLEN_PROLOG \
-	mov %r11, %rsi; \
-	subq %rax, %rsi; \
-	andq $-64, %rax; \
-	testq $-64, %rsi; \
-	je L(strnlen_ret)
-#else
-# define STRNLEN_PROLOG andq $-64, %rax;
-#endif
-
-/* Ignore bits in mask that come before start of string. */
-#define PROLOG(lab) \
-	movq %rdi, %rcx; \
-	xorq %rax, %rcx; \
-	STRNLEN_PROLOG; \
-	sarq %cl, %rdx; \
-	test %rdx, %rdx; \
-	je L(lab); \
-	bsfq %rdx, %rax; \
-	SHIFT_RETURN; \
-	ret
-
-#ifdef AS_STRNLEN
-	andq $-16, %rax
-	FIND_ZERO
-#else
-	/* Test first 16 bytes unaligned. */
-	movdqu (%rax), %xmm4
-	PCMPEQ %xmm0, %xmm4
-	pmovmskb %xmm4, %edx
-	test %edx, %edx
-	je L(next48_bytes)
-	bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
-	SHIFT_RETURN
-	ret
-
-L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
-	andq $-16, %rax
-	PCMPEQ 16(%rax), %xmm1
-	PCMPEQ 32(%rax), %xmm2
-	PCMPEQ 48(%rax), %xmm3
-	pmovmskb %xmm1, %edx
-	pmovmskb %xmm2, %r8d
-	pmovmskb %xmm3, %ecx
-	salq $16, %rdx
-	salq $16, %rcx
-	orq %r8, %rcx
-	salq $32, %rcx
-	orq %rcx, %rdx
-#endif
-
-	/* When no zero byte is found xmm1-3 are zero so we do not have to
-	   zero them. */
-	PROLOG(loop)
-
-	.p2align 4
-L(cross_page):
-	andq $-64, %rax
-	FIND_ZERO
-	PROLOG(loop_init)
-
-#ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1). */
-L(strnlen_ret):
-	bts %rsi, %rdx
-	sarq %cl, %rdx
-	test %rdx, %rdx
-	je L(loop_init)
-	bsfq %rdx, %rax
-	SHIFT_RETURN
-	ret
-#endif
-	.p2align 4
-L(loop_init):
-	pxor %xmm1, %xmm1
-	pxor %xmm2, %xmm2
-	pxor %xmm3, %xmm3
-#ifdef AS_STRNLEN
-	.p2align 4
-L(loop):
-
-	addq $64, %rax
-	cmpq %rax, %r10
-	je L(exit_end)
-
-	movdqa (%rax), %xmm0
-	PMINU 16(%rax), %xmm0
-	PMINU 32(%rax), %xmm0
-	PMINU 48(%rax), %xmm0
-	PCMPEQ %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	testl %edx, %edx
-	jne L(exit)
-	jmp L(loop)
-
-	.p2align 4
-L(exit_end):
-	cmp %rax, %r11
-	je L(first) /* Do not read when end is at page boundary. */
-	pxor %xmm0, %xmm0
-	FIND_ZERO
-
-L(first):
-	bts %r11, %rdx
-	bsfq %rdx, %rdx
-	addq %rdx, %rax
-	subq %rdi, %rax
-	SHIFT_RETURN
-	ret
-
-	.p2align 4
-L(exit):
-	pxor %xmm0, %xmm0
-	FIND_ZERO
-
-	bsfq %rdx, %rdx
-	addq %rdx, %rax
-	subq %rdi, %rax
-	SHIFT_RETURN
-	ret
-
-#else
-
-	/* Main loop. Unrolled twice to improve L2 cache performance on core2. */
-	.p2align 4
-L(loop):
-
-	movdqa 64(%rax), %xmm0
-	PMINU 80(%rax), %xmm0
-	PMINU 96(%rax), %xmm0
-	PMINU 112(%rax), %xmm0
-	PCMPEQ %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	testl %edx, %edx
-	jne L(exit64)
-
-	subq $-128, %rax
-
-	movdqa (%rax), %xmm0
-	PMINU 16(%rax), %xmm0
-	PMINU 32(%rax), %xmm0
-	PMINU 48(%rax), %xmm0
-	PCMPEQ %xmm3, %xmm0
-	pmovmskb %xmm0, %edx
-	testl %edx, %edx
-	jne L(exit0)
-	jmp L(loop)
-
-	.p2align 4
-L(exit64):
-	addq $64, %rax
-L(exit0):
-	pxor %xmm0, %xmm0
-	FIND_ZERO
-
-	bsfq %rdx, %rdx
-	addq %rdx, %rax
-	subq %rdi, %rax
-	SHIFT_RETURN
-	ret
-
-#endif
-
-END(strlen)
 libc_hidden_builtin_def (strlen)
-- 
GitLab

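For readers tracing the moved code: the heart of strlen-vec.S is the
FIND_ZERO macro, which tests a 64-byte block for zero bytes and merges
the four 16-bit pmovmskb results into a single 64-bit mask.  A rough C
equivalent using SSE2 intrinsics (an illustrative sketch only, not part
of the patch; find_zero_mask is a hypothetical name):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Bit i of the result is set when byte i of the 64-byte block at P
       is zero.  P must be readable through P+63; the call sites in the
       patch align %rax so the reads never cross into an unmapped page.  */
    static uint64_t
    find_zero_mask (const char *p)
    {
      const __m128i zero = _mm_setzero_si128 ();
      uint64_t m0 = (uint32_t) _mm_movemask_epi8
        (_mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (p + 0)), zero));
      uint64_t m1 = (uint32_t) _mm_movemask_epi8
        (_mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (p + 16)), zero));
      uint64_t m2 = (uint32_t) _mm_movemask_epi8
        (_mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (p + 32)), zero));
      uint64_t m3 = (uint32_t) _mm_movemask_epi8
        (_mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (p + 48)), zero));
      /* Same merge as the salq/orq sequence in the macro.  */
      return m0 | (m1 << 16) | ((m2 | (m3 << 16)) << 32);
    }

A nonzero mask is then searched with the equivalent of bsfq
(__builtin_ctzll): the lowest set bit gives the offset of the first
zero byte in the block.  PROLOG's "sarq %cl, %rdx" discards mask bits
for bytes that precede the start of the string, and SHIFT_RETURN
divides by 4 in the wide-character variants so the result is returned
in characters rather than bytes.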