|
|
513694 |
From cdcf8794677acba1fc38ac101bcf52deee23d91d Mon Sep 17 00:00:00 2001
|
|
|
513694 |
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
|
513694 |
Date: Wed, 23 Mar 2022 16:57:24 -0500
|
|
|
513694 |
Subject: [PATCH] x86: Optimize strspn in strspn-c.c
|
|
|
513694 |
|
|
|
513694 |
Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
|
|
|
513694 |
_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
|
|
|
513694 |
sign extensions.
|
|
|
513694 |
|
|
|
513694 |
geometric_mean(N=20) of all benchmarks that don't fall back on
|
|
|
513694 |
sse2; New / Original: .901
|
|
|
513694 |
|
|
|
513694 |
All string/memory tests pass.
|
|
|
513694 |
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
513694 |
|
|
|
513694 |
(cherry picked from commit 412d10343168b05b8cf6c3683457cf9711d28046)
|
|
|
513694 |
---
|
|
|
513694 |
sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
|
|
|
513694 |
1 file changed, 39 insertions(+), 47 deletions(-)
|
|
|
513694 |
|
|
|
513694 |
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
|
|
|
513694 |
index 4554cff0..87c5e4bf 100644
|
|
|
513694 |
--- a/sysdeps/x86_64/multiarch/strspn-c.c
|
|
|
513694 |
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
|
|
|
513694 |
@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a)
|
|
|
513694 |
return 0;
|
|
|
513694 |
|
|
|
513694 |
const char *aligned;
|
|
|
513694 |
- __m128i mask;
|
|
|
513694 |
- int offset = (int) ((size_t) a & 15);
|
|
|
513694 |
+ __m128i mask, maskz, zero;
|
|
|
513694 |
+ unsigned int maskz_bits;
|
|
|
513694 |
+ unsigned int offset = (int) ((size_t) a & 15);
|
|
|
513694 |
+ zero = _mm_set1_epi8 (0);
|
|
|
513694 |
if (offset != 0)
|
|
|
513694 |
{
|
|
|
513694 |
/* Load masks. */
|
|
|
513694 |
aligned = (const char *) ((size_t) a & -16L);
|
|
|
513694 |
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
|
|
|
513694 |
-
|
|
|
513694 |
- mask = __m128i_shift_right (mask0, offset);
|
|
|
513694 |
+ maskz = _mm_cmpeq_epi8 (mask0, zero);
|
|
|
513694 |
|
|
|
513694 |
/* Find where the NULL terminator is. */
|
|
|
513694 |
- int length = _mm_cmpistri (mask, mask, 0x3a);
|
|
|
513694 |
- if (length == 16 - offset)
|
|
|
513694 |
- {
|
|
|
513694 |
- /* There is no NULL terminator. */
|
|
|
513694 |
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
|
|
|
513694 |
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
|
|
|
513694 |
- length += index;
|
|
|
513694 |
-
|
|
|
513694 |
- /* Don't use SSE4.2 if the length of A > 16. */
|
|
|
513694 |
- if (length > 16)
|
|
|
513694 |
- return __strspn_sse2 (s, a);
|
|
|
513694 |
-
|
|
|
513694 |
- if (index != 0)
|
|
|
513694 |
- {
|
|
|
513694 |
- /* Combine mask0 and mask1. We could play games with
|
|
|
513694 |
- palignr, but frankly this data should be in L1 now
|
|
|
513694 |
- so do the merge via an unaligned load. */
|
|
|
513694 |
- mask = _mm_loadu_si128 ((__m128i *) a);
|
|
|
513694 |
- }
|
|
|
513694 |
- }
|
|
|
513694 |
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
|
|
|
513694 |
+ if (maskz_bits != 0)
|
|
|
513694 |
+ {
|
|
|
513694 |
+ mask = __m128i_shift_right (mask0, offset);
|
|
|
513694 |
+ offset = (unsigned int) ((size_t) s & 15);
|
|
|
513694 |
+ if (offset)
|
|
|
513694 |
+ goto start_unaligned;
|
|
|
513694 |
+
|
|
|
513694 |
+ aligned = s;
|
|
|
513694 |
+ goto start_loop;
|
|
|
513694 |
+ }
|
|
|
513694 |
}
|
|
|
513694 |
- else
|
|
|
513694 |
- {
|
|
|
513694 |
- /* A is aligned. */
|
|
|
513694 |
- mask = _mm_load_si128 ((__m128i *) a);
|
|
|
513694 |
|
|
|
513694 |
- /* Find where the NULL terminator is. */
|
|
|
513694 |
- int length = _mm_cmpistri (mask, mask, 0x3a);
|
|
|
513694 |
- if (length == 16)
|
|
|
513694 |
- {
|
|
|
513694 |
- /* There is no NULL terminator. Don't use SSE4.2 if the length
|
|
|
513694 |
- of A > 16. */
|
|
|
513694 |
- if (a[16] != 0)
|
|
|
513694 |
- return __strspn_sse2 (s, a);
|
|
|
513694 |
- }
|
|
|
513694 |
+ /* A is aligned. */
|
|
|
513694 |
+ mask = _mm_loadu_si128 ((__m128i *) a);
|
|
|
513694 |
+
|
|
|
513694 |
+ /* Find where the NULL terminator is. */
|
|
|
513694 |
+ maskz = _mm_cmpeq_epi8 (mask, zero);
|
|
|
513694 |
+ maskz_bits = _mm_movemask_epi8 (maskz);
|
|
|
513694 |
+ if (maskz_bits == 0)
|
|
|
513694 |
+ {
|
|
|
513694 |
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
|
|
|
513694 |
+ of A > 16. */
|
|
|
513694 |
+ if (a[16] != 0)
|
|
|
513694 |
+ return __strspn_sse2 (s, a);
|
|
|
513694 |
}
|
|
|
513694 |
+ aligned = s;
|
|
|
513694 |
+ offset = (unsigned int) ((size_t) s & 15);
|
|
|
513694 |
|
|
|
513694 |
- offset = (int) ((size_t) s & 15);
|
|
|
513694 |
if (offset != 0)
|
|
|
513694 |
{
|
|
|
513694 |
+ start_unaligned:
|
|
|
513694 |
/* Check partial string. */
|
|
|
513694 |
aligned = (const char *) ((size_t) s & -16L);
|
|
|
513694 |
__m128i value = _mm_load_si128 ((__m128i *) aligned);
|
|
|
513694 |
+ __m128i adj_value = __m128i_shift_right (value, offset);
|
|
|
513694 |
|
|
|
513694 |
- value = __m128i_shift_right (value, offset);
|
|
|
513694 |
-
|
|
|
513694 |
- int length = _mm_cmpistri (mask, value, 0x12);
|
|
|
513694 |
+ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
|
|
|
513694 |
/* No need to check CFlag since it is always 1. */
|
|
|
513694 |
if (length < 16 - offset)
|
|
|
513694 |
return length;
|
|
|
513694 |
/* Find where the NULL terminator is. */
|
|
|
513694 |
- int index = _mm_cmpistri (value, value, 0x3a);
|
|
|
513694 |
- if (index < 16 - offset)
|
|
|
513694 |
+ maskz = _mm_cmpeq_epi8 (value, zero);
|
|
|
513694 |
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
|
|
|
513694 |
+ if (maskz_bits != 0)
|
|
|
513694 |
return length;
|
|
|
513694 |
aligned += 16;
|
|
|
513694 |
}
|
|
|
513694 |
- else
|
|
|
513694 |
- aligned = s;
|
|
|
513694 |
|
|
|
513694 |
+start_loop:
|
|
|
513694 |
while (1)
|
|
|
513694 |
{
|
|
|
513694 |
__m128i value = _mm_load_si128 ((__m128i *) aligned);
|
|
|
513694 |
- int index = _mm_cmpistri (mask, value, 0x12);
|
|
|
513694 |
- int cflag = _mm_cmpistrc (mask, value, 0x12);
|
|
|
513694 |
+ unsigned int index = _mm_cmpistri (mask, value, 0x12);
|
|
|
513694 |
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
|
|
|
513694 |
if (cflag)
|
|
|
513694 |
return (size_t) (aligned + index - s);
|
|
|
513694 |
aligned += 16;
|
|
|
513694 |
--
|
|
|
513694 |
GitLab
|
|
|
513694 |
|