| commit 5e0a7ecb6629461b28adc1a5aabcc0ede122f201 |
| Author: Wilco Dijkstra <wdijkstr@arm.com> |
| Date: Wed Jun 12 11:38:52 2019 +0100 |
| |
| Improve performance of strstr |
| |
| This patch significantly improves performance of strstr using a novel |
| modified Horspool algorithm. Needles up to size 256 use a bad-character |
| table indexed by hashed pairs of characters to quickly skip past mismatches. |
| Long needles use a self-adapting filtering step to avoid comparing the whole |
| needle repeatedly. |
| |
| By limiting the needle length to 256, the shift table only requires 8 bits |
| per entry, lowering preprocessing overhead and minimizing cache effects. |
| This limit also implies worst-case performance is linear. |
| |
| Small needles up to size 3 use a dedicated linear search. Very long needles |
| use the Two-Way algorithm. |
| |
| The performance gain using the improved bench-strstr on Cortex-A72 is 5.8 |
| times basic_strstr and 3.7 times twoway_strstr. |
| |
| Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test |
| (https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c). |
| |
| Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com> |
| |
| * string/str-two-way.h (two_way_short_needle): Add inline to avoid |
| warning. |
| (two_way_long_needle): Block inlining. |
| * string/strstr.c (strstr2): Add new function. |
| (strstr3): Likewise. |
| (STRSTR): Completely rewrite strstr to improve performance. |
| |
| diff --git a/string/str-two-way.h b/string/str-two-way.h |
| index 523d946c59412e1f..358959bef0fd6f74 100644 |
| |
| |
| @@ -221,7 +221,7 @@ critical_factorization (const unsigned char *needle, size_t needle_len, |
| most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. |
| If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * |
| HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ |
| -static RETURN_TYPE |
| +static inline RETURN_TYPE |
| two_way_short_needle (const unsigned char *haystack, size_t haystack_len, |
| const unsigned char *needle, size_t needle_len) |
| { |
| @@ -382,8 +382,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, |
| and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. |
| If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * |
| HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and |
| - sublinear performance is not possible. */ |
| -static RETURN_TYPE |
| + sublinear performance is not possible. |
| + |
| + Since this function is large and complex, block inlining to avoid |
| + slowing down the common case of small needles. */ |
| +__attribute__((noinline)) static RETURN_TYPE |
| two_way_long_needle (const unsigned char *haystack, size_t haystack_len, |
| const unsigned char *needle, size_t needle_len) |
| { |
| diff --git a/string/strstr.c b/string/strstr.c |
| index f74d7189ed1319f6..7ffb18ab4285b060 100644 |
| |
| |
| @@ -16,29 +16,17 @@ |
| License along with the GNU C Library; if not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| -/* This particular implementation was written by Eric Blake, 2008. */ |
| - |
| #ifndef _LIBC |
| # include <config.h> |
| #endif |
| |
| -/* Specification of strstr. */ |
| #include <string.h> |
| |
| -#include <stdbool.h> |
| - |
| -#ifndef _LIBC |
| -# define __builtin_expect(expr, val) (expr) |
| -#endif |
| - |
| #define RETURN_TYPE char * |
| #define AVAILABLE(h, h_l, j, n_l) \ |
| (((j) + (n_l) <= (h_l)) \ |
| || ((h_l) += __strnlen ((void*)((h) + (h_l)), (n_l) + 512), \ |
| (j) + (n_l) <= (h_l))) |
| -#define CHECK_EOL (1) |
| -#define RET0_IF_0(a) if (!a) goto ret0 |
| -#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C)) |
| #include "str-two-way.h" |
| |
| #undef strstr |
| @@ -47,47 +35,128 @@ |
| #define STRSTR strstr |
| #endif |
| |
| -/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK |
| - if NEEDLE is empty, otherwise NULL if NEEDLE is not found in |
| - HAYSTACK. */ |
| -char * |
| -STRSTR (const char *haystack, const char *needle) |
| +static inline char * |
| +strstr2 (const unsigned char *hs, const unsigned char *ne) |
| { |
| - size_t needle_len; /* Length of NEEDLE. */ |
| - size_t haystack_len; /* Known minimum length of HAYSTACK. */ |
| - |
| - /* Handle empty NEEDLE special case. */ |
| - if (needle[0] == '\0') |
| - return (char *) haystack; |
| + uint32_t h1 = (ne[0] << 16) | ne[1]; |
| + uint32_t h2 = 0; |
| + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) |
| + h2 = (h2 << 16) | c; |
| + return h1 == h2 ? (char *)hs - 2 : NULL; |
| +} |
| |
| - /* Skip until we find the first matching char from NEEDLE. */ |
| - haystack = strchr (haystack, needle[0]); |
| - if (haystack == NULL || needle[1] == '\0') |
| - return (char *) haystack; |
| +static inline char * |
| +strstr3 (const unsigned char *hs, const unsigned char *ne) |
| +{ |
| + uint32_t h1 = ((uint32_t)ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8); |
| + uint32_t h2 = 0; |
| + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) |
| + h2 = (h2 | c) << 8; |
| + return h1 == h2 ? (char *)hs - 3 : NULL; |
| +} |
| |
| - /* Ensure HAYSTACK length is at least as long as NEEDLE length. |
| - Since a match may occur early on in a huge HAYSTACK, use strnlen |
| +/* Hash character pairs so a small shift table can be used. All bits of |
| + p[0] are included, but not all bits from p[-1]. So if two equal hashes |
| + match on p[-1], p[0] matches too. Hash collisions are harmless and result |
| + in smaller shifts. */ |
| +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift)) |
| + |
| +/* Fast strstr algorithm with guaranteed linear-time performance. |
| + Small needles up to size 3 use a dedicated linear search. Longer needles |
| + up to size 256 use a novel modified Horspool algorithm. It hashes pairs |
| + of characters to quickly skip past mismatches. The main search loop only |
| + exits if the last 2 characters match, avoiding unnecessary calls to memcmp |
| + and allowing for a larger skip if there is no match. A self-adapting |
| + filtering check is used to quickly detect mismatches in long needles. |
| + By limiting the needle length to 256, the shift table can be reduced to 8 |
| + bits per entry, lowering preprocessing overhead and minimizing cache effects. |
| + The limit also implies worst-case performance is linear. |
| + Needles larger than 256 characters use the linear-time Two-Way algorithm. */ |
| +char * |
| +STRSTR (const char *haystack, const char *needle) |
| +{ |
| + const unsigned char *hs = (const unsigned char *) haystack; |
| + const unsigned char *ne = (const unsigned char *) needle; |
| + |
| + /* Handle short needle special cases first. */ |
| + if (ne[0] == '\0') |
| + return (char *)hs; |
| + hs = (const unsigned char *)strchr ((const char*)hs, ne[0]); |
| + if (hs == NULL || ne[1] == '\0') |
| + return (char*)hs; |
| + if (ne[2] == '\0') |
| + return strstr2 (hs, ne); |
| + if (ne[3] == '\0') |
| + return strstr3 (hs, ne); |
| + |
| + /* Ensure haystack length is at least as long as needle length. |
| + Since a match may occur early on in a huge haystack, use strnlen |
| and read ahead a few cachelines for improved performance. */ |
| - needle_len = strlen (needle); |
| - haystack_len = __strnlen (haystack, needle_len + 256); |
| - if (haystack_len < needle_len) |
| + size_t ne_len = strlen ((const char*)ne); |
| + size_t hs_len = __strnlen ((const char*)hs, ne_len | 512); |
| + if (hs_len < ne_len) |
| return NULL; |
| |
| - /* Check whether we have a match. This improves performance since we avoid |
| - the initialization overhead of the two-way algorithm. */ |
| - if (memcmp (haystack, needle, needle_len) == 0) |
| - return (char *) haystack; |
| - |
| - /* Perform the search. Abstract memory is considered to be an array |
| - of 'unsigned char' values, not an array of 'char' values. See |
| - ISO C 99 section 6.2.6.1. */ |
| - if (needle_len < LONG_NEEDLE_THRESHOLD) |
| - return two_way_short_needle ((const unsigned char *) haystack, |
| - haystack_len, |
| - (const unsigned char *) needle, needle_len); |
| - return two_way_long_needle ((const unsigned char *) haystack, haystack_len, |
| - (const unsigned char *) needle, needle_len); |
| + /* Check whether we have a match. This improves performance since we |
| + avoid initialization overheads. */ |
| + if (memcmp (hs, ne, ne_len) == 0) |
| + return (char *) hs; |
| + |
| + /* Use Two-Way algorithm for very long needles. */ |
| + if (__glibc_unlikely (ne_len > 256)) |
| + return two_way_long_needle (hs, hs_len, ne, ne_len); |
| + |
| + const unsigned char *end = hs + hs_len - ne_len; |
| + uint8_t shift[256]; |
| + size_t tmp, shift1; |
| + size_t m1 = ne_len - 1; |
| + size_t offset = 0; |
| + |
| + /* Initialize bad character shift hash table. */ |
| + memset (shift, 0, sizeof (shift)); |
| + for (int i = 1; i < m1; i++) |
| + shift[hash2 (ne + i)] = i; |
| + /* Shift1 is the amount we can skip after matching the hash of the |
| + needle end but not the full needle. */ |
| + shift1 = m1 - shift[hash2 (ne + m1)]; |
| + shift[hash2 (ne + m1)] = m1; |
| + |
| + while (1) |
| + { |
| + if (__glibc_unlikely (hs > end)) |
| + { |
| + end += __strnlen ((const char*)end + m1 + 1, 2048); |
| + if (hs > end) |
| + return NULL; |
| + } |
| + |
| + /* Skip past character pairs not in the needle. */ |
| + do |
| + { |
| + hs += m1; |
| + tmp = shift[hash2 (hs)]; |
| + } |
| + while (tmp == 0 && hs <= end); |
| + |
| + /* If the match is not at the end of the needle, shift to the end |
| + and continue until we match the hash of the needle end. */ |
| + hs -= tmp; |
| + if (tmp < m1) |
| + continue; |
| + |
| + /* Hash of the last 2 characters matches. If the needle is long, |
| + try to quickly filter out mismatches. */ |
| + if (m1 < 15 || memcmp (hs + offset, ne + offset, 8) == 0) |
| + { |
| + if (memcmp (hs, ne, m1) == 0) |
| + return (void *) hs; |
| + |
| + /* Adjust filter offset when it doesn't find the mismatch. */ |
| + offset = (offset >= 8 ? offset : m1) - 8; |
| + } |
| + |
| + /* Skip based on matching the hash of the needle end. */ |
| + hs += shift1; |
| + } |
| } |
| libc_hidden_builtin_def (strstr) |
| - |
| -#undef LONG_NEEDLE_THRESHOLD |