|
|
190885 |
From d8d5c44ed7636fdd2b736e152f8207ca063da386 Mon Sep 17 00:00:00 2001
|
|
|
190885 |
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
|
190885 |
Date: Wed, 9 Jun 2021 16:25:32 -0400
|
|
|
190885 |
Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
|
|
|
190885 |
#27974]
|
|
|
190885 |
|
|
|
190885 |
This commit fixes the bug mentioned in the previous commit.
|
|
|
190885 |
|
|
|
190885 |
The previous implementations of wmemchr in these files relied
|
|
|
190885 |
on n * sizeof(wchar_t) which was not guaranteed by the standard.
|
|
|
190885 |
|
|
|
190885 |
The new overflow tests added in the previous commit now
|
|
|
190885 |
pass (as well as all the other tests).
|
|
|
190885 |
|
|
|
190885 |
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
|
190885 |
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
190885 |
(cherry picked from commit 645a158978f9520e74074e8c14047503be4db0f0)
|
|
|
190885 |
---
|
|
|
190885 |
sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++-------
|
|
|
190885 |
sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
|
|
|
190885 |
2 files changed, 98 insertions(+), 37 deletions(-)
|
|
|
190885 |
|
|
|
190885 |
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
|
|
|
190885 |
index cb320257..24f9a0c5 100644
|
|
|
190885 |
--- a/sysdeps/x86_64/memchr.S
|
|
|
190885 |
+++ b/sysdeps/x86_64/memchr.S
|
|
|
190885 |
@@ -21,9 +21,11 @@
|
|
|
190885 |
#ifdef USE_AS_WMEMCHR
|
|
|
190885 |
# define MEMCHR wmemchr
|
|
|
190885 |
# define PCMPEQ pcmpeqd
|
|
|
190885 |
+# define CHAR_PER_VEC 4
|
|
|
190885 |
#else
|
|
|
190885 |
# define MEMCHR memchr
|
|
|
190885 |
# define PCMPEQ pcmpeqb
|
|
|
190885 |
+# define CHAR_PER_VEC 16
|
|
|
190885 |
#endif
|
|
|
190885 |
|
|
|
190885 |
/* fast SSE2 version with using pmaxub and 64 byte loop */
|
|
|
190885 |
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
|
|
|
190885 |
movd %esi, %xmm1
|
|
|
190885 |
mov %edi, %ecx
|
|
|
190885 |
|
|
|
190885 |
+#ifdef __ILP32__
|
|
|
190885 |
+ /* Clear the upper 32 bits. */
|
|
|
190885 |
+ movl %edx, %edx
|
|
|
190885 |
+#endif
|
|
|
190885 |
#ifdef USE_AS_WMEMCHR
|
|
|
190885 |
test %RDX_LP, %RDX_LP
|
|
|
190885 |
jz L(return_null)
|
|
|
190885 |
- shl $2, %RDX_LP
|
|
|
190885 |
#else
|
|
|
190885 |
-# ifdef __ILP32__
|
|
|
190885 |
- /* Clear the upper 32 bits. */
|
|
|
190885 |
- movl %edx, %edx
|
|
|
190885 |
-# endif
|
|
|
190885 |
punpcklbw %xmm1, %xmm1
|
|
|
190885 |
test %RDX_LP, %RDX_LP
|
|
|
190885 |
jz L(return_null)
|
|
|
190885 |
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
|
|
|
190885 |
test %eax, %eax
|
|
|
190885 |
|
|
|
190885 |
jnz L(matches_1)
|
|
|
190885 |
- sub $16, %rdx
|
|
|
190885 |
+ sub $CHAR_PER_VEC, %rdx
|
|
|
190885 |
jbe L(return_null)
|
|
|
190885 |
add $16, %rdi
|
|
|
190885 |
and $15, %ecx
|
|
|
190885 |
and $-16, %rdi
|
|
|
190885 |
+#ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ shr $2, %ecx
|
|
|
190885 |
+#endif
|
|
|
190885 |
add %rcx, %rdx
|
|
|
190885 |
- sub $64, %rdx
|
|
|
190885 |
+ sub $(CHAR_PER_VEC * 4), %rdx
|
|
|
190885 |
jbe L(exit_loop)
|
|
|
190885 |
jmp L(loop_prolog)
|
|
|
190885 |
|
|
|
190885 |
@@ -77,16 +81,21 @@ L(crosscache):
|
|
|
190885 |
movdqa (%rdi), %xmm0
|
|
|
190885 |
|
|
|
190885 |
PCMPEQ %xmm1, %xmm0
|
|
|
190885 |
-/* Check if there is a match. */
|
|
|
190885 |
+ /* Check if there is a match. */
|
|
|
190885 |
pmovmskb %xmm0, %eax
|
|
|
190885 |
-/* Remove the leading bytes. */
|
|
|
190885 |
+ /* Remove the leading bytes. */
|
|
|
190885 |
sar %cl, %eax
|
|
|
190885 |
test %eax, %eax
|
|
|
190885 |
je L(unaligned_no_match)
|
|
|
190885 |
-/* Check which byte is a match. */
|
|
|
190885 |
+ /* Check which byte is a match. */
|
|
|
190885 |
bsf %eax, %eax
|
|
|
190885 |
-
|
|
|
190885 |
+#ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ mov %eax, %esi
|
|
|
190885 |
+ shr $2, %esi
|
|
|
190885 |
+ sub %rsi, %rdx
|
|
|
190885 |
+#else
|
|
|
190885 |
sub %rax, %rdx
|
|
|
190885 |
+#endif
|
|
|
190885 |
jbe L(return_null)
|
|
|
190885 |
add %rdi, %rax
|
|
|
190885 |
add %rcx, %rax
|
|
|
190885 |
@@ -94,15 +103,18 @@ L(crosscache):
|
|
|
190885 |
|
|
|
190885 |
.p2align 4
|
|
|
190885 |
L(unaligned_no_match):
|
|
|
190885 |
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
|
|
|
190885 |
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
|
|
|
190885 |
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
|
|
|
190885 |
possible addition overflow. */
|
|
|
190885 |
neg %rcx
|
|
|
190885 |
add $16, %rcx
|
|
|
190885 |
+#ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ shr $2, %ecx
|
|
|
190885 |
+#endif
|
|
|
190885 |
sub %rcx, %rdx
|
|
|
190885 |
jbe L(return_null)
|
|
|
190885 |
add $16, %rdi
|
|
|
190885 |
- sub $64, %rdx
|
|
|
190885 |
+ sub $(CHAR_PER_VEC * 4), %rdx
|
|
|
190885 |
jbe L(exit_loop)
|
|
|
190885 |
|
|
|
190885 |
.p2align 4
|
|
|
190885 |
@@ -135,7 +147,7 @@ L(loop_prolog):
|
|
|
190885 |
test $0x3f, %rdi
|
|
|
190885 |
jz L(align64_loop)
|
|
|
190885 |
|
|
|
190885 |
- sub $64, %rdx
|
|
|
190885 |
+ sub $(CHAR_PER_VEC * 4), %rdx
|
|
|
190885 |
jbe L(exit_loop)
|
|
|
190885 |
|
|
|
190885 |
movdqa (%rdi), %xmm0
|
|
|
190885 |
@@ -167,11 +179,14 @@ L(loop_prolog):
|
|
|
190885 |
mov %rdi, %rcx
|
|
|
190885 |
and $-64, %rdi
|
|
|
190885 |
and $63, %ecx
|
|
|
190885 |
+#ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ shr $2, %ecx
|
|
|
190885 |
+#endif
|
|
|
190885 |
add %rcx, %rdx
|
|
|
190885 |
|
|
|
190885 |
.p2align 4
|
|
|
190885 |
L(align64_loop):
|
|
|
190885 |
- sub $64, %rdx
|
|
|
190885 |
+ sub $(CHAR_PER_VEC * 4), %rdx
|
|
|
190885 |
jbe L(exit_loop)
|
|
|
190885 |
movdqa (%rdi), %xmm0
|
|
|
190885 |
movdqa 16(%rdi), %xmm2
|
|
|
190885 |
@@ -218,7 +233,7 @@ L(align64_loop):
|
|
|
190885 |
|
|
|
190885 |
.p2align 4
|
|
|
190885 |
L(exit_loop):
|
|
|
190885 |
- add $32, %edx
|
|
|
190885 |
+ add $(CHAR_PER_VEC * 2), %edx
|
|
|
190885 |
jle L(exit_loop_32)
|
|
|
190885 |
|
|
|
190885 |
movdqa (%rdi), %xmm0
|
|
|
190885 |
@@ -238,7 +253,7 @@ L(exit_loop):
|
|
|
190885 |
pmovmskb %xmm3, %eax
|
|
|
190885 |
test %eax, %eax
|
|
|
190885 |
jnz L(matches32_1)
|
|
|
190885 |
- sub $16, %edx
|
|
|
190885 |
+ sub $CHAR_PER_VEC, %edx
|
|
|
190885 |
jle L(return_null)
|
|
|
190885 |
|
|
|
190885 |
PCMPEQ 48(%rdi), %xmm1
|
|
|
190885 |
@@ -250,13 +265,13 @@ L(exit_loop):
|
|
|
190885 |
|
|
|
190885 |
.p2align 4
|
|
|
190885 |
L(exit_loop_32):
|
|
|
190885 |
- add $32, %edx
|
|
|
190885 |
+ add $(CHAR_PER_VEC * 2), %edx
|
|
|
190885 |
movdqa (%rdi), %xmm0
|
|
|
190885 |
PCMPEQ %xmm1, %xmm0
|
|
|
190885 |
pmovmskb %xmm0, %eax
|
|
|
190885 |
test %eax, %eax
|
|
|
190885 |
jnz L(matches_1)
|
|
|
190885 |
- sub $16, %edx
|
|
|
190885 |
+ sub $CHAR_PER_VEC, %edx
|
|
|
190885 |
jbe L(return_null)
|
|
|
190885 |
|
|
|
190885 |
PCMPEQ 16(%rdi), %xmm1
|
|
|
190885 |
@@ -293,7 +308,13 @@ L(matches32):
|
|
|
190885 |
.p2align 4
|
|
|
190885 |
L(matches_1):
|
|
|
190885 |
bsf %eax, %eax
|
|
|
190885 |
+#ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ mov %eax, %esi
|
|
|
190885 |
+ shr $2, %esi
|
|
|
190885 |
+ sub %rsi, %rdx
|
|
|
190885 |
+#else
|
|
|
190885 |
sub %rax, %rdx
|
|
|
190885 |
+#endif
|
|
|
190885 |
jbe L(return_null)
|
|
|
190885 |
add %rdi, %rax
|
|
|
190885 |
ret
|
|
|
190885 |
@@ -301,7 +322,13 @@ L(matches_1):
|
|
|
190885 |
.p2align 4
|
|
|
190885 |
L(matches16_1):
|
|
|
190885 |
bsf %eax, %eax
|
|
|
190885 |
+#ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ mov %eax, %esi
|
|
|
190885 |
+ shr $2, %esi
|
|
|
190885 |
+ sub %rsi, %rdx
|
|
|
190885 |
+#else
|
|
|
190885 |
sub %rax, %rdx
|
|
|
190885 |
+#endif
|
|
|
190885 |
jbe L(return_null)
|
|
|
190885 |
lea 16(%rdi, %rax), %rax
|
|
|
190885 |
ret
|
|
|
190885 |
@@ -309,7 +336,13 @@ L(matches16_1):
|
|
|
190885 |
.p2align 4
|
|
|
190885 |
L(matches32_1):
|
|
|
190885 |
bsf %eax, %eax
|
|
|
190885 |
+#ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ mov %eax, %esi
|
|
|
190885 |
+ shr $2, %esi
|
|
|
190885 |
+ sub %rsi, %rdx
|
|
|
190885 |
+#else
|
|
|
190885 |
sub %rax, %rdx
|
|
|
190885 |
+#endif
|
|
|
190885 |
jbe L(return_null)
|
|
|
190885 |
lea 32(%rdi, %rax), %rax
|
|
|
190885 |
ret
|
|
|
190885 |
@@ -317,7 +350,13 @@ L(matches32_1):
|
|
|
190885 |
.p2align 4
|
|
|
190885 |
L(matches48_1):
|
|
|
190885 |
bsf %eax, %eax
|
|
|
190885 |
+#ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ mov %eax, %esi
|
|
|
190885 |
+ shr $2, %esi
|
|
|
190885 |
+ sub %rsi, %rdx
|
|
|
190885 |
+#else
|
|
|
190885 |
sub %rax, %rdx
|
|
|
190885 |
+#endif
|
|
|
190885 |
jbe L(return_null)
|
|
|
190885 |
lea 48(%rdi, %rax), %rax
|
|
|
190885 |
ret
|
|
|
190885 |
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
|
190885 |
index b377f22e..16027abb 100644
|
|
|
190885 |
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
|
190885 |
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
|
|
190885 |
@@ -54,21 +54,19 @@
|
|
|
190885 |
|
|
|
190885 |
# define VEC_SIZE 32
|
|
|
190885 |
# define PAGE_SIZE 4096
|
|
|
190885 |
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
|
190885 |
|
|
|
190885 |
.section SECTION(.text),"ax",@progbits
|
|
|
190885 |
ENTRY (MEMCHR)
|
|
|
190885 |
# ifndef USE_AS_RAWMEMCHR
|
|
|
190885 |
/* Check for zero length. */
|
|
|
190885 |
- test %RDX_LP, %RDX_LP
|
|
|
190885 |
- jz L(null)
|
|
|
190885 |
-# endif
|
|
|
190885 |
-# ifdef USE_AS_WMEMCHR
|
|
|
190885 |
- shl $2, %RDX_LP
|
|
|
190885 |
-# else
|
|
|
190885 |
# ifdef __ILP32__
|
|
|
190885 |
- /* Clear the upper 32 bits. */
|
|
|
190885 |
- movl %edx, %edx
|
|
|
190885 |
+ /* Clear upper bits. */
|
|
|
190885 |
+ and %RDX_LP, %RDX_LP
|
|
|
190885 |
+# else
|
|
|
190885 |
+ test %RDX_LP, %RDX_LP
|
|
|
190885 |
# endif
|
|
|
190885 |
+ jz L(null)
|
|
|
190885 |
# endif
|
|
|
190885 |
/* Broadcast CHAR to YMMMATCH. */
|
|
|
190885 |
vmovd %esi, %xmm0
|
|
|
190885 |
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
|
|
|
190885 |
vpmovmskb %ymm1, %eax
|
|
|
190885 |
# ifndef USE_AS_RAWMEMCHR
|
|
|
190885 |
/* If length < CHAR_PER_VEC handle special. */
|
|
|
190885 |
- cmpq $VEC_SIZE, %rdx
|
|
|
190885 |
+ cmpq $CHAR_PER_VEC, %rdx
|
|
|
190885 |
jbe L(first_vec_x0)
|
|
|
190885 |
# endif
|
|
|
190885 |
testl %eax, %eax
|
|
|
190885 |
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
|
|
|
190885 |
L(first_vec_x0):
|
|
|
190885 |
/* Check if first match was before length. */
|
|
|
190885 |
tzcntl %eax, %eax
|
|
|
190885 |
+# ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ /* NB: Multiply length by 4 to get byte count. */
|
|
|
190885 |
+ sall $2, %edx
|
|
|
190885 |
+# endif
|
|
|
190885 |
xorl %ecx, %ecx
|
|
|
190885 |
cmpl %eax, %edx
|
|
|
190885 |
leaq (%rdi, %rax), %rax
|
|
|
190885 |
@@ -110,12 +112,12 @@ L(null):
|
|
|
190885 |
# endif
|
|
|
190885 |
.p2align 4
|
|
|
190885 |
L(cross_page_boundary):
|
|
|
190885 |
- /* Save pointer before aligning as its original value is necessary
|
|
|
190885 |
- for computer return address if byte is found or adjusting length
|
|
|
190885 |
- if it is not and this is memchr. */
|
|
|
190885 |
+ /* Save pointer before aligning as its original value is
|
|
|
190885 |
+ necessary for computer return address if byte is found or
|
|
|
190885 |
+ adjusting length if it is not and this is memchr. */
|
|
|
190885 |
movq %rdi, %rcx
|
|
|
190885 |
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
|
|
|
190885 |
- rdi for rawmemchr. */
|
|
|
190885 |
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
|
|
|
190885 |
+ and rdi for rawmemchr. */
|
|
|
190885 |
orq $(VEC_SIZE - 1), %ALGN_PTR_REG
|
|
|
190885 |
VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
|
|
|
190885 |
vpmovmskb %ymm1, %eax
|
|
|
190885 |
@@ -124,6 +126,10 @@ L(cross_page_boundary):
|
|
|
190885 |
match). */
|
|
|
190885 |
leaq 1(%ALGN_PTR_REG), %rsi
|
|
|
190885 |
subq %RRAW_PTR_REG, %rsi
|
|
|
190885 |
+# ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
|
|
190885 |
+ shrl $2, %esi
|
|
|
190885 |
+# endif
|
|
|
190885 |
# endif
|
|
|
190885 |
/* Remove the leading bytes. */
|
|
|
190885 |
sarxl %ERAW_PTR_REG, %eax, %eax
|
|
|
190885 |
@@ -181,6 +187,10 @@ L(cross_page_continue):
|
|
|
190885 |
orq $(VEC_SIZE - 1), %rdi
|
|
|
190885 |
/* esi is for adjusting length to see if near the end. */
|
|
|
190885 |
leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
|
|
|
190885 |
+# ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
|
190885 |
+ sarl $2, %esi
|
|
|
190885 |
+# endif
|
|
|
190885 |
# else
|
|
|
190885 |
orq $(VEC_SIZE - 1), %rdi
|
|
|
190885 |
L(cross_page_continue):
|
|
|
190885 |
@@ -213,7 +223,7 @@ L(cross_page_continue):
|
|
|
190885 |
|
|
|
190885 |
# ifndef USE_AS_RAWMEMCHR
|
|
|
190885 |
/* Check if at last VEC_SIZE * 4 length. */
|
|
|
190885 |
- subq $(VEC_SIZE * 4), %rdx
|
|
|
190885 |
+ subq $(CHAR_PER_VEC * 4), %rdx
|
|
|
190885 |
jbe L(last_4x_vec_or_less_cmpeq)
|
|
|
190885 |
/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
|
|
|
190885 |
length. */
|
|
|
190885 |
@@ -221,6 +231,10 @@ L(cross_page_continue):
|
|
|
190885 |
movl %edi, %ecx
|
|
|
190885 |
orq $(VEC_SIZE * 4 - 1), %rdi
|
|
|
190885 |
andl $(VEC_SIZE * 4 - 1), %ecx
|
|
|
190885 |
+# ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
|
|
190885 |
+ sarl $2, %ecx
|
|
|
190885 |
+# endif
|
|
|
190885 |
addq %rcx, %rdx
|
|
|
190885 |
# else
|
|
|
190885 |
/* Align data to VEC_SIZE * 4 - 1 for loop. */
|
|
|
190885 |
@@ -250,15 +264,19 @@ L(loop_4x_vec):
|
|
|
190885 |
|
|
|
190885 |
subq $-(VEC_SIZE * 4), %rdi
|
|
|
190885 |
|
|
|
190885 |
- subq $(VEC_SIZE * 4), %rdx
|
|
|
190885 |
+ subq $(CHAR_PER_VEC * 4), %rdx
|
|
|
190885 |
ja L(loop_4x_vec)
|
|
|
190885 |
|
|
|
190885 |
- /* Fall through into less than 4 remaining vectors of length case.
|
|
|
190885 |
- */
|
|
|
190885 |
+ /* Fall through into less than 4 remaining vectors of length
|
|
|
190885 |
+ case. */
|
|
|
190885 |
VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
|
|
|
190885 |
vpmovmskb %ymm1, %eax
|
|
|
190885 |
.p2align 4
|
|
|
190885 |
L(last_4x_vec_or_less):
|
|
|
190885 |
+# ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ /* NB: Multiply length by 4 to get byte count. */
|
|
|
190885 |
+ sall $2, %edx
|
|
|
190885 |
+# endif
|
|
|
190885 |
/* Check if first VEC contained match. */
|
|
|
190885 |
testl %eax, %eax
|
|
|
190885 |
jnz L(first_vec_x1_check)
|
|
|
190885 |
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
|
|
|
190885 |
L(last_4x_vec_or_less_cmpeq):
|
|
|
190885 |
VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
|
|
|
190885 |
vpmovmskb %ymm1, %eax
|
|
|
190885 |
+# ifdef USE_AS_WMEMCHR
|
|
|
190885 |
+ /* NB: Multiply length by 4 to get byte count. */
|
|
|
190885 |
+ sall $2, %edx
|
|
|
190885 |
+# endif
|
|
|
190885 |
subq $-(VEC_SIZE * 4), %rdi
|
|
|
190885 |
/* Check first VEC regardless. */
|
|
|
190885 |
testl %eax, %eax
|
|
|
190885 |
--
|
|
|
190885 |
GitLab
|
|
|
190885 |
|