|
|
513694 |
From 413e4abc92aeb12fb4c188aa53f0425ceac0ef15 Mon Sep 17 00:00:00 2001
|
|
|
513694 |
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
|
513694 |
Date: Sun, 6 Feb 2022 00:54:18 -0600
|
|
|
513694 |
Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S
|
|
|
513694 |
|
|
|
513694 |
No bug.
|
|
|
513694 |
|
|
|
513694 |
Split vec generation into multiple steps. This allows the
|
|
|
513694 |
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
|
|
|
513694 |
case. This saves an expensive lane-cross instruction and removes
|
|
|
513694 |
the need for 'vzeroupper'.
|
|
|
513694 |
|
|
|
513694 |
For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
|
|
|
513694 |
byte broadcast.
|
|
|
513694 |
|
|
|
513694 |
Results for memset-avx2 small (geomean of N = 20 benchset runs).
|
|
|
513694 |
|
|
|
513694 |
size, New Time, Old Time, New / Old
|
|
|
513694 |
0, 4.100, 3.831, 0.934
|
|
|
513694 |
1, 5.074, 4.399, 0.867
|
|
|
513694 |
2, 4.433, 4.411, 0.995
|
|
|
513694 |
4, 4.487, 4.415, 0.984
|
|
|
513694 |
8, 4.454, 4.396, 0.987
|
|
|
513694 |
16, 4.502, 4.443, 0.987
|
|
|
513694 |
|
|
|
513694 |
All relevant string/wcsmbs tests are passing.
|
|
|
513694 |
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
513694 |
|
|
|
513694 |
(cherry picked from commit b62ace2740a106222e124cc86956448fa07abf4d)
|
|
|
513694 |
---
|
|
|
513694 |
sysdeps/x86_64/memset.S | 21 ++-
|
|
|
513694 |
.../multiarch/memset-avx2-unaligned-erms.S | 18 +-
|
|
|
513694 |
.../multiarch/memset-avx512-unaligned-erms.S | 18 +-
|
|
|
513694 |
.../multiarch/memset-evex-unaligned-erms.S | 18 +-
|
|
|
513694 |
.../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++-------
|
|
|
513694 |
5 files changed, 152 insertions(+), 87 deletions(-)
|
|
|
513694 |
|
|
|
513694 |
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
|
|
|
513694 |
index 8672b030..27debd2b 100644
|
|
|
513694 |
--- a/sysdeps/x86_64/memset.S
|
|
|
513694 |
+++ b/sysdeps/x86_64/memset.S
|
|
|
513694 |
@@ -28,17 +28,22 @@
|
|
|
513694 |
#define VMOVU movups
|
|
|
513694 |
#define VMOVA movaps
|
|
|
513694 |
|
|
|
513694 |
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
movd d, %xmm0; \
|
|
|
513694 |
- movq r, %rax; \
|
|
|
513694 |
- punpcklbw %xmm0, %xmm0; \
|
|
|
513694 |
- punpcklwd %xmm0, %xmm0; \
|
|
|
513694 |
- pshufd $0, %xmm0, %xmm0
|
|
|
513694 |
+ pxor %xmm1, %xmm1; \
|
|
|
513694 |
+ pshufb %xmm1, %xmm0; \
|
|
|
513694 |
+ movq r, %rax
|
|
|
513694 |
|
|
|
513694 |
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
movd d, %xmm0; \
|
|
|
513694 |
- movq r, %rax; \
|
|
|
513694 |
- pshufd $0, %xmm0, %xmm0
|
|
|
513694 |
+ pshufd $0, %xmm0, %xmm0; \
|
|
|
513694 |
+ movq r, %rax
|
|
|
513694 |
+
|
|
|
513694 |
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
|
|
513694 |
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
|
|
513694 |
+
|
|
|
513694 |
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
|
|
513694 |
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
|
|
513694 |
|
|
|
513694 |
#define SECTION(p) p
|
|
|
513694 |
|
|
|
513694 |
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
|
|
513694 |
index 1af668af..c0bf2875 100644
|
|
|
513694 |
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
|
|
513694 |
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
|
|
513694 |
@@ -10,15 +10,18 @@
|
|
|
513694 |
# define VMOVU vmovdqu
|
|
|
513694 |
# define VMOVA vmovdqa
|
|
|
513694 |
|
|
|
513694 |
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
vmovd d, %xmm0; \
|
|
|
513694 |
- movq r, %rax; \
|
|
|
513694 |
- vpbroadcastb %xmm0, %ymm0
|
|
|
513694 |
+ movq r, %rax;
|
|
|
513694 |
|
|
|
513694 |
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
- vmovd d, %xmm0; \
|
|
|
513694 |
- movq r, %rax; \
|
|
|
513694 |
- vpbroadcastd %xmm0, %ymm0
|
|
|
513694 |
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
+ MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
|
|
|
513694 |
+
|
|
|
513694 |
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
|
|
|
513694 |
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
|
|
|
513694 |
+
|
|
|
513694 |
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
|
|
|
513694 |
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
|
|
|
513694 |
|
|
|
513694 |
# ifndef SECTION
|
|
|
513694 |
# define SECTION(p) p##.avx
|
|
|
513694 |
@@ -30,5 +33,6 @@
|
|
|
513694 |
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
|
|
|
513694 |
# endif
|
|
|
513694 |
|
|
|
513694 |
+# define USE_XMM_LESS_VEC
|
|
|
513694 |
# include "memset-vec-unaligned-erms.S"
|
|
|
513694 |
#endif
|
|
|
513694 |
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
|
513694 |
index f14d6f84..5241216a 100644
|
|
|
513694 |
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
|
513694 |
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
|
513694 |
@@ -15,13 +15,19 @@
|
|
|
513694 |
|
|
|
513694 |
# define VZEROUPPER
|
|
|
513694 |
|
|
|
513694 |
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
- movq r, %rax; \
|
|
|
513694 |
- vpbroadcastb d, %VEC0
|
|
|
513694 |
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
+ vpbroadcastb d, %VEC0; \
|
|
|
513694 |
+ movq r, %rax
|
|
|
513694 |
|
|
|
513694 |
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
- movq r, %rax; \
|
|
|
513694 |
- vpbroadcastd d, %VEC0
|
|
|
513694 |
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
+ vpbroadcastd d, %VEC0; \
|
|
|
513694 |
+ movq r, %rax
|
|
|
513694 |
+
|
|
|
513694 |
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
|
|
513694 |
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
|
|
513694 |
+
|
|
|
513694 |
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
|
|
513694 |
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
|
|
513694 |
|
|
|
513694 |
# define SECTION(p) p##.evex512
|
|
|
513694 |
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
|
|
|
513694 |
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
|
513694 |
index 64b09e77..63700215 100644
|
|
|
513694 |
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
|
513694 |
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
|
513694 |
@@ -15,13 +15,19 @@
|
|
|
513694 |
|
|
|
513694 |
# define VZEROUPPER
|
|
|
513694 |
|
|
|
513694 |
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
- movq r, %rax; \
|
|
|
513694 |
- vpbroadcastb d, %VEC0
|
|
|
513694 |
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
+ vpbroadcastb d, %VEC0; \
|
|
|
513694 |
+ movq r, %rax
|
|
|
513694 |
|
|
|
513694 |
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
- movq r, %rax; \
|
|
|
513694 |
- vpbroadcastd d, %VEC0
|
|
|
513694 |
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
|
513694 |
+ vpbroadcastd d, %VEC0; \
|
|
|
513694 |
+ movq r, %rax
|
|
|
513694 |
+
|
|
|
513694 |
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
|
|
513694 |
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
|
|
513694 |
+
|
|
|
513694 |
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
|
|
513694 |
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
|
|
513694 |
|
|
|
513694 |
# define SECTION(p) p##.evex
|
|
|
513694 |
# define MEMSET_SYMBOL(p,s) p##_evex_##s
|
|
|
513694 |
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
|
513694 |
index f08b7323..a67f9833 100644
|
|
|
513694 |
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
|
513694 |
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
|
513694 |
@@ -58,8 +58,10 @@
|
|
|
513694 |
#ifndef MOVQ
|
|
|
513694 |
# if VEC_SIZE > 16
|
|
|
513694 |
# define MOVQ vmovq
|
|
|
513694 |
+# define MOVD vmovd
|
|
|
513694 |
# else
|
|
|
513694 |
# define MOVQ movq
|
|
|
513694 |
+# define MOVD movd
|
|
|
513694 |
# endif
|
|
|
513694 |
#endif
|
|
|
513694 |
|
|
|
513694 |
@@ -72,9 +74,17 @@
|
|
|
513694 |
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
|
|
513694 |
# define END_REG rcx
|
|
|
513694 |
# define LOOP_REG rdi
|
|
|
513694 |
+# define LESS_VEC_REG rax
|
|
|
513694 |
#else
|
|
|
513694 |
# define END_REG rdi
|
|
|
513694 |
# define LOOP_REG rdx
|
|
|
513694 |
+# define LESS_VEC_REG rdi
|
|
|
513694 |
+#endif
|
|
|
513694 |
+
|
|
|
513694 |
+#ifdef USE_XMM_LESS_VEC
|
|
|
513694 |
+# define XMM_SMALL 1
|
|
|
513694 |
+#else
|
|
|
513694 |
+# define XMM_SMALL 0
|
|
|
513694 |
#endif
|
|
|
513694 |
|
|
|
513694 |
#define PAGE_SIZE 4096
|
|
|
513694 |
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
|
|
513694 |
|
|
|
513694 |
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
|
513694 |
shl $2, %RDX_LP
|
|
|
513694 |
- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
|
513694 |
- jmp L(entry_from_bzero)
|
|
|
513694 |
+ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
|
513694 |
+ WMEMSET_VDUP_TO_VEC0_LOW()
|
|
|
513694 |
+ cmpq $VEC_SIZE, %rdx
|
|
|
513694 |
+ jb L(less_vec_no_vdup)
|
|
|
513694 |
+ WMEMSET_VDUP_TO_VEC0_HIGH()
|
|
|
513694 |
+ jmp L(entry_from_wmemset)
|
|
|
513694 |
END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
|
513694 |
#endif
|
|
|
513694 |
|
|
|
513694 |
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
|
|
513694 |
#endif
|
|
|
513694 |
|
|
|
513694 |
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
|
|
513694 |
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
|
513694 |
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
|
513694 |
# ifdef __ILP32__
|
|
|
513694 |
/* Clear the upper 32 bits. */
|
|
|
513694 |
mov %edx, %edx
|
|
|
513694 |
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
|
|
513694 |
L(entry_from_bzero):
|
|
|
513694 |
cmpq $VEC_SIZE, %rdx
|
|
|
513694 |
jb L(less_vec)
|
|
|
513694 |
+ MEMSET_VDUP_TO_VEC0_HIGH()
|
|
|
513694 |
+L(entry_from_wmemset):
|
|
|
513694 |
cmpq $(VEC_SIZE * 2), %rdx
|
|
|
513694 |
ja L(more_2x_vec)
|
|
|
513694 |
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
|
513694 |
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
|
|
513694 |
# endif
|
|
|
513694 |
|
|
|
513694 |
ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
|
|
|
513694 |
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
|
513694 |
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
|
513694 |
# ifdef __ILP32__
|
|
|
513694 |
/* Clear the upper 32 bits. */
|
|
|
513694 |
mov %edx, %edx
|
|
|
513694 |
# endif
|
|
|
513694 |
cmp $VEC_SIZE, %RDX_LP
|
|
|
513694 |
jb L(less_vec)
|
|
|
513694 |
+ MEMSET_VDUP_TO_VEC0_HIGH ()
|
|
|
513694 |
cmp $(VEC_SIZE * 2), %RDX_LP
|
|
|
513694 |
ja L(stosb_more_2x_vec)
|
|
|
513694 |
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
|
|
|
513694 |
- */
|
|
|
513694 |
- VMOVU %VEC(0), (%rax)
|
|
|
513694 |
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
|
|
|
513694 |
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
|
513694 |
+ VMOVU %VEC(0), (%rdi)
|
|
|
513694 |
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
|
|
513694 |
VZEROUPPER_RETURN
|
|
|
513694 |
#endif
|
|
|
513694 |
|
|
|
513694 |
- .p2align 4,, 10
|
|
|
513694 |
+ .p2align 4,, 4
|
|
|
513694 |
L(last_2x_vec):
|
|
|
513694 |
#ifdef USE_LESS_VEC_MASK_STORE
|
|
|
513694 |
- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
|
|
|
513694 |
- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
|
|
|
513694 |
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
|
|
|
513694 |
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
|
|
513694 |
#else
|
|
|
513694 |
VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
|
|
|
513694 |
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
|
|
|
513694 |
@@ -212,6 +228,7 @@ L(last_2x_vec):
|
|
|
513694 |
#ifdef USE_LESS_VEC_MASK_STORE
|
|
|
513694 |
.p2align 4,, 10
|
|
|
513694 |
L(less_vec):
|
|
|
513694 |
+L(less_vec_no_vdup):
|
|
|
513694 |
/* Less than 1 VEC. */
|
|
|
513694 |
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
|
|
513694 |
# error Unsupported VEC_SIZE!
|
|
|
513694 |
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
|
|
|
513694 |
/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
|
|
|
513694 |
and (4x, 8x] jump to target. */
|
|
|
513694 |
L(more_2x_vec):
|
|
|
513694 |
-
|
|
|
513694 |
- /* Two different methods of setting up pointers / compare. The
|
|
|
513694 |
- two methods are based on the fact that EVEX/AVX512 mov
|
|
|
513694 |
- instructions take more bytes then AVX2/SSE2 mov instructions. As
|
|
|
513694 |
- well that EVEX/AVX512 machines also have fast LEA_BID. Both
|
|
|
513694 |
- setup and END_REG to avoid complex address mode. For EVEX/AVX512
|
|
|
513694 |
- this saves code size and keeps a few targets in one fetch block.
|
|
|
513694 |
- For AVX2/SSE2 this helps prevent AGU bottlenecks. */
|
|
|
513694 |
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
|
|
513694 |
- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
|
|
|
513694 |
- LOOP_4X_OFFSET) with LEA_BID. */
|
|
|
513694 |
-
|
|
|
513694 |
- /* END_REG is rcx for EVEX/AVX512. */
|
|
|
513694 |
- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
|
|
513694 |
-#endif
|
|
|
513694 |
-
|
|
|
513694 |
- /* Stores to first 2x VEC before cmp as any path forward will
|
|
|
513694 |
- require it. */
|
|
|
513694 |
- VMOVU %VEC(0), (%rax)
|
|
|
513694 |
- VMOVU %VEC(0), VEC_SIZE(%rax)
|
|
|
513694 |
+ /* Store first 2x vec regardless. */
|
|
|
513694 |
+ VMOVU %VEC(0), (%rdi)
|
|
|
513694 |
+ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
|
|
|
513694 |
|
|
|
513694 |
|
|
|
513694 |
+ /* Two different methods of setting up pointers / compare. The two
|
|
|
513694 |
+ methods are based on the fact that EVEX/AVX512 mov instructions take
|
|
|
513694 |
+ more bytes than AVX2/SSE2 mov instructions. As well that EVEX/AVX512
|
|
|
513694 |
+ machines also have fast LEA_BID. Both set up END_REG to avoid complex
|
|
|
513694 |
+ address mode. For EVEX/AVX512 this saves code size and keeps a few
|
|
|
513694 |
+ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
|
|
|
513694 |
+ bottlenecks. */
|
|
|
513694 |
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
|
|
|
513694 |
/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
|
|
|
513694 |
addq %rdx, %END_REG
|
|
|
513694 |
@@ -292,6 +299,15 @@ L(more_2x_vec):
|
|
|
513694 |
cmpq $(VEC_SIZE * 4), %rdx
|
|
|
513694 |
jbe L(last_2x_vec)
|
|
|
513694 |
|
|
|
513694 |
+
|
|
|
513694 |
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
|
|
513694 |
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
|
|
|
513694 |
+ LEA_BID. */
|
|
|
513694 |
+
|
|
|
513694 |
+ /* END_REG is rcx for EVEX/AVX512. */
|
|
|
513694 |
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
|
|
513694 |
+#endif
|
|
|
513694 |
+
|
|
|
513694 |
/* Store next 2x vec regardless. */
|
|
|
513694 |
VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
|
|
|
513694 |
VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
|
|
|
513694 |
@@ -355,65 +371,93 @@ L(stosb_local):
|
|
|
513694 |
/* Define L(less_vec) only if not otherwise defined. */
|
|
|
513694 |
.p2align 4
|
|
|
513694 |
L(less_vec):
|
|
|
513694 |
+ /* Broadcast esi to partial register (i.e. VEC_SIZE == 32 broadcast to
|
|
|
513694 |
+ xmm). This only does anything for AVX2. */
|
|
|
513694 |
+ MEMSET_VDUP_TO_VEC0_LOW ()
|
|
|
513694 |
+L(less_vec_no_vdup):
|
|
|
513694 |
#endif
|
|
|
513694 |
L(cross_page):
|
|
|
513694 |
#if VEC_SIZE > 32
|
|
|
513694 |
cmpl $32, %edx
|
|
|
513694 |
- jae L(between_32_63)
|
|
|
513694 |
+ jge L(between_32_63)
|
|
|
513694 |
#endif
|
|
|
513694 |
#if VEC_SIZE > 16
|
|
|
513694 |
cmpl $16, %edx
|
|
|
513694 |
- jae L(between_16_31)
|
|
|
513694 |
+ jge L(between_16_31)
|
|
|
513694 |
+#endif
|
|
|
513694 |
+#ifndef USE_XMM_LESS_VEC
|
|
|
513694 |
+ MOVQ %XMM0, %rcx
|
|
|
513694 |
#endif
|
|
|
513694 |
- MOVQ %XMM0, %rdi
|
|
|
513694 |
cmpl $8, %edx
|
|
|
513694 |
- jae L(between_8_15)
|
|
|
513694 |
+ jge L(between_8_15)
|
|
|
513694 |
cmpl $4, %edx
|
|
|
513694 |
- jae L(between_4_7)
|
|
|
513694 |
+ jge L(between_4_7)
|
|
|
513694 |
cmpl $1, %edx
|
|
|
513694 |
- ja L(between_2_3)
|
|
|
513694 |
- jb L(return)
|
|
|
513694 |
- movb %sil, (%rax)
|
|
|
513694 |
- VZEROUPPER_RETURN
|
|
|
513694 |
+ jg L(between_2_3)
|
|
|
513694 |
+ jl L(between_0_0)
|
|
|
513694 |
+ movb %sil, (%LESS_VEC_REG)
|
|
|
513694 |
+L(between_0_0):
|
|
|
513694 |
+ ret
|
|
|
513694 |
|
|
|
513694 |
- /* Align small targets only if not doing so would cross a fetch
|
|
|
513694 |
- line. */
|
|
|
513694 |
+ /* Align small targets only if not doing so would cross a fetch line.
|
|
|
513694 |
+ */
|
|
|
513694 |
#if VEC_SIZE > 32
|
|
|
513694 |
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
|
|
513694 |
/* From 32 to 63. No branch when size == 32. */
|
|
|
513694 |
L(between_32_63):
|
|
|
513694 |
- VMOVU %YMM0, (%rax)
|
|
|
513694 |
- VMOVU %YMM0, -32(%rax, %rdx)
|
|
|
513694 |
+ VMOVU %YMM0, (%LESS_VEC_REG)
|
|
|
513694 |
+ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
|
|
|
513694 |
VZEROUPPER_RETURN
|
|
|
513694 |
#endif
|
|
|
513694 |
|
|
|
513694 |
#if VEC_SIZE >= 32
|
|
|
513694 |
- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
|
|
513694 |
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
|
|
|
513694 |
L(between_16_31):
|
|
|
513694 |
/* From 16 to 31. No branch when size == 16. */
|
|
|
513694 |
- VMOVU %XMM0, (%rax)
|
|
|
513694 |
- VMOVU %XMM0, -16(%rax, %rdx)
|
|
|
513694 |
- VZEROUPPER_RETURN
|
|
|
513694 |
+ VMOVU %XMM0, (%LESS_VEC_REG)
|
|
|
513694 |
+ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
|
|
|
513694 |
+ ret
|
|
|
513694 |
#endif
|
|
|
513694 |
|
|
|
513694 |
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
|
|
513694 |
+ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
|
|
|
513694 |
+ */
|
|
|
513694 |
+ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
|
|
|
513694 |
L(between_8_15):
|
|
|
513694 |
/* From 8 to 15. No branch when size == 8. */
|
|
|
513694 |
- movq %rdi, (%rax)
|
|
|
513694 |
- movq %rdi, -8(%rax, %rdx)
|
|
|
513694 |
- VZEROUPPER_RETURN
|
|
|
513694 |
+#ifdef USE_XMM_LESS_VEC
|
|
|
513694 |
+ MOVQ %XMM0, (%rdi)
|
|
|
513694 |
+ MOVQ %XMM0, -8(%rdi, %rdx)
|
|
|
513694 |
+#else
|
|
|
513694 |
+ movq %rcx, (%LESS_VEC_REG)
|
|
|
513694 |
+ movq %rcx, -8(%LESS_VEC_REG, %rdx)
|
|
|
513694 |
+#endif
|
|
|
513694 |
+ ret
|
|
|
513694 |
|
|
|
513694 |
- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
|
|
|
513694 |
+ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
|
|
|
513694 |
+ */
|
|
|
513694 |
+ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
|
|
|
513694 |
L(between_4_7):
|
|
|
513694 |
/* From 4 to 7. No branch when size == 4. */
|
|
|
513694 |
- movl %edi, (%rax)
|
|
|
513694 |
- movl %edi, -4(%rax, %rdx)
|
|
|
513694 |
- VZEROUPPER_RETURN
|
|
|
513694 |
+#ifdef USE_XMM_LESS_VEC
|
|
|
513694 |
+ MOVD %XMM0, (%rdi)
|
|
|
513694 |
+ MOVD %XMM0, -4(%rdi, %rdx)
|
|
|
513694 |
+#else
|
|
|
513694 |
+ movl %ecx, (%LESS_VEC_REG)
|
|
|
513694 |
+ movl %ecx, -4(%LESS_VEC_REG, %rdx)
|
|
|
513694 |
+#endif
|
|
|
513694 |
+ ret
|
|
|
513694 |
|
|
|
513694 |
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
|
|
513694 |
+ /* 4 * XMM_SMALL for the third mov for AVX2. */
|
|
|
513694 |
+ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
|
|
|
513694 |
L(between_2_3):
|
|
|
513694 |
/* From 2 to 3. No branch when size == 2. */
|
|
|
513694 |
- movw %di, (%rax)
|
|
|
513694 |
- movb %dil, -1(%rax, %rdx)
|
|
|
513694 |
- VZEROUPPER_RETURN
|
|
|
513694 |
+#ifdef USE_XMM_LESS_VEC
|
|
|
513694 |
+ movb %sil, (%rdi)
|
|
|
513694 |
+ movb %sil, 1(%rdi)
|
|
|
513694 |
+ movb %sil, -1(%rdi, %rdx)
|
|
|
513694 |
+#else
|
|
|
513694 |
+ movw %cx, (%LESS_VEC_REG)
|
|
|
513694 |
+ movb %sil, -1(%LESS_VEC_REG, %rdx)
|
|
|
513694 |
+#endif
|
|
|
513694 |
+ ret
|
|
|
513694 |
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|
|
|
513694 |
--
|
|
|
513694 |
GitLab
|
|
|
513694 |
|