|
|
190885 |
From aa1f037077a41b36dd477e6ca754e207b37d661a Mon Sep 17 00:00:00 2001
|
|
|
190885 |
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
|
190885 |
Date: Wed, 2 Mar 2022 16:27:24 -0800
|
|
|
190885 |
Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in
|
|
|
190885 |
dl-cacheinfo.h
|
|
|
190885 |
|
|
|
190885 |
No bug.
|
|
|
190885 |
|
|
|
190885 |
This patch doubles the rep_movsb_threshold when using ERMS. Based on
|
|
|
190885 |
benchmarks the vector copy loop, especially now that it handles 4k
|
|
|
190885 |
aliasing, is better for these medium-range sizes.
|
|
|
190885 |
|
|
|
190885 |
On Skylake with ERMS:
|
|
|
190885 |
|
|
|
190885 |
Size, Align1, Align2, dst>src,(rep movsb) / (vec copy)
|
|
|
190885 |
4096, 0, 0, 0, 0.975
|
|
|
190885 |
4096, 0, 0, 1, 0.953
|
|
|
190885 |
4096, 12, 0, 0, 0.969
|
|
|
190885 |
4096, 12, 0, 1, 0.872
|
|
|
190885 |
4096, 44, 0, 0, 0.979
|
|
|
190885 |
4096, 44, 0, 1, 0.83
|
|
|
190885 |
4096, 0, 12, 0, 1.006
|
|
|
190885 |
4096, 0, 12, 1, 0.989
|
|
|
190885 |
4096, 0, 44, 0, 0.739
|
|
|
190885 |
4096, 0, 44, 1, 0.942
|
|
|
190885 |
4096, 12, 12, 0, 1.009
|
|
|
190885 |
4096, 12, 12, 1, 0.973
|
|
|
190885 |
4096, 44, 44, 0, 0.791
|
|
|
190885 |
4096, 44, 44, 1, 0.961
|
|
|
190885 |
4096, 2048, 0, 0, 0.978
|
|
|
190885 |
4096, 2048, 0, 1, 0.951
|
|
|
190885 |
4096, 2060, 0, 0, 0.986
|
|
|
190885 |
4096, 2060, 0, 1, 0.963
|
|
|
190885 |
4096, 2048, 12, 0, 0.971
|
|
|
190885 |
4096, 2048, 12, 1, 0.941
|
|
|
190885 |
4096, 2060, 12, 0, 0.977
|
|
|
190885 |
4096, 2060, 12, 1, 0.949
|
|
|
190885 |
8192, 0, 0, 0, 0.85
|
|
|
190885 |
8192, 0, 0, 1, 0.845
|
|
|
190885 |
8192, 13, 0, 0, 0.937
|
|
|
190885 |
8192, 13, 0, 1, 0.939
|
|
|
190885 |
8192, 45, 0, 0, 0.932
|
|
|
190885 |
8192, 45, 0, 1, 0.927
|
|
|
190885 |
8192, 0, 13, 0, 0.621
|
|
|
190885 |
8192, 0, 13, 1, 0.62
|
|
|
190885 |
8192, 0, 45, 0, 0.53
|
|
|
190885 |
8192, 0, 45, 1, 0.516
|
|
|
190885 |
8192, 13, 13, 0, 0.664
|
|
|
190885 |
8192, 13, 13, 1, 0.659
|
|
|
190885 |
8192, 45, 45, 0, 0.593
|
|
|
190885 |
8192, 45, 45, 1, 0.575
|
|
|
190885 |
8192, 2048, 0, 0, 0.854
|
|
|
190885 |
8192, 2048, 0, 1, 0.834
|
|
|
190885 |
8192, 2061, 0, 0, 0.863
|
|
|
190885 |
8192, 2061, 0, 1, 0.857
|
|
|
190885 |
8192, 2048, 13, 0, 0.63
|
|
|
190885 |
8192, 2048, 13, 1, 0.629
|
|
|
190885 |
8192, 2061, 13, 0, 0.627
|
|
|
190885 |
8192, 2061, 13, 1, 0.62
|
|
|
190885 |
|
|
|
190885 |
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
|
190885 |
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
190885 |
(cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)
|
|
|
190885 |
---
|
|
|
190885 |
sysdeps/x86/cacheinfo.h | 8 +++++---
|
|
|
190885 |
sysdeps/x86/dl-tunables.list | 26 +++++++++++++++-----------
|
|
|
190885 |
2 files changed, 20 insertions(+), 14 deletions(-)
|
|
|
190885 |
|
|
|
190885 |
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
|
|
|
190885 |
index cc3941d3..ac025e08 100644
|
|
|
190885 |
--- a/sysdeps/x86/cacheinfo.h
|
|
|
190885 |
+++ b/sysdeps/x86/cacheinfo.h
|
|
|
190885 |
@@ -411,18 +411,20 @@ init_cacheinfo (void)
|
|
|
190885 |
|
|
|
190885 |
/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
|
|
|
190885 |
unsigned int minimum_rep_movsb_threshold;
|
|
|
190885 |
- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */
|
|
|
190885 |
+ /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
|
|
|
190885 |
+ VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
|
|
|
190885 |
+ threshold is 2048 * (VEC_SIZE / 16). */
|
|
|
190885 |
unsigned int rep_movsb_threshold;
|
|
|
190885 |
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
|
|
190885 |
&& !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
|
|
|
190885 |
{
|
|
|
190885 |
- rep_movsb_threshold = 2048 * (64 / 16);
|
|
|
190885 |
+ rep_movsb_threshold = 4096 * (64 / 16);
|
|
|
190885 |
minimum_rep_movsb_threshold = 64 * 8;
|
|
|
190885 |
}
|
|
|
190885 |
else if (CPU_FEATURE_PREFERRED_P (cpu_features,
|
|
|
190885 |
AVX_Fast_Unaligned_Load))
|
|
|
190885 |
{
|
|
|
190885 |
- rep_movsb_threshold = 2048 * (32 / 16);
|
|
|
190885 |
+ rep_movsb_threshold = 4096 * (32 / 16);
|
|
|
190885 |
minimum_rep_movsb_threshold = 32 * 8;
|
|
|
190885 |
}
|
|
|
190885 |
else
|
|
|
190885 |
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
|
|
|
190885 |
index 89bf2966..56c6834a 100644
|
|
|
190885 |
--- a/sysdeps/x86/dl-tunables.list
|
|
|
190885 |
+++ b/sysdeps/x86/dl-tunables.list
|
|
|
190885 |
@@ -32,17 +32,21 @@ glibc {
|
|
|
190885 |
}
|
|
|
190885 |
x86_rep_movsb_threshold {
|
|
|
190885 |
type: SIZE_T
|
|
|
190885 |
- # Since there is overhead to set up REP MOVSB operation, REP MOVSB
|
|
|
190885 |
- # isn't faster on short data. The memcpy micro benchmark in glibc
|
|
|
190885 |
- # shows that 2KB is the approximate value above which REP MOVSB
|
|
|
190885 |
- # becomes faster than SSE2 optimization on processors with Enhanced
|
|
|
190885 |
- # REP MOVSB. Since larger register size can move more data with a
|
|
|
190885 |
- # single load and store, the threshold is higher with larger register
|
|
|
190885 |
- # size. Note: Since the REP MOVSB threshold must be greater than 8
|
|
|
190885 |
- # times of vector size and the default value is 2048 * (vector size
|
|
|
190885 |
- # / 16), the default value and the minimum value must be updated at
|
|
|
190885 |
- # run-time. NB: Don't set the default value since we can't tell if
|
|
|
190885 |
- # the tunable value is set by user or not [BZ #27069].
|
|
|
190885 |
+ # Since there is overhead to set up REP MOVSB operation, REP
|
|
|
190885 |
+ # MOVSB isn't faster on short data. The memcpy micro benchmark
|
|
|
190885 |
+ # in glibc shows that 2KB is the approximate value above which
|
|
|
190885 |
+ # REP MOVSB becomes faster than SSE2 optimization on processors
|
|
|
190885 |
+ # with Enhanced REP MOVSB. Since larger register size can move
|
|
|
190885 |
+ # more data with a single load and store, the threshold is
|
|
|
190885 |
+ # higher with larger register size. Micro benchmarks show AVX
|
|
|
190885 |
+ # REP MOVSB becomes faster approximately at 8KB. The AVX512
|
|
|
190885 |
+ # threshold is extrapolated to 16KB. For machines with FSRM the
|
|
|
190885 |
+ # threshold is universally set at 2112 bytes. Note: Since the
|
|
|
190885 |
+ # REP MOVSB threshold must be greater than 8 times of vector
|
|
|
190885 |
+ # size and the default value is 4096 * (vector size / 16), the
|
|
|
190885 |
+ # default value and the minimum value must be updated at
|
|
|
190885 |
+ # run-time. NB: Don't set the default value since we can't tell
|
|
|
190885 |
+ # if the tunable value is set by user or not [BZ #27069].
|
|
|
190885 |
minval: 1
|
|
|
190885 |
}
|
|
|
190885 |
x86_rep_stosb_threshold {
|
|
|
190885 |
--
|
|
|
190885 |
GitLab
|
|
|
190885 |
|