Blame SOURCES/ia-double-rep_movsb_threshold-erms.patch

190885
From aa1f037077a41b36dd477e6ca754e207b37d661a Mon Sep 17 00:00:00 2001
190885
From: "H.J. Lu" <hjl.tools@gmail.com>
190885
Date: Wed, 2 Mar 2022 16:27:24 -0800
190885
Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in
190885
 dl-cacheinfo.h
190885
190885
No bug.
190885
190885
This patch doubles the rep_movsb_threshold when using ERMS. Based on
190885
benchmarks the vector copy loop, especially now that it handles 4k
190885
aliasing, is better for these medium-range sizes.
190885
190885
On Skylake with ERMS:
190885
190885
Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
190885
4096,   0,      0,      0,      0.975
190885
4096,   0,      0,      1,      0.953
190885
4096,   12,     0,      0,      0.969
190885
4096,   12,     0,      1,      0.872
190885
4096,   44,     0,      0,      0.979
190885
4096,   44,     0,      1,      0.83
190885
4096,   0,      12,     0,      1.006
190885
4096,   0,      12,     1,      0.989
190885
4096,   0,      44,     0,      0.739
190885
4096,   0,      44,     1,      0.942
190885
4096,   12,     12,     0,      1.009
190885
4096,   12,     12,     1,      0.973
190885
4096,   44,     44,     0,      0.791
190885
4096,   44,     44,     1,      0.961
190885
4096,   2048,   0,      0,      0.978
190885
4096,   2048,   0,      1,      0.951
190885
4096,   2060,   0,      0,      0.986
190885
4096,   2060,   0,      1,      0.963
190885
4096,   2048,   12,     0,      0.971
190885
4096,   2048,   12,     1,      0.941
190885
4096,   2060,   12,     0,      0.977
190885
4096,   2060,   12,     1,      0.949
190885
8192,   0,      0,      0,      0.85
190885
8192,   0,      0,      1,      0.845
190885
8192,   13,     0,      0,      0.937
190885
8192,   13,     0,      1,      0.939
190885
8192,   45,     0,      0,      0.932
190885
8192,   45,     0,      1,      0.927
190885
8192,   0,      13,     0,      0.621
190885
8192,   0,      13,     1,      0.62
190885
8192,   0,      45,     0,      0.53
190885
8192,   0,      45,     1,      0.516
190885
8192,   13,     13,     0,      0.664
190885
8192,   13,     13,     1,      0.659
190885
8192,   45,     45,     0,      0.593
190885
8192,   45,     45,     1,      0.575
190885
8192,   2048,   0,      0,      0.854
190885
8192,   2048,   0,      1,      0.834
190885
8192,   2061,   0,      0,      0.863
190885
8192,   2061,   0,      1,      0.857
190885
8192,   2048,   13,     0,      0.63
190885
8192,   2048,   13,     1,      0.629
190885
8192,   2061,   13,     0,      0.627
190885
8192,   2061,   13,     1,      0.62
190885
190885
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
190885
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
190885
(cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)
190885
---
190885
 sysdeps/x86/cacheinfo.h      |  8 +++++---
190885
 sysdeps/x86/dl-tunables.list | 26 +++++++++++++++-----------
190885
 2 files changed, 20 insertions(+), 14 deletions(-)
190885
190885
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
190885
index cc3941d3..ac025e08 100644
190885
--- a/sysdeps/x86/cacheinfo.h
190885
+++ b/sysdeps/x86/cacheinfo.h
190885
@@ -411,18 +411,20 @@ init_cacheinfo (void)
190885
 
190885
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
190885
   unsigned int minimum_rep_movsb_threshold;
190885
-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
190885
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
190885
+     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
190885
+     threshold is 2048 * (VEC_SIZE / 16).  */
190885
   unsigned int rep_movsb_threshold;
190885
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
190885
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
190885
     {
190885
-      rep_movsb_threshold = 2048 * (64 / 16);
190885
+      rep_movsb_threshold = 4096 * (64 / 16);
190885
       minimum_rep_movsb_threshold = 64 * 8;
190885
     }
190885
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
190885
 				    AVX_Fast_Unaligned_Load))
190885
     {
190885
-      rep_movsb_threshold = 2048 * (32 / 16);
190885
+      rep_movsb_threshold = 4096 * (32 / 16);
190885
       minimum_rep_movsb_threshold = 32 * 8;
190885
     }
190885
   else
190885
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
190885
index 89bf2966..56c6834a 100644
190885
--- a/sysdeps/x86/dl-tunables.list
190885
+++ b/sysdeps/x86/dl-tunables.list
190885
@@ -32,17 +32,21 @@ glibc {
190885
     }
190885
     x86_rep_movsb_threshold {
190885
       type: SIZE_T
190885
-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
190885
-      # isn't faster on short data.  The memcpy micro benchmark in glibc
190885
-      # shows that 2KB is the approximate value above which REP MOVSB
190885
-      # becomes faster than SSE2 optimization on processors with Enhanced
190885
-      # REP MOVSB.  Since larger register size can move more data with a
190885
-      # single load and store, the threshold is higher with larger register
190885
-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
190885
-      # times of vector size and the default value is 2048 * (vector size
190885
-      # / 16), the default value and the minimum value must be updated at
190885
-      # run-time.  NB: Don't set the default value since we can't tell if
190885
-      # the tunable value is set by user or not [BZ #27069].
190885
+      # Since there is overhead to set up REP MOVSB operation, REP
190885
+      # MOVSB isn't faster on short data.  The memcpy micro benchmark
190885
+      # in glibc shows that 2KB is the approximate value above which
190885
+      # REP MOVSB becomes faster than SSE2 optimization on processors
190885
+      # with Enhanced REP MOVSB.  Since larger register size can move
190885
+      # more data with a single load and store, the threshold is
190885
+      # higher with larger register size.  Micro benchmarks show AVX
190885
+      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
190885
+      # threshold is extrapolated to 16KB.  For machines with FSRM the
190885
+      # threshold is universally set at 2112 bytes.  Note: Since the
190885
+      # REP MOVSB threshold must be greater than 8 times of vector
190885
+      # size and the default value is 4096 * (vector size / 16), the
190885
+      # default value and the minimum value must be updated at
190885
+      # run-time.  NB: Don't set the default value since we can't tell
190885
+      # if the tunable value is set by user or not [BZ #27069].
190885
       minval: 1
190885
     }
190885
     x86_rep_stosb_threshold {
190885
-- 
190885
GitLab
190885