08c3a6
commit cecbac52123456e2fbcff062a4165bf7b9174797
08c3a6
Author: Noah Goldstein <goldstein.w.n@gmail.com>
08c3a6
Date:   Mon Nov 1 00:49:52 2021 -0500
08c3a6
08c3a6
    x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
08c3a6
    
08c3a6
    No bug.
08c3a6
    
08c3a6
    This patch doubles the rep_movsb_threshold when using ERMS. Based on
08c3a6
    benchmarks the vector copy loop, especially now that it handles 4k
08c3a6
    aliasing, is better for these medium ranged.
08c3a6
    
08c3a6
    On Skylake with ERMS:
08c3a6
    
08c3a6
    Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
08c3a6
    4096,   0,      0,      0,      0.975
08c3a6
    4096,   0,      0,      1,      0.953
08c3a6
    4096,   12,     0,      0,      0.969
08c3a6
    4096,   12,     0,      1,      0.872
08c3a6
    4096,   44,     0,      0,      0.979
08c3a6
    4096,   44,     0,      1,      0.83
08c3a6
    4096,   0,      12,     0,      1.006
08c3a6
    4096,   0,      12,     1,      0.989
08c3a6
    4096,   0,      44,     0,      0.739
08c3a6
    4096,   0,      44,     1,      0.942
08c3a6
    4096,   12,     12,     0,      1.009
08c3a6
    4096,   12,     12,     1,      0.973
08c3a6
    4096,   44,     44,     0,      0.791
08c3a6
    4096,   44,     44,     1,      0.961
08c3a6
    4096,   2048,   0,      0,      0.978
08c3a6
    4096,   2048,   0,      1,      0.951
08c3a6
    4096,   2060,   0,      0,      0.986
08c3a6
    4096,   2060,   0,      1,      0.963
08c3a6
    4096,   2048,   12,     0,      0.971
08c3a6
    4096,   2048,   12,     1,      0.941
08c3a6
    4096,   2060,   12,     0,      0.977
08c3a6
    4096,   2060,   12,     1,      0.949
08c3a6
    8192,   0,      0,      0,      0.85
08c3a6
    8192,   0,      0,      1,      0.845
08c3a6
    8192,   13,     0,      0,      0.937
08c3a6
    8192,   13,     0,      1,      0.939
08c3a6
    8192,   45,     0,      0,      0.932
08c3a6
    8192,   45,     0,      1,      0.927
08c3a6
    8192,   0,      13,     0,      0.621
08c3a6
    8192,   0,      13,     1,      0.62
08c3a6
    8192,   0,      45,     0,      0.53
08c3a6
    8192,   0,      45,     1,      0.516
08c3a6
    8192,   13,     13,     0,      0.664
08c3a6
    8192,   13,     13,     1,      0.659
08c3a6
    8192,   45,     45,     0,      0.593
08c3a6
    8192,   45,     45,     1,      0.575
08c3a6
    8192,   2048,   0,      0,      0.854
08c3a6
    8192,   2048,   0,      1,      0.834
08c3a6
    8192,   2061,   0,      0,      0.863
08c3a6
    8192,   2061,   0,      1,      0.857
08c3a6
    8192,   2048,   13,     0,      0.63
08c3a6
    8192,   2048,   13,     1,      0.629
08c3a6
    8192,   2061,   13,     0,      0.627
08c3a6
    8192,   2061,   13,     1,      0.62
08c3a6
    
08c3a6
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
08c3a6
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
08c3a6
    (cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)
08c3a6
08c3a6
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
08c3a6
index e6c94dfd023a25dc..2e43e67e4f4037d3 100644
08c3a6
--- a/sysdeps/x86/dl-cacheinfo.h
08c3a6
+++ b/sysdeps/x86/dl-cacheinfo.h
08c3a6
@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
08c3a6
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
08c3a6
   unsigned int minimum_rep_movsb_threshold;
08c3a6
 #endif
08c3a6
-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
08c3a6
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
08c3a6
+     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
08c3a6
+     threshold is 2048 * (VEC_SIZE / 16).  */
08c3a6
   unsigned int rep_movsb_threshold;
08c3a6
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
08c3a6
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
08c3a6
     {
08c3a6
-      rep_movsb_threshold = 2048 * (64 / 16);
08c3a6
+      rep_movsb_threshold = 4096 * (64 / 16);
08c3a6
 #if HAVE_TUNABLES
08c3a6
       minimum_rep_movsb_threshold = 64 * 8;
08c3a6
 #endif
08c3a6
@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
08c3a6
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
08c3a6
 				    AVX_Fast_Unaligned_Load))
08c3a6
     {
08c3a6
-      rep_movsb_threshold = 2048 * (32 / 16);
08c3a6
+      rep_movsb_threshold = 4096 * (32 / 16);
08c3a6
 #if HAVE_TUNABLES
08c3a6
       minimum_rep_movsb_threshold = 32 * 8;
08c3a6
 #endif
08c3a6
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
08c3a6
index dd6e1d65c9490d4f..419313804d49cf65 100644
08c3a6
--- a/sysdeps/x86/dl-tunables.list
08c3a6
+++ b/sysdeps/x86/dl-tunables.list
08c3a6
@@ -32,17 +32,21 @@ glibc {
08c3a6
     }
08c3a6
     x86_rep_movsb_threshold {
08c3a6
       type: SIZE_T
08c3a6
-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
08c3a6
-      # isn't faster on short data.  The memcpy micro benchmark in glibc
08c3a6
-      # shows that 2KB is the approximate value above which REP MOVSB
08c3a6
-      # becomes faster than SSE2 optimization on processors with Enhanced
08c3a6
-      # REP MOVSB.  Since larger register size can move more data with a
08c3a6
-      # single load and store, the threshold is higher with larger register
08c3a6
-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
08c3a6
-      # times of vector size and the default value is 2048 * (vector size
08c3a6
-      # / 16), the default value and the minimum value must be updated at
08c3a6
-      # run-time.  NB: Don't set the default value since we can't tell if
08c3a6
-      # the tunable value is set by user or not [BZ #27069].
08c3a6
+      # Since there is overhead to set up REP MOVSB operation, REP
08c3a6
+      # MOVSB isn't faster on short data.  The memcpy micro benchmark
08c3a6
+      # in glibc shows that 2KB is the approximate value above which
08c3a6
+      # REP MOVSB becomes faster than SSE2 optimization on processors
08c3a6
+      # with Enhanced REP MOVSB.  Since larger register size can move
08c3a6
+      # more data with a single load and store, the threshold is
08c3a6
+      # higher with larger register size.  Micro benchmarks show AVX
08c3a6
+      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
08c3a6
+      # threshold is extrapolated to 16KB.  For machines with FSRM the
08c3a6
+      # threshold is universally set at 2112 bytes.  Note: Since the
08c3a6
+      # REP MOVSB threshold must be greater than 8 times of vector
08c3a6
+      # size and the default value is 4096 * (vector size / 16), the
08c3a6
+      # default value and the minimum value must be updated at
08c3a6
+      # run-time.  NB: Don't set the default value since we can't tell
08c3a6
+      # if the tunable value is set by user or not [BZ #27069].
08c3a6
       minval: 1
08c3a6
     }
08c3a6
     x86_rep_stosb_threshold {