076f82
commit cecbac52123456e2fbcff062a4165bf7b9174797
076f82
Author: Noah Goldstein <goldstein.w.n@gmail.com>
076f82
Date:   Mon Nov 1 00:49:52 2021 -0500
076f82
076f82
    x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
076f82
    
076f82
    No bug.
076f82
    
076f82
    This patch doubles the rep_movsb_threshold when using ERMS. Based on
076f82
    benchmarks the vector copy loop, especially now that it handles 4k
076f82
    aliasing, is better for these medium-range sizes.
076f82
    
076f82
    On Skylake with ERMS:
076f82
    
076f82
    Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
076f82
    4096,   0,      0,      0,      0.975
076f82
    4096,   0,      0,      1,      0.953
076f82
    4096,   12,     0,      0,      0.969
076f82
    4096,   12,     0,      1,      0.872
076f82
    4096,   44,     0,      0,      0.979
076f82
    4096,   44,     0,      1,      0.83
076f82
    4096,   0,      12,     0,      1.006
076f82
    4096,   0,      12,     1,      0.989
076f82
    4096,   0,      44,     0,      0.739
076f82
    4096,   0,      44,     1,      0.942
076f82
    4096,   12,     12,     0,      1.009
076f82
    4096,   12,     12,     1,      0.973
076f82
    4096,   44,     44,     0,      0.791
076f82
    4096,   44,     44,     1,      0.961
076f82
    4096,   2048,   0,      0,      0.978
076f82
    4096,   2048,   0,      1,      0.951
076f82
    4096,   2060,   0,      0,      0.986
076f82
    4096,   2060,   0,      1,      0.963
076f82
    4096,   2048,   12,     0,      0.971
076f82
    4096,   2048,   12,     1,      0.941
076f82
    4096,   2060,   12,     0,      0.977
076f82
    4096,   2060,   12,     1,      0.949
076f82
    8192,   0,      0,      0,      0.85
076f82
    8192,   0,      0,      1,      0.845
076f82
    8192,   13,     0,      0,      0.937
076f82
    8192,   13,     0,      1,      0.939
076f82
    8192,   45,     0,      0,      0.932
076f82
    8192,   45,     0,      1,      0.927
076f82
    8192,   0,      13,     0,      0.621
076f82
    8192,   0,      13,     1,      0.62
076f82
    8192,   0,      45,     0,      0.53
076f82
    8192,   0,      45,     1,      0.516
076f82
    8192,   13,     13,     0,      0.664
076f82
    8192,   13,     13,     1,      0.659
076f82
    8192,   45,     45,     0,      0.593
076f82
    8192,   45,     45,     1,      0.575
076f82
    8192,   2048,   0,      0,      0.854
076f82
    8192,   2048,   0,      1,      0.834
076f82
    8192,   2061,   0,      0,      0.863
076f82
    8192,   2061,   0,      1,      0.857
076f82
    8192,   2048,   13,     0,      0.63
076f82
    8192,   2048,   13,     1,      0.629
076f82
    8192,   2061,   13,     0,      0.627
076f82
    8192,   2061,   13,     1,      0.62
076f82
    
076f82
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
076f82
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
076f82
    (cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)
076f82
076f82
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
076f82
index e6c94dfd023a25dc..2e43e67e4f4037d3 100644
076f82
--- a/sysdeps/x86/dl-cacheinfo.h
076f82
+++ b/sysdeps/x86/dl-cacheinfo.h
076f82
@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
076f82
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
076f82
   unsigned int minimum_rep_movsb_threshold;
076f82
 #endif
076f82
-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
076f82
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
076f82
+     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
076f82
+     threshold is 2048 * (VEC_SIZE / 16).  */
076f82
   unsigned int rep_movsb_threshold;
076f82
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
076f82
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
076f82
     {
076f82
-      rep_movsb_threshold = 2048 * (64 / 16);
076f82
+      rep_movsb_threshold = 4096 * (64 / 16);
076f82
 #if HAVE_TUNABLES
076f82
       minimum_rep_movsb_threshold = 64 * 8;
076f82
 #endif
076f82
@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
076f82
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
076f82
 				    AVX_Fast_Unaligned_Load))
076f82
     {
076f82
-      rep_movsb_threshold = 2048 * (32 / 16);
076f82
+      rep_movsb_threshold = 4096 * (32 / 16);
076f82
 #if HAVE_TUNABLES
076f82
       minimum_rep_movsb_threshold = 32 * 8;
076f82
 #endif
076f82
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
076f82
index dd6e1d65c9490d4f..419313804d49cf65 100644
076f82
--- a/sysdeps/x86/dl-tunables.list
076f82
+++ b/sysdeps/x86/dl-tunables.list
076f82
@@ -32,17 +32,21 @@ glibc {
076f82
     }
076f82
     x86_rep_movsb_threshold {
076f82
       type: SIZE_T
076f82
-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
076f82
-      # isn't faster on short data.  The memcpy micro benchmark in glibc
076f82
-      # shows that 2KB is the approximate value above which REP MOVSB
076f82
-      # becomes faster than SSE2 optimization on processors with Enhanced
076f82
-      # REP MOVSB.  Since larger register size can move more data with a
076f82
-      # single load and store, the threshold is higher with larger register
076f82
-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
076f82
-      # times of vector size and the default value is 2048 * (vector size
076f82
-      # / 16), the default value and the minimum value must be updated at
076f82
-      # run-time.  NB: Don't set the default value since we can't tell if
076f82
-      # the tunable value is set by user or not [BZ #27069].
076f82
+      # Since there is overhead to set up REP MOVSB operation, REP
076f82
+      # MOVSB isn't faster on short data.  The memcpy micro benchmark
076f82
+      # in glibc shows that 2KB is the approximate value above which
076f82
+      # REP MOVSB becomes faster than SSE2 optimization on processors
076f82
+      # with Enhanced REP MOVSB.  Since larger register size can move
076f82
+      # more data with a single load and store, the threshold is
076f82
+      # higher with larger register size.  Micro benchmarks show AVX
076f82
+      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
076f82
+      # threshold is extrapolated to 16KB.  For machines with FSRM the
076f82
+      # threshold is universally set at 2112 bytes.  Note: Since the
076f82
+      # REP MOVSB threshold must be greater than 8 times of vector
076f82
+      # size and the default value is 4096 * (vector size / 16), the
076f82
+      # default value and the minimum value must be updated at
076f82
+      # run-time.  NB: Don't set the default value since we can't tell
076f82
+      # if the tunable value is set by user or not [BZ #27069].
076f82
       minval: 1
076f82
     }
076f82
     x86_rep_stosb_threshold {