| commit cecbac52123456e2fbcff062a4165bf7b9174797 |
| Author: Noah Goldstein <goldstein.w.n@gmail.com> |
| Date: Mon Nov 1 00:49:52 2021 -0500 |
| |
| x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h |
| |
| No bug. |
| |
| This patch doubles the rep_movsb_threshold when using ERMS. Based on |
| benchmarks the vector copy loop, especially now that it handles 4k |
| aliasing, is better for these medium ranged. |
| |
| On Skylake with ERMS: |
| |
| Size, Align1, Align2, dst>src,(rep movsb) / (vec copy) |
| 4096, 0, 0, 0, 0.975 |
| 4096, 0, 0, 1, 0.953 |
| 4096, 12, 0, 0, 0.969 |
| 4096, 12, 0, 1, 0.872 |
| 4096, 44, 0, 0, 0.979 |
| 4096, 44, 0, 1, 0.83 |
| 4096, 0, 12, 0, 1.006 |
| 4096, 0, 12, 1, 0.989 |
| 4096, 0, 44, 0, 0.739 |
| 4096, 0, 44, 1, 0.942 |
| 4096, 12, 12, 0, 1.009 |
| 4096, 12, 12, 1, 0.973 |
| 4096, 44, 44, 0, 0.791 |
| 4096, 44, 44, 1, 0.961 |
| 4096, 2048, 0, 0, 0.978 |
| 4096, 2048, 0, 1, 0.951 |
| 4096, 2060, 0, 0, 0.986 |
| 4096, 2060, 0, 1, 0.963 |
| 4096, 2048, 12, 0, 0.971 |
| 4096, 2048, 12, 1, 0.941 |
| 4096, 2060, 12, 0, 0.977 |
| 4096, 2060, 12, 1, 0.949 |
| 8192, 0, 0, 0, 0.85 |
| 8192, 0, 0, 1, 0.845 |
| 8192, 13, 0, 0, 0.937 |
| 8192, 13, 0, 1, 0.939 |
| 8192, 45, 0, 0, 0.932 |
| 8192, 45, 0, 1, 0.927 |
| 8192, 0, 13, 0, 0.621 |
| 8192, 0, 13, 1, 0.62 |
| 8192, 0, 45, 0, 0.53 |
| 8192, 0, 45, 1, 0.516 |
| 8192, 13, 13, 0, 0.664 |
| 8192, 13, 13, 1, 0.659 |
| 8192, 45, 45, 0, 0.593 |
| 8192, 45, 45, 1, 0.575 |
| 8192, 2048, 0, 0, 0.854 |
| 8192, 2048, 0, 1, 0.834 |
| 8192, 2061, 0, 0, 0.863 |
| 8192, 2061, 0, 1, 0.857 |
| 8192, 2048, 13, 0, 0.63 |
| 8192, 2048, 13, 1, 0.629 |
| 8192, 2061, 13, 0, 0.627 |
| 8192, 2061, 13, 1, 0.62 |
| |
| Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> |
| Reviewed-by: H.J. Lu <hjl.tools@gmail.com> |
| (cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3) |
| |
| diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h |
| index e6c94dfd023a25dc..2e43e67e4f4037d3 100644 |
| |
| |
| @@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) |
| /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ |
| unsigned int minimum_rep_movsb_threshold; |
| #endif |
| - /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */ |
| + /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for |
| + VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB |
| + threshold is 2048 * (VEC_SIZE / 16). */ |
| unsigned int rep_movsb_threshold; |
| if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) |
| && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) |
| { |
| - rep_movsb_threshold = 2048 * (64 / 16); |
| + rep_movsb_threshold = 4096 * (64 / 16); |
| #if HAVE_TUNABLES |
| minimum_rep_movsb_threshold = 64 * 8; |
| #endif |
| @@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) |
| else if (CPU_FEATURE_PREFERRED_P (cpu_features, |
| AVX_Fast_Unaligned_Load)) |
| { |
| - rep_movsb_threshold = 2048 * (32 / 16); |
| + rep_movsb_threshold = 4096 * (32 / 16); |
| #if HAVE_TUNABLES |
| minimum_rep_movsb_threshold = 32 * 8; |
| #endif |
| diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list |
| index dd6e1d65c9490d4f..419313804d49cf65 100644 |
| |
| |
| @@ -32,17 +32,21 @@ glibc { |
| } |
| x86_rep_movsb_threshold { |
| type: SIZE_T |
| - # Since there is overhead to set up REP MOVSB operation, REP MOVSB |
| - # isn't faster on short data. The memcpy micro benchmark in glibc |
| - # shows that 2KB is the approximate value above which REP MOVSB |
| - # becomes faster than SSE2 optimization on processors with Enhanced |
| - # REP MOVSB. Since larger register size can move more data with a |
| - # single load and store, the threshold is higher with larger register |
| - # size. Note: Since the REP MOVSB threshold must be greater than 8 |
| - # times of vector size and the default value is 2048 * (vector size |
| - # / 16), the default value and the minimum value must be updated at |
| - # run-time. NB: Don't set the default value since we can't tell if |
| - # the tunable value is set by user or not [BZ #27069]. |
| + # Since there is overhead to set up REP MOVSB operation, REP |
| + # MOVSB isn't faster on short data. The memcpy micro benchmark |
| + # in glibc shows that 2KB is the approximate value above which |
| + # REP MOVSB becomes faster than SSE2 optimization on processors |
| + # with Enhanced REP MOVSB. Since larger register size can move |
| + # more data with a single load and store, the threshold is |
| + # higher with larger register size. Micro benchmarks show AVX |
| + # REP MOVSB becomes faster apprximately at 8KB. The AVX512 |
| + # threshold is extrapolated to 16KB. For machines with FSRM the |
| + # threshold is universally set at 2112 bytes. Note: Since the |
| + # REP MOVSB threshold must be greater than 8 times of vector |
| + # size and the default value is 4096 * (vector size / 16), the |
| + # default value and the minimum value must be updated at |
| + # run-time. NB: Don't set the default value since we can't tell |
| + # if the tunable value is set by user or not [BZ #27069]. |
| minval: 1 |
| } |
| x86_rep_stosb_threshold { |