From 869a7106bfc301aa021a77a9bcede85eddd17da1 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 2 Mar 2022 15:33:52 -0800
Subject: [PATCH] x86-64: Add Avoid_Short_Distance_REP_MOVSB

commit 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat Jan 25 14:19:40 2020 -0800

    x86-64: Avoid rep movsb with short distance [BZ #27130]

introduced some regressions on Intel processors without Fast Short REP
MOV (FSRM).  Add Avoid_Short_Distance_REP_MOVSB to avoid rep movsb with
short distance only on Intel processors with FSRM.  bench-memmove-large
on Skylake server shows that the cycles of __memmove_evex_unaligned_erms
improve for the following data sizes:

                                  before    after    Improvement
length=4127, align1=3, align2=0:  479.38    349.25      27%
length=4223, align1=9, align2=5:  405.62    333.25      18%
length=8223, align1=3, align2=0:  786.12    496.38      37%
length=8319, align1=9, align2=5:  727.50    501.38      31%
length=16415, align1=3, align2=0: 1436.88   840.00      41%
length=16511, align1=9, align2=5: 1375.50   836.38      39%
length=32799, align1=3, align2=0: 2890.00   1860.12     36%
length=32895, align1=9, align2=5: 2891.38   1931.88     33%

(cherry picked from commit 91cc803d27bda34919717b496b53cf279e44a922)
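As a side note on how the pieces below fit together, the startup plumbing
this patch adds can be sketched roughly in C.  This is only an
illustration: init_string_control and cpu_has_fsrm are hypothetical
stand-ins for the real init_cpu_features/init_cacheinfo logic and
CPU_FEATURES_CPU_P (cpu_features, FSRM), not glibc code.

    /* Sketch of the initialization added by this patch: the FSRM CPU
       feature turns on the Avoid_Short_Distance_REP_MOVSB preference,
       which in turn sets a bit in __x86_string_control that the string
       routines test at run time.  */
    #define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB  (1 << 0)

    int __x86_string_control;     /* consulted by memmove, see last hunk */

    void
    init_string_control (int cpu_has_fsrm)   /* hypothetical helper */
    {
      /* Only Intel processors with FSRM get the new preference bit.  */
      if (cpu_has_fsrm)
        __x86_string_control
          |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
    }

With this split, the short-distance fallback is applied only on the FSRM
processors where the commit message reports it helps, and is skipped on
the processors where the unconditional check had caused regressions.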
---
 sysdeps/x86/cacheinfo.h                                  | 9 +++++++++
 sysdeps/x86/cpu-features.c                               | 5 +++++
 .../include/cpu-features-preferred_feature_index_1.def   | 1 +
 sysdeps/x86/sysdep.h                                     | 5 +++++
 sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S    | 5 +++++
 5 files changed, 25 insertions(+)

diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index b982982f..f72f634a 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -48,6 +48,11 @@ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
 /* Threshold to stop using Enhanced REP MOVSB.  */
 long int __x86_rep_movsb_stop_threshold attribute_hidden;
 
+/* A bit-wise OR of string/memory requirements for optimal performance
+   e.g. X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB.  These bits
+   are used at runtime to tune implementation behavior.  */
+int __x86_string_control attribute_hidden;
+
 static void
 get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
 		       long int core)
@@ -435,6 +440,10 @@ init_cacheinfo (void)
   if (cpu_features->basic.kind != arch_kind_amd)
     __x86_rep_movsb_stop_threshold = __x86_shared_non_temporal_threshold;
 
+  if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_Short_Distance_REP_MOVSB))
+    __x86_string_control
+      |= X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB;
+
 # if HAVE_TUNABLES
   __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
 # endif
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 4889f062..8885b48e 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -580,6 +580,11 @@ init_cpu_features (struct cpu_features *cpu_features)
 	      &= ~bit_arch_AVX_Fast_Unaligned_Load;
 	  }
 	}
+
+      /* Avoid short distance REP MOVSB on processors with FSRM.  */
+      if (CPU_FEATURES_CPU_P (cpu_features, FSRM))
+	cpu_features->preferred[index_arch_Avoid_Short_Distance_REP_MOVSB]
+	  |= bit_arch_Avoid_Short_Distance_REP_MOVSB;
     }
   /* This spells out "CentaurHauls" or " Shanghai ".  */
   else if ((ebx == 0x746e6543 && ecx == 0x736c7561 && edx == 0x48727561)
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 4ca70b40..f2340624 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@ BIT (Prefer_FSRM)
 BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
 BIT (Prefer_AVX2_STRCMP)
+BIT (Avoid_Short_Distance_REP_MOVSB)
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index f41f4ebd..01bac0f6 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -57,6 +57,11 @@ enum cf_protection_level
 #define STATE_SAVE_MASK \
   ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
 
+/* Constants for bits in __x86_string_control:  */
+
+/* Avoid short distance REP MOVSB.  */
+#define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB	(1 << 0)
+
 #ifdef	__ASSEMBLER__
 
 /* Syntactic details of assembler.  */
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 620ce3a8..0469bf99 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -325,12 +325,16 @@ L(movsb):
 	/* Avoid slow backward REP MOVSB.  */
 	jb	L(more_8x_vec_backward)
 # if AVOID_SHORT_DISTANCE_REP_MOVSB
+	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	3f
 	movq	%rdi, %rcx
 	subq	%rsi, %rcx
 	jmp	2f
 # endif
 1:
 # if AVOID_SHORT_DISTANCE_REP_MOVSB
+	andl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
+	jz	3f
 	movq	%rsi, %rcx
 	subq	%rdi, %rcx
 2:
@@ -338,6 +342,7 @@ L(movsb):
 	   is N*4GB + [1..63] with N >= 0.  */
 	cmpl	$63, %ecx
 	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
+3:
 # endif
 	mov	%RDX_LP, %RCX_LP
 	rep movsb
-- 
GitLab
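For readers following the assembly in the last hunk, the run-time decision
it implements can be approximated in C.  This is a minimal sketch under
the same constant name; use_rep_movsb is a hypothetical helper, and it
only models the short-distance check that this patch gates, not the size
thresholds that select the ERMS path in the first place.

    #include <stdint.h>

    #define X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB  (1 << 0)
    extern int __x86_string_control;

    /* Decide whether to fall through to "rep movsb" once the ERMS copy
       path has already been chosen (illustration only).  */
    int
    use_rep_movsb (uintptr_t dst, uintptr_t src)
    {
      if (!(__x86_string_control
            & X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB))
        return 1;   /* No FSRM: keep the pre-patch behavior.  */

      /* The assembly keeps only the low 32 bits of the distance (ECX),
         so "rep movsb" is avoided when the distance between source and
         destination is N*4GB + [1..63] with N >= 0, matching
         "cmpl $63, %ecx; jbe L(more_2x_vec)" above.  */
      uint32_t distance = (uint32_t) (dst > src ? dst - src : src - dst);
      return distance > 63;
    }

The 63-byte cut-off itself comes from commit 3ec5d83d2a23; this patch does
not change that threshold, it only restricts the check to processors with
FSRM.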