From 80c2ed8228817fb6438120997227811a746272ba Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 15 Sep 2021 14:17:08 +0800
Subject: [PATCH 2/3] x86: Update memcpy/memset inline strategies for
 -mtune=tremont

Simplify memcpy and memset inline strategies to avoid branches for
-mtune=tremont:

1. Create Tremont cost model from generic cost model.
2. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
   load and store for up to 16 * 16 (256) bytes when the data size is
   fixed and known.
3. Inline only if data size is known to be <= 256.
   a. Use "rep movsb/stosb" with simple code sequence if the data size
      is a constant.
   b. Use loop if data size is not a constant.
4. Use memcpy/memset library function if data size is unknown or > 256.

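For illustration only (not part of the GCC change itself), the hypothetical
functions copy_small and copy_any below sketch the intended effect of
strategies 3 and 4; the constant size 128 is an arbitrary example value:

#include <string.h>

void
copy_small (char *dst, const char *src)
{
  /* Constant size <= 256 bytes: expected to be expanded inline
     (integer/vector moves or "rep movsb").  */
  memcpy (dst, src, 128);
}

void
copy_any (char *dst, const char *src, size_t n)
{
  /* Size unknown at compile time: expected to call the memcpy
     library function.  */
  memcpy (dst, src, n);
}
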
	* config/i386/i386-options.c (processor_cost_table): Use
	tremont_cost for Tremont.
	* config/i386/x86-tune-costs.h (tremont_memcpy): New.
	(tremont_memset): Likewise.
	(tremont_cost): Likewise.
	* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
	Enable for Tremont.
---
 gcc/config/i386/i386-options.c   |   2 +-
 gcc/config/i386/x86-tune-costs.h | 124 +++++++++++++++++++++++++++++++
 gcc/config/i386/x86-tune.def     |   2 +-
 3 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index 19632b5fd6b..4b77d62926f 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -719,7 +719,7 @@ static const struct processor_costs *processor_cost_table[] =
   &slm_cost,
   &slm_cost,
   &slm_cost,
-  &slm_cost,
+  &tremont_cost,
   &slm_cost,
   &slm_cost,
   &skylake_cost,
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index ffe810f2bcb..93644be9cb3 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = {
   "16",					/* Func alignment.  */
 };
 
+static stringop_algs tremont_memcpy[2] = {
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
+static stringop_algs tremont_memset[2] = {
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
+static const
+struct processor_costs tremont_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,				     /* cost for loading QImode using movzbl */
+  {6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {6, 6, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 12},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {6, 6},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
+  {6, 6, 6, 10, 15},			/* cost of loading SSE registers
+					   in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},			/* cost of storing SSE registers
+					   in 32,64,128,256 and 512-bit */
+  6, 6,				/* SSE->integer and integer->SSE moves */
+  6, 6,				/* mask->integer and integer->mask moves */
+  {6, 6, 6},				/* cost of loading mask register
+					   in QImode, HImode, SImode.  */
+  {6, 6, 6},			/* cost of storing mask register
+					   in QImode, HImode, SImode.  */
+  2,					/* cost of moving mask register.  */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  /* Setting cost to 2 makes our current implementation of synth_mult result in
+     use of unnecessary temporary registers causing regression on several
+     SPECfp benchmarks.  */
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (4)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (16),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (22),			/*			    HI */
+   COSTS_N_INSNS (30),			/*			    SI */
+   COSTS_N_INSNS (74),			/*			    DI */
+   COSTS_N_INSNS (74)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  17,					/* MOVE_RATIO */
+  17,					/* CLEAR_RATIO */
+  {6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {6, 6, 6},				/* cost of storing integer registers */
+  {6, 6, 6, 10, 15},			/* cost of loading SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 10, 15},			/* cost of storing SSE register
+					   in 32bit, 64bit, 128bit, 256bit and 512bit */
+  {6, 6, 6, 10, 15},			/* cost of unaligned loads.  */
+  {6, 6, 6, 10, 15},			/* cost of unaligned stores.  */
+  2, 3, 4,				/* cost of moving XMM,YMM,ZMM register */
+  6,					/* cost of moving SSE register to integer.  */
+  18, 6,				/* Gather load static, per_elt.  */
+  18, 6,				/* Gather store static, per_elt.  */
+  32,					/* size of l1 cache.  */
+  512,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  /* Benchmarks show large regressions on K8 sixtrack benchmark when this
+     value is increased to perhaps more appropriate value of 5.  */
+  3,					/* Branch cost */
+  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (17),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (14),			/* cost of FSQRT instruction.  */
+
+  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
+  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_INSNS (4),			/* cost of MULSS instruction.  */
+  COSTS_N_INSNS (5),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
+  COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
+  COSTS_N_INSNS (13),			/* cost of DIVSS instruction.  */
+  COSTS_N_INSNS (17),			/* cost of DIVSD instruction.  */
+  COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
+  COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
+  1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
+  tremont_memcpy,
+  tremont_memset,
+  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
+  "16:11:8",				/* Loop alignment.  */
+  "16:11:8",				/* Jump alignment.  */
+  "0:0:8",				/* Label alignment.  */
+  "16",					/* Func alignment.  */
+};
+
 static stringop_algs intel_memcpy[2] = {
   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 6bd7087a03f..636e0c788bf 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
    move/set sequences of bytes with known size.  */
 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
 	  "prefer_known_rep_movsb_stosb",
-	  m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
+	  m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512)
 
 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
    compact prologues and epilogues by issuing a misaligned moves.  This
-- 
2.18.2