diff --git a/SOURCES/gcc8-pr96796.patch b/SOURCES/gcc8-pr96796.patch new file mode 100644 index 0000000..46d1d2c --- /dev/null +++ b/SOURCES/gcc8-pr96796.patch @@ -0,0 +1,254 @@ +commit 6001db79c477b03eacc7e7049560921fb54b7845 +Author: Richard Sandiford +Date: Mon Sep 7 20:15:36 2020 +0100 + + lra: Avoid cycling on certain subreg reloads [PR96796] + + This PR is about LRA cycling for a reload of the form: + + ---------------------------------------------------------------------------- + Changing pseudo 196 in operand 1 of insn 103 on equiv [r105:DI*0x8+r140:DI] + Creating newreg=287, assigning class ALL_REGS to slow/invalid mem r287 + Creating newreg=288, assigning class ALL_REGS to slow/invalid mem r288 + 103: r203:SI=r288:SI<<0x1+r196:DI#0 + REG_DEAD r196:DI + Inserting slow/invalid mem reload before: + 316: r287:DI=[r105:DI*0x8+r140:DI] + 317: r288:SI=r287:DI#0 + ---------------------------------------------------------------------------- + + The problem is with r287. We rightly give it a broad starting class of + POINTER_AND_FP_REGS (reduced from ALL_REGS by preferred_reload_class). + However, we never make forward progress towards narrowing it down to + a specific choice of class (POINTER_REGS or FP_REGS). + + I think in practice we rely on two things to narrow a reload pseudo's + class down to a specific choice: + + (1) a restricted class is specified when the pseudo is created + + This happens for input address reloads, where the class is taken + from the target's chosen base register class. It also happens + for simple REG reloads, where the class is taken from the chosen + alternative's constraints. + + (2) uses of the reload pseudo as a direct input operand + + In this case get_reload_reg tries to reuse the existing register + and narrow its class, instead of creating a new reload pseudo. + + However, neither occurs here. As described above, r287 rightly + starts out with a wide choice of class, ultimately derived from + ALL_REGS, so we don't get (1). And as the comments in the PR + explain, r287 is never used as an input reload, only the subreg is, + so we don't get (2): + + ---------------------------------------------------------------------------- + Choosing alt 13 in insn 317: (0) r (1) w {*movsi_aarch64} + Creating newreg=291, assigning class FP_REGS to r291 + 317: r288:SI=r291:SI + Inserting insn reload before: + 320: r291:SI=r287:DI#0 + ---------------------------------------------------------------------------- + + IMO, in this case we should rely on the reload of r316 to narrow + down the class of r278. Currently we do: + + ---------------------------------------------------------------------------- + Choosing alt 7 in insn 316: (0) r (1) m {*movdi_aarch64} + Creating newreg=289 from oldreg=287, assigning class GENERAL_REGS to r289 + 316: r289:DI=[r105:DI*0x8+r140:DI] + Inserting insn reload after: + 318: r287:DI=r289:DI + --------------------------------------------------- + + i.e. we create a new pseudo register r289 and give *that* pseudo + GENERAL_REGS instead. This is because get_reload_reg only narrows + down the existing class for OP_IN and OP_INOUT, not OP_OUT. + + But if we have a reload pseudo in a reload instruction and have chosen + a specific class for the reload pseudo, I think we should simply install + it for OP_OUT reloads too, if the class is a subset of the existing class. + We will need to pick such a register whatever happens (for r289 in the + example above). And as explained in the PR, doing this actually avoids + an unnecessary move via the FP registers too. + + The patch is quite aggressive in that it does this for all reload + pseudos in all reload instructions. I wondered about reusing the + condition for a reload move in in_class_p: + + INSN_UID (curr_insn) >= new_insn_uid_start + && curr_insn_set != NULL + && ((OBJECT_P (SET_SRC (curr_insn_set)) + && ! CONSTANT_P (SET_SRC (curr_insn_set))) + || (GET_CODE (SET_SRC (curr_insn_set)) == SUBREG + && OBJECT_P (SUBREG_REG (SET_SRC (curr_insn_set))) + && ! CONSTANT_P (SUBREG_REG (SET_SRC (curr_insn_set))))))) + + but I can't really justify that on first principles. I think we + should apply the rule consistently until we have a specific reason + for doing otherwise. + + gcc/ + PR rtl-optimization/96796 + * lra-constraints.c (in_class_p): Add a default-false + allow_all_reload_class_changes_p parameter. Do not treat + reload moves specially when the parameter is true. + (get_reload_reg): Try to narrow the class of an existing OP_OUT + reload if we're reloading a reload pseudo in a reload instruction. + + gcc/testsuite/ + PR rtl-optimization/96796 + * gcc.c-torture/compile/pr96796.c: New test. + +diff --git a/gcc/lra-constraints.c b/gcc/lra-constraints.c +index 580da9c3ed6..161b721efb1 100644 +--- a/gcc/lra-constraints.c ++++ b/gcc/lra-constraints.c +@@ -236,12 +236,17 @@ get_reg_class (int regno) + CL. Use elimination first if REG is a hard register. If REG is a + reload pseudo created by this constraints pass, assume that it will + be allocated a hard register from its allocno class, but allow that +- class to be narrowed to CL if it is currently a superset of CL. ++ class to be narrowed to CL if it is currently a superset of CL and ++ if either: ++ ++ - ALLOW_ALL_RELOAD_CLASS_CHANGES_P is true or ++ - the instruction we're processing is not a reload move. + + If NEW_CLASS is nonnull, set *NEW_CLASS to the new allocno class of + REGNO (reg), or NO_REGS if no change in its class was needed. */ + static bool +-in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class) ++in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class, ++ bool allow_all_reload_class_changes_p = false) + { + enum reg_class rclass, common_class; + machine_mode reg_mode; +@@ -266,7 +271,8 @@ in_class_p (rtx reg, enum reg_class cl, enum reg_class *new_class) + typically moves that have many alternatives, and restricting + reload pseudos for one alternative may lead to situations + where other reload pseudos are no longer allocatable. */ +- || (INSN_UID (curr_insn) >= new_insn_uid_start ++ || (!allow_all_reload_class_changes_p ++ && INSN_UID (curr_insn) >= new_insn_uid_start + && curr_insn_set != NULL + && ((OBJECT_P (SET_SRC (curr_insn_set)) + && ! CONSTANT_P (SET_SRC (curr_insn_set))) +@@ -551,13 +557,12 @@ init_curr_insn_input_reloads (void) + curr_insn_input_reloads_num = 0; + } + +-/* Create a new pseudo using MODE, RCLASS, ORIGINAL or reuse already +- created input reload pseudo (only if TYPE is not OP_OUT). Don't +- reuse pseudo if IN_SUBREG_P is true and the reused pseudo should be +- wrapped up in SUBREG. The result pseudo is returned through +- RESULT_REG. Return TRUE if we created a new pseudo, FALSE if we +- reused the already created input reload pseudo. Use TITLE to +- describe new registers for debug purposes. */ ++/* Create a new pseudo using MODE, RCLASS, ORIGINAL or reuse an existing ++ reload pseudo. Don't reuse an existing reload pseudo if IN_SUBREG_P ++ is true and the reused pseudo should be wrapped up in a SUBREG. ++ The result pseudo is returned through RESULT_REG. Return TRUE if we ++ created a new pseudo, FALSE if we reused an existing reload pseudo. ++ Use TITLE to describe new registers for debug purposes. */ + static bool + get_reload_reg (enum op_type type, machine_mode mode, rtx original, + enum reg_class rclass, bool in_subreg_p, +@@ -616,6 +621,35 @@ get_reload_reg (enum op_type type, machine_mode mode, rtx original, + + if (type == OP_OUT) + { ++ /* Output reload registers tend to start out with a conservative ++ choice of register class. Usually this is ALL_REGS, although ++ a target might narrow it (for performance reasons) through ++ targetm.preferred_reload_class. It's therefore quite common ++ for a reload instruction to require a more restrictive class ++ than the class that was originally assigned to the reload register. ++ ++ In these situations, it's more efficient to refine the choice ++ of register class rather than create a second reload register. ++ This also helps to avoid cycling for registers that are only ++ used by reload instructions. */ ++ if (REG_P (original) ++ && (int) REGNO (original) >= new_regno_start ++ && INSN_UID (curr_insn) >= new_insn_uid_start ++ && in_class_p (original, rclass, &new_class, true)) ++ { ++ unsigned int regno = REGNO (original); ++ if (lra_dump_file != NULL) ++ { ++ fprintf (lra_dump_file, " Reuse r%d for output ", regno); ++ dump_value_slim (lra_dump_file, original, 1); ++ } ++ if (new_class != lra_get_allocno_class (regno)) ++ lra_change_class (regno, new_class, ", change to", false); ++ if (lra_dump_file != NULL) ++ fprintf (lra_dump_file, "\n"); ++ *result_reg = original; ++ return false; ++ } + *result_reg + = lra_create_new_reg_with_unique_value (mode, original, rclass, title); + return true; +diff --git a/gcc/testsuite/gcc.c-torture/compile/pr96796.c b/gcc/testsuite/gcc.c-torture/compile/pr96796.c +new file mode 100644 +index 00000000000..8808e62fe77 +--- /dev/null ++++ b/gcc/testsuite/gcc.c-torture/compile/pr96796.c +@@ -0,0 +1,55 @@ ++/* { dg-additional-options "-fcommon" } */ ++ ++struct S0 { ++ signed f0 : 8; ++ unsigned f1; ++ unsigned f4; ++}; ++struct S1 { ++ long f3; ++ char f4; ++} g_3_4; ++ ++int g_5, func_1_l_32, func_50___trans_tmp_31; ++static struct S0 g_144, g_834, g_1255, g_1261; ++ ++int g_273[120] = {}; ++int *g_555; ++char **g_979; ++static int g_1092_0; ++static int g_1193; ++int safe_mul_func_int16_t_s_s(int si1, int si2) { return si1 * si2; } ++static struct S0 *func_50(); ++int func_1() { func_50(g_3_4, g_5, func_1_l_32, 8, 3); } ++void safe_div_func_int64_t_s_s(int *); ++void safe_mod_func_uint32_t_u_u(struct S0); ++struct S0 *func_50(int p_51, struct S0 p_52, struct S1 p_53, int p_54, ++ int p_55) { ++ int __trans_tmp_30; ++ char __trans_tmp_22; ++ short __trans_tmp_19; ++ long l_985_1; ++ long l_1191[8]; ++ safe_div_func_int64_t_s_s(g_273); ++ __builtin_printf((char*)g_1261.f4); ++ safe_mod_func_uint32_t_u_u(g_834); ++ g_144.f0 += 1; ++ for (;;) { ++ struct S1 l_1350 = {&l_1350}; ++ for (; p_53.f3; p_53.f3 -= 1) ++ for (; g_1193 <= 2; g_1193 += 1) { ++ __trans_tmp_19 = safe_mul_func_int16_t_s_s(l_1191[l_985_1 + p_53.f3], ++ p_55 % (**g_979 = 10)); ++ __trans_tmp_22 = g_1255.f1 * p_53.f4; ++ __trans_tmp_30 = __trans_tmp_19 + __trans_tmp_22; ++ if (__trans_tmp_30) ++ g_1261.f0 = p_51; ++ else { ++ g_1255.f0 = p_53.f3; ++ int *l_1422 = g_834.f0 = g_144.f4 != (*l_1422)++ > 0 < 0 ^ 51; ++ g_555 = ~0; ++ g_1092_0 |= func_50___trans_tmp_31; ++ } ++ } ++ } ++} diff --git a/SPECS/gcc.spec b/SPECS/gcc.spec index a6bfab2..efba515 100644 --- a/SPECS/gcc.spec +++ b/SPECS/gcc.spec @@ -4,7 +4,7 @@ %global gcc_major 8 # Note, gcc_release must be integer, if you want to add suffixes to # %%{release}, append them after %%{gcc_release} on Release: line. -%global gcc_release 5 +%global gcc_release 6 %global nvptx_tools_gitrev c28050f60193b3b95a18866a96f03334e874e78f %global nvptx_newlib_gitrev aadc8eb0ec43b7cd0dd2dfb484bae63c8b05ef24 %global _unpackaged_files_terminate_build 0 @@ -280,6 +280,7 @@ Patch19: gcc8-rh1960701.patch Patch20: gcc8-pr100797.patch Patch21: gcc8-rh1981822.patch Patch22: gcc8-Wbidi-chars.patch +Patch23: gcc8-pr96796.patch Patch30: gcc8-rh1668903-1.patch Patch31: gcc8-rh1668903-2.patch @@ -861,6 +862,7 @@ to NVidia PTX capable devices if available. %patch20 -p0 -b .pr100797~ %patch21 -p0 -b .rh1981822~ %patch22 -p1 -b .bidi~ +%patch23 -p1 -b .pr96796~ %patch30 -p0 -b .rh1668903-1~ %patch31 -p0 -b .rh1668903-2~ @@ -3177,6 +3179,9 @@ fi %endif %changelog +* Fri Dec 3 2021 Marek Polacek 8.5.0-6 +- avoid cycling on certain subreg reloads (PR rtl-optimization/96796, #2028798) + * Tue Nov 30 2021 Marek Polacek 8.5.0-5 - when linking against libgcc_s, link libgcc.a too (#2022588)