|
|
0e3697 |
commit e4108e7e619dcf7f21224382bc37ba2ef651eb43
|
|
|
0e3697 |
Author: acsawdey <acsawdey@138bc75d-0d04-0410-961f-82ee72b054a4>
|
|
|
0e3697 |
Date: Thu Aug 30 18:17:00 2018 +0000
|
|
|
0e3697 |
|
|
|
0e3697 |
2018-08-30 Aaron Sawdey <acsawdey@linux.ibm.com>
|
|
|
0e3697 |
|
|
|
0e3697 |
* config/rs6000/altivec.md (altivec_eq<mode>): Remove star.
|
|
|
0e3697 |
(altivec_vcmpequ<VI_char>_p): Remove star.
|
|
|
0e3697 |
* config/rs6000/rs6000-string.c (do_load_for_compare): Support
|
|
|
0e3697 |
vector load modes.
|
|
|
0e3697 |
(expand_strncmp_vec_sequence): New function.
|
|
|
0e3697 |
(emit_final_str_compare_vec): New function.
|
|
|
0e3697 |
(expand_strn_compare): Add support for vector strncmp.
|
|
|
0e3697 |
* config/rs6000/rs6000.opt (-mstring-compare-inline-limit): Change
|
|
|
0e3697 |
length specification to bytes.
|
|
|
0e3697 |
* config/rs6000/vsx.md (vsx_ld_elemrev_v16qi_internal): Remove star.
|
|
|
0e3697 |
(vcmpnezb_p): New pattern.
|
|
|
0e3697 |
* doc/invoke.texi (RS/6000 and PowerPC Options): Update documentation
|
|
|
0e3697 |
for option -mstring-compare-inline-limit.
|
|
|
0e3697 |
|
|
|
0e3697 |
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@263991 138bc75d-0d04-0410-961f-82ee72b054a4
|
|
|
0e3697 |
|
|
|
0e3697 |
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
|
|
|
0e3697 |
index 13f4654db6a..db4f926bd15 100644
|
|
|
0e3697 |
--- a/gcc/config/rs6000/altivec.md
|
|
|
0e3697 |
+++ b/gcc/config/rs6000/altivec.md
|
|
|
0e3697 |
@@ -608,7 +608,7 @@
|
|
|
0e3697 |
"vcmpbfp %0,%1,%2"
|
|
|
0e3697 |
[(set_attr "type" "veccmp")])
|
|
|
0e3697 |
|
|
|
0e3697 |
-(define_insn "*altivec_eq<mode>"
|
|
|
0e3697 |
+(define_insn "altivec_eq<mode>"
|
|
|
0e3697 |
[(set (match_operand:VI2 0 "altivec_register_operand" "=v")
|
|
|
0e3697 |
(eq:VI2 (match_operand:VI2 1 "altivec_register_operand" "v")
|
|
|
0e3697 |
(match_operand:VI2 2 "altivec_register_operand" "v")))]
|
|
|
0e3697 |
@@ -2438,7 +2438,7 @@
|
|
|
0e3697 |
|
|
|
0e3697 |
;; Compare vectors producing a vector result and a predicate, setting CR6 to
|
|
|
0e3697 |
;; indicate a combined status
|
|
|
0e3697 |
-(define_insn "*altivec_vcmpequ<VI_char>_p"
|
|
|
0e3697 |
+(define_insn "altivec_vcmpequ<VI_char>_p"
|
|
|
0e3697 |
[(set (reg:CC CR6_REGNO)
|
|
|
0e3697 |
(unspec:CC [(eq:CC (match_operand:VI2 1 "register_operand" "v")
|
|
|
0e3697 |
(match_operand:VI2 2 "register_operand" "v"))]
|
|
|
0e3697 |
diff --git a/gcc/config/rs6000/rs6000-string.c b/gcc/config/rs6000/rs6000-string.c
|
|
|
0e3697 |
index 451e9ed33da..ff0414586d0 100644
|
|
|
0e3697 |
--- a/gcc/config/rs6000/rs6000-string.c
|
|
|
0e3697 |
+++ b/gcc/config/rs6000/rs6000-string.c
|
|
|
0e3697 |
@@ -157,6 +157,33 @@ do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
|
|
|
0e3697 |
{
|
|
|
0e3697 |
switch (GET_MODE (reg))
|
|
|
0e3697 |
{
|
|
|
0e3697 |
+ case E_V16QImode:
|
|
|
0e3697 |
+ switch (mode)
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ case E_V16QImode:
|
|
|
0e3697 |
+ if (!BYTES_BIG_ENDIAN)
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ if (TARGET_P9_VECTOR)
|
|
|
0e3697 |
+ emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ rtx reg_v2di = simplify_gen_subreg (V2DImode, reg,
|
|
|
0e3697 |
+ V16QImode, 0);
|
|
|
0e3697 |
+ gcc_assert (MEM_P (mem));
|
|
|
0e3697 |
+ rtx addr = XEXP (mem, 0);
|
|
|
0e3697 |
+ rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
|
|
|
0e3697 |
+ MEM_COPY_ATTRIBUTES (mem_v2di, mem);
|
|
|
0e3697 |
+ set_mem_size (mem, GET_MODE_SIZE (V2DImode));
|
|
|
0e3697 |
+ emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ emit_insn (gen_vsx_movv2di_64bit (reg, mem));
|
|
|
0e3697 |
+ break;
|
|
|
0e3697 |
+ default:
|
|
|
0e3697 |
+ gcc_unreachable ();
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+ break;
|
|
|
0e3697 |
case E_DImode:
|
|
|
0e3697 |
switch (mode)
|
|
|
0e3697 |
{
|
|
|
0e3697 |
@@ -227,6 +254,12 @@ do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
|
|
|
0e3697 |
gcc_unreachable ();
|
|
|
0e3697 |
}
|
|
|
0e3697 |
break;
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ case E_QImode:
|
|
|
0e3697 |
+ gcc_assert (mode == E_QImode);
|
|
|
0e3697 |
+ emit_move_insn (reg, mem);
|
|
|
0e3697 |
+ break;
|
|
|
0e3697 |
+
|
|
|
0e3697 |
default:
|
|
|
0e3697 |
gcc_unreachable ();
|
|
|
0e3697 |
break;
|
|
|
0e3697 |
@@ -1705,17 +1738,17 @@ expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes
|
|
|
0e3697 |
RESULT_REG is the rtx for the result register.
|
|
|
0e3697 |
EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
|
|
|
0e3697 |
to strcmp/strncmp if we have equality at the end of the inline comparison.
|
|
|
0e3697 |
- CLEANUP_LABEL is rtx for a label we generate if we need code to clean up
|
|
|
0e3697 |
- and generate the final comparison result.
|
|
|
0e3697 |
+ P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
|
|
|
0e3697 |
+ to clean up and generate the final comparison result.
|
|
|
0e3697 |
FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
|
|
|
0e3697 |
set the final result. */
|
|
|
0e3697 |
static void
|
|
|
0e3697 |
-expand_strncmp_gpr_sequence(unsigned HOST_WIDE_INT bytes_to_compare,
|
|
|
0e3697 |
- unsigned int base_align,
|
|
|
0e3697 |
- rtx orig_src1, rtx orig_src2,
|
|
|
0e3697 |
- rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
|
|
|
0e3697 |
- bool equality_compare_rest, rtx &cleanup_label,
|
|
|
0e3697 |
- rtx final_move_label)
|
|
|
0e3697 |
+expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
|
|
|
0e3697 |
+ unsigned int base_align,
|
|
|
0e3697 |
+ rtx orig_src1, rtx orig_src2,
|
|
|
0e3697 |
+ rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
|
|
|
0e3697 |
+ bool equality_compare_rest, rtx *p_cleanup_label,
|
|
|
0e3697 |
+ rtx final_move_label)
|
|
|
0e3697 |
{
|
|
|
0e3697 |
unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
|
|
|
0e3697 |
machine_mode load_mode;
|
|
|
0e3697 |
@@ -1724,6 +1757,8 @@ expand_strncmp_gpr_sequence(unsigned HOST_WIDE_INT bytes_to_compare,
|
|
|
0e3697 |
unsigned HOST_WIDE_INT offset = 0;
|
|
|
0e3697 |
rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
|
|
|
0e3697 |
rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
|
|
|
0e3697 |
+ gcc_assert (p_cleanup_label != NULL);
|
|
|
0e3697 |
+ rtx cleanup_label = *p_cleanup_label;
|
|
|
0e3697 |
|
|
|
0e3697 |
while (bytes_to_compare > 0)
|
|
|
0e3697 |
{
|
|
|
0e3697 |
@@ -1876,6 +1911,178 @@ expand_strncmp_gpr_sequence(unsigned HOST_WIDE_INT bytes_to_compare,
|
|
|
0e3697 |
bytes_to_compare -= cmp_bytes;
|
|
|
0e3697 |
}
|
|
|
0e3697 |
|
|
|
0e3697 |
+ *p_cleanup_label = cleanup_label;
|
|
|
0e3697 |
+ return;
|
|
|
0e3697 |
+}
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
|
|
|
0e3697 |
+ instructions.
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ BYTES_TO_COMPARE is the number of bytes to be compared.
|
|
|
0e3697 |
+ ORIG_SRC1 is the unmodified rtx for the first string.
|
|
|
0e3697 |
+ ORIG_SRC2 is the unmodified rtx for the second string.
|
|
|
0e3697 |
+ S1ADDR is the register to use for the base address of the first string.
|
|
|
0e3697 |
+ S2ADDR is the register to use for the base address of the second string.
|
|
|
0e3697 |
+ OFF_REG is the register to use for the string offset for loads.
|
|
|
0e3697 |
+ S1DATA is the register for loading the first string.
|
|
|
0e3697 |
+ S2DATA is the register for loading the second string.
|
|
|
0e3697 |
+ VEC_RESULT is the rtx for the vector result indicating the byte difference.
|
|
|
0e3697 |
+ EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
|
|
|
0e3697 |
+ to strcmp/strncmp if we have equality at the end of the inline comparison.
|
|
|
0e3697 |
+ P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code to clean up
|
|
|
0e3697 |
+ and generate the final comparison result.
|
|
|
0e3697 |
+ FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
|
|
|
0e3697 |
+ set the final result. */
|
|
|
0e3697 |
+static void
|
|
|
0e3697 |
+expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
|
|
|
0e3697 |
+ rtx orig_src1, rtx orig_src2,
|
|
|
0e3697 |
+ rtx s1addr, rtx s2addr, rtx off_reg,
|
|
|
0e3697 |
+ rtx s1data, rtx s2data,
|
|
|
0e3697 |
+ rtx vec_result, bool equality_compare_rest,
|
|
|
0e3697 |
+ rtx *p_cleanup_label, rtx final_move_label)
|
|
|
0e3697 |
+{
|
|
|
0e3697 |
+ machine_mode load_mode;
|
|
|
0e3697 |
+ unsigned int load_mode_size;
|
|
|
0e3697 |
+ unsigned HOST_WIDE_INT cmp_bytes = 0;
|
|
|
0e3697 |
+ unsigned HOST_WIDE_INT offset = 0;
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ gcc_assert (p_cleanup_label != NULL);
|
|
|
0e3697 |
+ rtx cleanup_label = *p_cleanup_label;
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
|
|
|
0e3697 |
+ emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ unsigned int i;
|
|
|
0e3697 |
+ rtx zr[16];
|
|
|
0e3697 |
+ for (i = 0; i < 16; i++)
|
|
|
0e3697 |
+ zr[i] = GEN_INT (0);
|
|
|
0e3697 |
+ rtvec zv = gen_rtvec_v (16, zr);
|
|
|
0e3697 |
+ rtx zero_reg = gen_reg_rtx (V16QImode);
|
|
|
0e3697 |
+ rs6000_expand_vector_init (zero_reg, gen_rtx_PARALLEL (V16QImode, zv));
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ while (bytes_to_compare > 0)
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ /* VEC/VSX compare sequence for P8:
|
|
|
0e3697 |
+ check each 16B with:
|
|
|
0e3697 |
+ lxvd2x 32,28,8
|
|
|
0e3697 |
+ lxvd2x 33,29,8
|
|
|
0e3697 |
+ vcmpequb 2,0,1 # compare strings
|
|
|
0e3697 |
+ vcmpequb 4,0,3 # compare w/ 0
|
|
|
0e3697 |
+ xxlorc 37,36,34 # first FF byte is either mismatch or end of string
|
|
|
0e3697 |
+ vcmpequb. 7,5,3 # reg 7 contains 0
|
|
|
0e3697 |
+ bnl 6,.Lmismatch
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ For the P8 LE case, we use lxvd2x and compare full 16 bytes
|
|
|
0e3697 |
+ but then use vgbbd and a shift to get two bytes with the
|
|
|
0e3697 |
+ information we need in the correct order.
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ VEC/VSX compare sequence if TARGET_P9_VECTOR:
|
|
|
0e3697 |
+ lxvb16x/lxvb16x # load 16B of each string
|
|
|
0e3697 |
+ vcmpnezb. # produces difference location or zero byte location
|
|
|
0e3697 |
+ bne 6,.Lmismatch
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ Use the overlapping compare trick for the last block if it is
|
|
|
0e3697 |
+ less than 16 bytes.
|
|
|
0e3697 |
+ */
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ load_mode = V16QImode;
|
|
|
0e3697 |
+ load_mode_size = GET_MODE_SIZE (load_mode);
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ if (bytes_to_compare >= load_mode_size)
|
|
|
0e3697 |
+ cmp_bytes = load_mode_size;
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ /* Move this load back so it doesn't go past the end. P8/P9
|
|
|
0e3697 |
+ can do this efficiently. This is never called with less
|
|
|
0e3697 |
+ than 16 bytes so we should always be able to do this. */
|
|
|
0e3697 |
+ unsigned int extra_bytes = load_mode_size - bytes_to_compare;
|
|
|
0e3697 |
+ cmp_bytes = bytes_to_compare;
|
|
|
0e3697 |
+ gcc_assert (offset > extra_bytes);
|
|
|
0e3697 |
+ offset -= extra_bytes;
|
|
|
0e3697 |
+ cmp_bytes = load_mode_size;
|
|
|
0e3697 |
+ bytes_to_compare = cmp_bytes;
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ /* The offset currently used is always kept in off_reg so that the
|
|
|
0e3697 |
+ cleanup code on P8 can use it to extract the differing byte. */
|
|
|
0e3697 |
+ emit_move_insn (off_reg, GEN_INT (offset));
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
|
|
|
0e3697 |
+ do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1);
|
|
|
0e3697 |
+ rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
|
|
|
0e3697 |
+ do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2);
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ /* Cases to handle. A and B are chunks of the two strings.
|
|
|
0e3697 |
+ 1: Not end of comparison:
|
|
|
0e3697 |
+ A != B: branch to cleanup code to compute result.
|
|
|
0e3697 |
+ A == B: next block
|
|
|
0e3697 |
+ 2: End of the inline comparison:
|
|
|
0e3697 |
+ A != B: branch to cleanup code to compute result.
|
|
|
0e3697 |
+ A == B: call strcmp/strncmp
|
|
|
0e3697 |
+ 3: compared requested N bytes:
|
|
|
0e3697 |
+ A == B: branch to result 0.
|
|
|
0e3697 |
+ A != B: cleanup code to compute result. */
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ if (TARGET_P9_VECTOR)
|
|
|
0e3697 |
+ emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data));
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ /* Emit instructions to do comparison and zero check. */
|
|
|
0e3697 |
+ rtx cmp_res = gen_reg_rtx (load_mode);
|
|
|
0e3697 |
+ rtx cmp_zero = gen_reg_rtx (load_mode);
|
|
|
0e3697 |
+ rtx cmp_combined = gen_reg_rtx (load_mode);
|
|
|
0e3697 |
+ emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data));
|
|
|
0e3697 |
+ emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg));
|
|
|
0e3697 |
+ emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res));
|
|
|
0e3697 |
+ emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg));
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ bool branch_to_cleanup = (remain > 0 || equality_compare_rest);
|
|
|
0e3697 |
+ rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO);
|
|
|
0e3697 |
+ rtx dst_label;
|
|
|
0e3697 |
+ rtx cmp_rtx;
|
|
|
0e3697 |
+ if (branch_to_cleanup)
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ /* Branch to cleanup code, otherwise fall through to do more
|
|
|
0e3697 |
+ compares. P8 and P9 use different CR bits because on P8
|
|
|
0e3697 |
+ we are looking at the result of a comparison vs a
|
|
|
0e3697 |
+ register of zeroes so the all-true condition means no
|
|
|
0e3697 |
+ difference or zero was found. On P9, vcmpnezb sets a byte
|
|
|
0e3697 |
+ to 0xff if there is a mismatch or zero, so the all-false
|
|
|
0e3697 |
+ condition indicates we found no difference or zero. */
|
|
|
0e3697 |
+ if (!cleanup_label)
|
|
|
0e3697 |
+ cleanup_label = gen_label_rtx ();
|
|
|
0e3697 |
+ dst_label = cleanup_label;
|
|
|
0e3697 |
+ if (TARGET_P9_VECTOR)
|
|
|
0e3697 |
+ cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx);
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx);
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ /* Branch to final return or fall through to cleanup,
|
|
|
0e3697 |
+ result is already set to 0. */
|
|
|
0e3697 |
+ dst_label = final_move_label;
|
|
|
0e3697 |
+ if (TARGET_P9_VECTOR)
|
|
|
0e3697 |
+ cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx);
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx);
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
|
|
|
0e3697 |
+ rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
|
|
|
0e3697 |
+ lab_ref, pc_rtx);
|
|
|
0e3697 |
+ rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
|
|
|
0e3697 |
+ JUMP_LABEL (j2) = dst_label;
|
|
|
0e3697 |
+ LABEL_NUSES (dst_label) += 1;
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ offset += cmp_bytes;
|
|
|
0e3697 |
+ bytes_to_compare -= cmp_bytes;
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+ *p_cleanup_label = cleanup_label;
|
|
|
0e3697 |
+ return;
|
|
|
0e3697 |
}
|
|
|
0e3697 |
|
|
|
0e3697 |
/* Generate the final sequence that identifies the differing
|
|
|
0e3697 |
@@ -1948,6 +2155,96 @@ emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
|
|
|
0e3697 |
return;
|
|
|
0e3697 |
}
|
|
|
0e3697 |
|
|
|
0e3697 |
+/* Generate the final sequence that identifies the differing
|
|
|
0e3697 |
+ byte and generates the final result, taking into account
|
|
|
0e3697 |
+ zero bytes:
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ P8:
|
|
|
0e3697 |
+ vgbbd 0,0
|
|
|
0e3697 |
+ vsldoi 0,0,0,9
|
|
|
0e3697 |
+ mfvsrd 9,32
|
|
|
0e3697 |
+ addi 10,9,-1 # count trailing zero bits
|
|
|
0e3697 |
+ andc 9,10,9
|
|
|
0e3697 |
+ popcntd 9,9
|
|
|
0e3697 |
+ lbzx 10,28,9 # use that offset to load differing byte
|
|
|
0e3697 |
+ lbzx 3,29,9
|
|
|
0e3697 |
+ subf 3,3,10 # subtract for final result
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ P9:
|
|
|
0e3697 |
+ vclzlsbb # counts trailing bytes with lsb=0
|
|
|
0e3697 |
+ vextublx # extract differing byte
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ STR1 is the reg rtx for data from string 1.
|
|
|
0e3697 |
+ STR2 is the reg rtx for data from string 2.
|
|
|
0e3697 |
+ RESULT is the reg rtx for the comparison result.
|
|
|
0e3697 |
+ S1ADDR is the register to use for the base address of the first string.
|
|
|
0e3697 |
+ S2ADDR is the register to use for the base address of the second string.
|
|
|
0e3697 |
+ ORIG_SRC1 is the unmodified rtx for the first string.
|
|
|
0e3697 |
+ ORIG_SRC2 is the unmodified rtx for the second string.
|
|
|
0e3697 |
+ OFF_REG is the register to use for the string offset for loads.
|
|
|
0e3697 |
+ VEC_RESULT is the rtx for the vector result indicating the byte difference.
|
|
|
0e3697 |
+ */
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+static void
|
|
|
0e3697 |
+emit_final_str_compare_vec (rtx str1, rtx str2, rtx result,
|
|
|
0e3697 |
+ rtx s1addr, rtx s2addr,
|
|
|
0e3697 |
+ rtx orig_src1, rtx orig_src2,
|
|
|
0e3697 |
+ rtx off_reg, rtx vec_result)
|
|
|
0e3697 |
+{
|
|
|
0e3697 |
+ if (TARGET_P9_VECTOR)
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ rtx diffix = gen_reg_rtx (SImode);
|
|
|
0e3697 |
+ rtx chr1 = gen_reg_rtx (SImode);
|
|
|
0e3697 |
+ rtx chr2 = gen_reg_rtx (SImode);
|
|
|
0e3697 |
+ rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0);
|
|
|
0e3697 |
+ rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0);
|
|
|
0e3697 |
+ emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result));
|
|
|
0e3697 |
+ emit_insn (gen_vextublx (chr1, diffix, str1));
|
|
|
0e3697 |
+ emit_insn (gen_vextublx (chr2, diffix, str2));
|
|
|
0e3697 |
+ do_sub3 (result, chr1_di, chr2_di);
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ rtx diffix = gen_reg_rtx (DImode);
|
|
|
0e3697 |
+ rtx result_gbbd = gen_reg_rtx (V16QImode);
|
|
|
0e3697 |
+ /* Since each byte of the input is either 00 or FF, the bytes in
|
|
|
0e3697 |
+ dw0 and dw1 after vgbbd are all identical to each other. */
|
|
|
0e3697 |
+ emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result));
|
|
|
0e3697 |
+ /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
|
|
|
0e3697 |
+ For BE, we shift by 7 and get AB in the high two bytes then CLZ. */
|
|
|
0e3697 |
+ rtx result_shifted = gen_reg_rtx (V16QImode);
|
|
|
0e3697 |
+ int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9;
|
|
|
0e3697 |
+ emit_insn (gen_altivec_vsldoi_v16qi (result_shifted,result_gbbd,result_gbbd, GEN_INT (shift_amt)));
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0);
|
|
|
0e3697 |
+ emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted));
|
|
|
0e3697 |
+ rtx count = gen_reg_rtx (DImode);
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ if (BYTES_BIG_ENDIAN)
|
|
|
0e3697 |
+ emit_insn (gen_clzdi2 (count, diffix));
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ emit_insn (gen_ctzdi2 (count, diffix));
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ /* P8 doesn't have a good solution for extracting one byte from
|
|
|
0e3697 |
+ a vsx reg like vextublx on P9 so we just compute the offset
|
|
|
0e3697 |
+ of the differing byte and load it from each string. */
|
|
|
0e3697 |
+ do_add3 (off_reg, off_reg, count);
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ rtx chr1 = gen_reg_rtx (QImode);
|
|
|
0e3697 |
+ rtx chr2 = gen_reg_rtx (QImode);
|
|
|
0e3697 |
+ rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
|
|
|
0e3697 |
+ do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1);
|
|
|
0e3697 |
+ rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
|
|
|
0e3697 |
+ do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2);
|
|
|
0e3697 |
+ machine_mode rmode = GET_MODE (result);
|
|
|
0e3697 |
+ rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0);
|
|
|
0e3697 |
+ rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0);
|
|
|
0e3697 |
+ do_sub3 (result, chr1_rm, chr2_rm);
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ return;
|
|
|
0e3697 |
+}
|
|
|
0e3697 |
+
|
|
|
0e3697 |
/* Expand a string compare operation with length, and return
|
|
|
0e3697 |
true if successful. Return false if we should let the
|
|
|
0e3697 |
compiler generate normal code, probably a strncmp call.
|
|
|
0e3697 |
@@ -2002,21 +2299,43 @@ expand_strn_compare (rtx operands[], int no_length)
|
|
|
0e3697 |
|
|
|
0e3697 |
gcc_assert (GET_MODE (target) == SImode);
|
|
|
0e3697 |
|
|
|
0e3697 |
- unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
|
|
|
0e3697 |
+ unsigned int required_align = 8;
|
|
|
0e3697 |
|
|
|
0e3697 |
unsigned HOST_WIDE_INT offset = 0;
|
|
|
0e3697 |
unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available. */
|
|
|
0e3697 |
unsigned HOST_WIDE_INT compare_length; /* How much to compare inline. */
|
|
|
0e3697 |
+
|
|
|
0e3697 |
if (no_length)
|
|
|
0e3697 |
- /* Use this as a standin to determine the mode to use. */
|
|
|
0e3697 |
- bytes = rs6000_string_compare_inline_limit * word_mode_size;
|
|
|
0e3697 |
+ bytes = rs6000_string_compare_inline_limit;
|
|
|
0e3697 |
else
|
|
|
0e3697 |
bytes = UINTVAL (bytes_rtx);
|
|
|
0e3697 |
|
|
|
0e3697 |
- machine_mode load_mode =
|
|
|
0e3697 |
- select_block_compare_mode (0, bytes, base_align);
|
|
|
0e3697 |
- unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
|
|
|
0e3697 |
- compare_length = rs6000_string_compare_inline_limit * load_mode_size;
|
|
|
0e3697 |
+ /* Is it OK to use vec/vsx for this? TARGET_VSX means we have at
|
|
|
0e3697 |
+ least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
|
|
|
0e3697 |
+ at least POWER8. That way we can rely on overlapping compares to
|
|
|
0e3697 |
+ do the final comparison of less than 16 bytes. Also I do not want
|
|
|
0e3697 |
+ to deal with making this work for 32 bits. */
|
|
|
0e3697 |
+ int use_vec = (bytes >= 16 && !TARGET_32BIT && TARGET_EFFICIENT_UNALIGNED_VSX);
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ if (use_vec)
|
|
|
0e3697 |
+ required_align = 16;
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ machine_mode load_mode;
|
|
|
0e3697 |
+ rtx tmp_reg_src1, tmp_reg_src2;
|
|
|
0e3697 |
+ if (use_vec)
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ load_mode = V16QImode;
|
|
|
0e3697 |
+ tmp_reg_src1 = gen_reg_rtx (V16QImode);
|
|
|
0e3697 |
+ tmp_reg_src2 = gen_reg_rtx (V16QImode);
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ load_mode = select_block_compare_mode (0, bytes, base_align);
|
|
|
0e3697 |
+ tmp_reg_src1 = gen_reg_rtx (word_mode);
|
|
|
0e3697 |
+ tmp_reg_src2 = gen_reg_rtx (word_mode);
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+
|
|
|
0e3697 |
+ compare_length = rs6000_string_compare_inline_limit;
|
|
|
0e3697 |
|
|
|
0e3697 |
/* If we have equality at the end of the last compare and we have not
|
|
|
0e3697 |
found the end of the string, we need to call strcmp/strncmp to
|
|
|
0e3697 |
@@ -2040,10 +2359,7 @@ expand_strn_compare (rtx operands[], int no_length)
|
|
|
0e3697 |
rtx final_move_label = gen_label_rtx ();
|
|
|
0e3697 |
rtx final_label = gen_label_rtx ();
|
|
|
0e3697 |
rtx begin_compare_label = NULL;
|
|
|
0e3697 |
- unsigned int required_align = 8;
|
|
|
0e3697 |
-
|
|
|
0e3697 |
- required_align = 8;
|
|
|
0e3697 |
-
|
|
|
0e3697 |
+
|
|
|
0e3697 |
if (base_align < required_align)
|
|
|
0e3697 |
{
|
|
|
0e3697 |
/* Generate code that checks distance to 4k boundary for this case. */
|
|
|
0e3697 |
@@ -2060,7 +2376,7 @@ expand_strn_compare (rtx operands[], int no_length)
|
|
|
0e3697 |
the subsequent code generation are in agreement so we do not
|
|
|
0e3697 |
go past the length we tested for a 4k boundary crossing. */
|
|
|
0e3697 |
unsigned HOST_WIDE_INT align_test = compare_length;
|
|
|
0e3697 |
- if (align_test < 8)
|
|
|
0e3697 |
+ if (align_test < required_align)
|
|
|
0e3697 |
{
|
|
|
0e3697 |
align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
|
|
|
0e3697 |
base_align = align_test;
|
|
|
0e3697 |
@@ -2102,7 +2418,7 @@ expand_strn_compare (rtx operands[], int no_length)
|
|
|
0e3697 |
else
|
|
|
0e3697 |
{
|
|
|
0e3697 |
/* -m32 -mpowerpc64 results in word_mode being DImode even
|
|
|
0e3697 |
- though otherwise it is 32-bit. The length arg to strncmp
|
|
|
0e3697 |
+ though otherwise it is 32-bit. The length arg to strncmp
|
|
|
0e3697 |
is a size_t which will be the same size as pointers. */
|
|
|
0e3697 |
rtx len_rtx = gen_reg_rtx (Pmode);
|
|
|
0e3697 |
emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));
|
|
|
0e3697 |
@@ -2124,17 +2440,32 @@ expand_strn_compare (rtx operands[], int no_length)
|
|
|
0e3697 |
}
|
|
|
0e3697 |
|
|
|
0e3697 |
rtx cleanup_label = NULL;
|
|
|
0e3697 |
- rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
|
|
|
0e3697 |
- rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
|
|
|
0e3697 |
+ rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;
|
|
|
0e3697 |
|
|
|
0e3697 |
/* Generate a sequence of GPR or VEC/VSX instructions to compare out
|
|
|
0e3697 |
to the length specified. */
|
|
|
0e3697 |
- expand_strncmp_gpr_sequence(compare_length, base_align,
|
|
|
0e3697 |
- orig_src1, orig_src2,
|
|
|
0e3697 |
- tmp_reg_src1, tmp_reg_src2,
|
|
|
0e3697 |
- result_reg,
|
|
|
0e3697 |
- equality_compare_rest,
|
|
|
0e3697 |
- cleanup_label, final_move_label);
|
|
|
0e3697 |
+ if (use_vec)
|
|
|
0e3697 |
+ {
|
|
|
0e3697 |
+ s1addr = gen_reg_rtx (Pmode);
|
|
|
0e3697 |
+ s2addr = gen_reg_rtx (Pmode);
|
|
|
0e3697 |
+ off_reg = gen_reg_rtx (Pmode);
|
|
|
0e3697 |
+ vec_result = gen_reg_rtx (load_mode);
|
|
|
0e3697 |
+ emit_move_insn (result_reg, GEN_INT (0));
|
|
|
0e3697 |
+ expand_strncmp_vec_sequence (compare_length,
|
|
|
0e3697 |
+ orig_src1, orig_src2,
|
|
|
0e3697 |
+ s1addr, s2addr, off_reg,
|
|
|
0e3697 |
+ tmp_reg_src1, tmp_reg_src2,
|
|
|
0e3697 |
+ vec_result,
|
|
|
0e3697 |
+ equality_compare_rest,
|
|
|
0e3697 |
+ &cleanup_label, final_move_label);
|
|
|
0e3697 |
+ }
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ expand_strncmp_gpr_sequence (compare_length, base_align,
|
|
|
0e3697 |
+ orig_src1, orig_src2,
|
|
|
0e3697 |
+ tmp_reg_src1, tmp_reg_src2,
|
|
|
0e3697 |
+ result_reg,
|
|
|
0e3697 |
+ equality_compare_rest,
|
|
|
0e3697 |
+ &cleanup_label, final_move_label);
|
|
|
0e3697 |
|
|
|
0e3697 |
offset = compare_length;
|
|
|
0e3697 |
|
|
|
0e3697 |
@@ -2174,7 +2505,12 @@ expand_strn_compare (rtx operands[], int no_length)
|
|
|
0e3697 |
if (cleanup_label)
|
|
|
0e3697 |
emit_label (cleanup_label);
|
|
|
0e3697 |
|
|
|
0e3697 |
- emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);
|
|
|
0e3697 |
+ if (use_vec)
|
|
|
0e3697 |
+ emit_final_str_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
|
|
|
0e3697 |
+ s1addr, s2addr, orig_src1, orig_src2,
|
|
|
0e3697 |
+ off_reg, vec_result);
|
|
|
0e3697 |
+ else
|
|
|
0e3697 |
+ emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);
|
|
|
0e3697 |
|
|
|
0e3697 |
emit_label (final_move_label);
|
|
|
0e3697 |
emit_insn (gen_movsi (target,
|
|
|
0e3697 |
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
|
|
|
0e3697 |
index ace8a477550..ad1b8a29ac6 100644
|
|
|
0e3697 |
--- a/gcc/config/rs6000/rs6000.opt
|
|
|
0e3697 |
+++ b/gcc/config/rs6000/rs6000.opt
|
|
|
0e3697 |
@@ -342,8 +342,8 @@ Target Report Var(rs6000_block_compare_inline_loop_limit) Init(-1) RejectNegativ
|
|
|
0e3697 |
Max number of bytes to compare with loops.
|
|
|
0e3697 |
|
|
|
0e3697 |
mstring-compare-inline-limit=
|
|
|
0e3697 |
-Target Report Var(rs6000_string_compare_inline_limit) Init(8) RejectNegative Joined UInteger Save
|
|
|
0e3697 |
-Max number of pairs of load insns for compare.
|
|
|
0e3697 |
+Target Report Var(rs6000_string_compare_inline_limit) Init(64) RejectNegative Joined UInteger Save
|
|
|
0e3697 |
+Max number of bytes to compare.
|
|
|
0e3697 |
|
|
|
0e3697 |
misel
|
|
|
0e3697 |
Target Report Mask(ISEL) Var(rs6000_isa_flags)
|
|
|
0e3697 |
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
|
|
|
0e3697 |
index e6921e96a3d..01fb4213001 100644
|
|
|
0e3697 |
--- a/gcc/config/rs6000/vsx.md
|
|
|
0e3697 |
+++ b/gcc/config/rs6000/vsx.md
|
|
|
0e3697 |
@@ -1429,7 +1429,7 @@
|
|
|
0e3697 |
}
|
|
|
0e3697 |
})
|
|
|
0e3697 |
|
|
|
0e3697 |
-(define_insn "*vsx_ld_elemrev_v16qi_internal"
|
|
|
0e3697 |
+(define_insn "vsx_ld_elemrev_v16qi_internal"
|
|
|
0e3697 |
[(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
|
|
|
0e3697 |
(vec_select:V16QI
|
|
|
0e3697 |
(match_operand:V16QI 1 "memory_operand" "Z")
|
|
|
0e3697 |
@@ -5107,6 +5107,22 @@
|
|
|
0e3697 |
"vcmpnezb %0,%1,%2"
|
|
|
0e3697 |
[(set_attr "type" "vecsimple")])
|
|
|
0e3697 |
|
|
|
0e3697 |
+;; Vector Compare Not Equal or Zero Byte predicate or record-form
|
|
|
0e3697 |
+(define_insn "vcmpnezb_p"
|
|
|
0e3697 |
+ [(set (reg:CC CR6_REGNO)
|
|
|
0e3697 |
+ (unspec:CC
|
|
|
0e3697 |
+ [(match_operand:V16QI 1 "altivec_register_operand" "v")
|
|
|
0e3697 |
+ (match_operand:V16QI 2 "altivec_register_operand" "v")]
|
|
|
0e3697 |
+ UNSPEC_VCMPNEZB))
|
|
|
0e3697 |
+ (set (match_operand:V16QI 0 "altivec_register_operand" "=v")
|
|
|
0e3697 |
+ (unspec:V16QI
|
|
|
0e3697 |
+ [(match_dup 1)
|
|
|
0e3697 |
+ (match_dup 2)]
|
|
|
0e3697 |
+ UNSPEC_VCMPNEZB))]
|
|
|
0e3697 |
+ "TARGET_P9_VECTOR"
|
|
|
0e3697 |
+ "vcmpnezb. %0,%1,%2"
|
|
|
0e3697 |
+ [(set_attr "type" "vecsimple")])
|
|
|
0e3697 |
+
|
|
|
0e3697 |
;; Vector Compare Not Equal Half Word (specified/not+eq:)
|
|
|
0e3697 |
(define_insn "vcmpneh"
|
|
|
0e3697 |
[(set (match_operand:V8HI 0 "altivec_register_operand" "=v")
|
|
|
0e3697 |
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
|
|
|
0e3697 |
index f2dd12b3d73..291e414fea2 100644
|
|
|
0e3697 |
--- a/gcc/doc/invoke.texi
|
|
|
0e3697 |
+++ b/gcc/doc/invoke.texi
|
|
|
0e3697 |
@@ -24165,12 +24165,10 @@ target-specific.
|
|
|
0e3697 |
|
|
|
0e3697 |
@item -mstring-compare-inline-limit=@var{num}
|
|
|
0e3697 |
@opindex mstring-compare-inline-limit
|
|
|
0e3697 |
-Generate at most @var{num} pairs of load instructions to compare the
|
|
|
0e3697 |
-string inline. If the difference or end of string is not found at the
|
|
|
0e3697 |
+Compare at most @var{num} string bytes with inline code.
|
|
|
0e3697 |
+If the difference or end of string is not found at the
|
|
|
0e3697 |
end of the inline compare a call to @code{strcmp} or @code{strncmp} will
|
|
|
0e3697 |
-take care of the rest of the comparison. The default is 8 pairs of
|
|
|
0e3697 |
-loads, which will compare 64 bytes on a 64-bit target and 32 bytes on a
|
|
|
0e3697 |
-32-bit target.
|
|
|
0e3697 |
+take care of the rest of the comparison. The default is 64 bytes.
|
|
|
0e3697 |
|
|
|
0e3697 |
@item -G @var{num}
|
|
|
0e3697 |
@opindex G
|