2005-12-18 Alexandre Oliva <aoliva@redhat.com>
* optabs.c (expand_vector_binop): Do not use a SUBREG to modify
a subword in the output if it matches any of the inputs.
2006-04-20 Jakub Jelinek <jakub@redhat.com>
* gcc.c-torture/execute/20060420-1.c: New test.
--- gcc/optabs.c.orig 2005-11-21 11:43:20.000000000 -0200
+++ gcc/optabs.c 2005-12-18 18:35:14.000000000 -0200
@@ -1933,16 +1933,19 @@
for (i = 0; i < elts; ++i)
{
- /* If this is part of a register, and not the first item in the
- word, we can't store using a SUBREG - that would clobber
- previous results.
+ /* If this is part of a register, and not the first item in
+ the word, we can't store using a SUBREG - that would
+ clobber previous results, or even the input operands, if
+ target matches any of them.
And storing with a SUBREG is only possible for the least
significant part, hence we can't do it for big endian
(unless we want to permute the evaluation order. */
if (GET_CODE (target) == REG
&& (BYTES_BIG_ENDIAN
? subsize < UNITS_PER_WORD
- : ((i * subsize) % UNITS_PER_WORD) != 0))
+ : (((i * subsize) % UNITS_PER_WORD) != 0
+ || (subsize < UNITS_PER_WORD
+ && (target == op0 || target == op1)))))
t = NULL_RTX;
else
t = simplify_gen_subreg (submode, target, mode, i * subsize);
--- gcc/testsuite/gcc.c-torture/execute/20060420-1.c.jj 2006-04-20 18:47:19.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/execute/20060420-1.c 2006-04-20 19:07:20.000000000 +0200
@@ -0,0 +1,71 @@
+extern void abort (void);
+
+typedef float v4flt __attribute__ ((vector_size (16)));
+
+void __attribute__ ((noinline)) foo (float *dst, float **src, int a, int n)
+{
+ int i, j;
+ int z = sizeof (v4flt) / sizeof (float);
+ unsigned m = sizeof (v4flt) - 1;
+
+ for (j = 0; j < n && (((unsigned long) dst + j) & m); ++j)
+ {
+ float t = src[0][j];
+ for (i = 1; i < a; ++i)
+ t += src[i][j];
+ dst[j] = t;
+ }
+
+ for (; j < (n - (4 * z - 1)); j += 4 * z)
+ {
+ v4flt t0 = *(v4flt *) (src[0] + j + 0 * z);
+ v4flt t1 = *(v4flt *) (src[0] + j + 1 * z);
+ v4flt t2 = *(v4flt *) (src[0] + j + 2 * z);
+ v4flt t3 = *(v4flt *) (src[0] + j + 3 * z);
+ for (i = 1; i < a; ++i)
+ {
+ t0 += *(v4flt *) (src[i] + j + 0 * z);
+ t1 += *(v4flt *) (src[i] + j + 1 * z);
+ t2 += *(v4flt *) (src[i] + j + 2 * z);
+ t3 += *(v4flt *) (src[i] + j + 3 * z);
+ }
+ *(v4flt *) (dst + j + 0 * z) = t0;
+ *(v4flt *) (dst + j + 1 * z) = t1;
+ *(v4flt *) (dst + j + 2 * z) = t2;
+ *(v4flt *) (dst + j + 3 * z) = t3;
+ }
+ for (; j < n; ++j)
+ {
+ float t = src[0][j];
+ for (i = 1; i < a; ++i)
+ t += src[i][j];
+ dst[j] = t;
+ }
+}
+
+float buffer[64];
+
+int
+main (void)
+{
+ int i;
+ float *dst, *src[2];
+
+ dst = buffer;
+ dst += (-(long int) buffer & (16 * sizeof (float) - 1)) / sizeof (float);
+ src[0] = dst + 16;
+ src[1] = dst + 32;
+ for (i = 0; i < 16; ++i)
+ {
+ src[0][i] = (float) i + 11 * (float) i;
+ src[1][i] = (float) i + 12 * (float) i;
+ }
+ foo (dst, src, 2, 16);
+ for (i = 0; i < 16; ++i)
+ {
+ float e = (float) i + 11 * (float) i + (float) i + 12 * (float) i;
+ if (dst[i] != e)
+ abort ();
+ }
+ return 0;
+}