2005-12-18  Alexandre Oliva

	* optabs.c (expand_vector_binop): Do not use a SUBREG to modify
	a subword in the output if it matches any of the inputs.

2006-04-20  Jakub Jelinek

	* gcc.c-torture/execute/20060420-1.c: New test.

--- gcc/optabs.c.orig	2005-11-21 11:43:20.000000000 -0200
+++ gcc/optabs.c	2005-12-18 18:35:14.000000000 -0200
@@ -1933,16 +1933,19 @@
 
       for (i = 0; i < elts; ++i)
 	{
-	  /* If this is part of a register, and not the first item in the
-	     word, we can't store using a SUBREG - that would clobber
-	     previous results.
+	  /* If this is part of a register, and not the first item in
+	     the word, we can't store using a SUBREG - that would
+	     clobber previous results, or even the input operands, if
+	     target matches any of them.
 	     And storing with a SUBREG is only possible for the least
 	     significant part, hence we can't do it for big endian
 	     (unless we want to permute the evaluation order.  */
 	  if (GET_CODE (target) == REG
 	      && (BYTES_BIG_ENDIAN
 		  ? subsize < UNITS_PER_WORD
-		  : ((i * subsize) % UNITS_PER_WORD) != 0))
+		  : (((i * subsize) % UNITS_PER_WORD) != 0
+		     || (subsize < UNITS_PER_WORD
+			 && (target == op0 || target == op1)))))
 	    t = NULL_RTX;
 	  else
 	    t = simplify_gen_subreg (submode, target, mode, i * subsize);
--- gcc/testsuite/gcc.c-torture/execute/20060420-1.c.jj	2006-04-20 18:47:19.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/execute/20060420-1.c	2006-04-20 19:07:20.000000000 +0200
@@ -0,0 +1,71 @@
+extern void abort (void);
+
+typedef float v4flt __attribute__ ((vector_size (16)));
+
+void __attribute__ ((noinline)) foo (float *dst, float **src, int a, int n)
+{
+  int i, j;
+  int z = sizeof (v4flt) / sizeof (float);
+  unsigned m = sizeof (v4flt) - 1;
+
+  for (j = 0; j < n && (((unsigned long) dst + j) & m); ++j)
+    {
+      float t = src[0][j];
+      for (i = 1; i < a; ++i)
+        t += src[i][j];
+      dst[j] = t;
+    }
+
+  for (; j < (n - (4 * z - 1)); j += 4 * z)
+    {
+      v4flt t0 = *(v4flt *) (src[0] + j + 0 * z);
+      v4flt t1 = *(v4flt *) (src[0] + j + 1 * z);
+      v4flt t2 = *(v4flt *) (src[0] + j + 2 * z);
+      v4flt t3 = *(v4flt *) (src[0] + j + 3 * z);
+      for (i = 1; i < a; ++i)
+        {
+          t0 += *(v4flt *) (src[i] + j + 0 * z);
+          t1 += *(v4flt *) (src[i] + j + 1 * z);
+          t2 += *(v4flt *) (src[i] + j + 2 * z);
+          t3 += *(v4flt *) (src[i] + j + 3 * z);
+        }
+      *(v4flt *) (dst + j + 0 * z) = t0;
+      *(v4flt *) (dst + j + 1 * z) = t1;
+      *(v4flt *) (dst + j + 2 * z) = t2;
+      *(v4flt *) (dst + j + 3 * z) = t3;
+    }
+  for (; j < n; ++j)
+    {
+      float t = src[0][j];
+      for (i = 1; i < a; ++i)
+        t += src[i][j];
+      dst[j] = t;
+    }
+}
+
+float buffer[64];
+
+int
+main (void)
+{
+  int i;
+  float *dst, *src[2];
+
+  dst = buffer;
+  dst += (-(long int) buffer & (16 * sizeof (float) - 1)) / sizeof (float);
+  src[0] = dst + 16;
+  src[1] = dst + 32;
+  for (i = 0; i < 16; ++i)
+    {
+      src[0][i] = (float) i + 11 * (float) i;
+      src[1][i] = (float) i + 12 * (float) i;
+    }
+  foo (dst, src, 2, 16);
+  for (i = 0; i < 16; ++i)
+    {
+      float e = (float) i + 11 * (float) i + (float) i + 12 * (float) i;
+      if (dst[i] != e)
+        abort ();
+    }
+  return 0;
+}