2005-12-18  Alexandre Oliva  <aoliva@redhat.com>

	* optabs.c (expand_vector_binop): Do not use a SUBREG to modify
	a subword in the output if it matches any of the inputs.

2006-04-20  Jakub Jelinek  <jakub@redhat.com>

	* gcc.c-torture/execute/20060420-1.c: New test.

--- gcc/optabs.c.orig	2005-11-21 11:43:20.000000000 -0200
+++ gcc/optabs.c	2005-12-18 18:35:14.000000000 -0200
@@ -1933,16 +1933,19 @@
 
       for (i = 0; i < elts; ++i)
 	{
-	  /* If this is part of a register, and not the first item in the
-	     word, we can't store using a SUBREG - that would clobber
-	     previous results.
+	  /* If this is part of a register, we can't store using a
+	     SUBREG when it is not the first item in the word - that
+	     would clobber previous results - nor, for sub-word items,
+	     when target matches an input, as that would clobber it.
 	     And storing with a SUBREG is only possible for the least
 	     significant part, hence we can't do it for big endian
 	     (unless we want to permute the evaluation order.  */
 	  if (GET_CODE (target) == REG
 	      && (BYTES_BIG_ENDIAN
 		  ? subsize < UNITS_PER_WORD
-		  : ((i * subsize) % UNITS_PER_WORD) != 0))
+		  : (((i * subsize) % UNITS_PER_WORD) != 0
+		     || (subsize < UNITS_PER_WORD
+			 && (target == op0 || target == op1)))))
 	    t = NULL_RTX;
 	  else
 	    t = simplify_gen_subreg (submode, target, mode, i * subsize);
--- gcc/testsuite/gcc.c-torture/execute/20060420-1.c.jj	2006-04-20 18:47:19.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/execute/20060420-1.c	2006-04-20 19:07:20.000000000 +0200
@@ -0,0 +1,71 @@
+extern void abort (void);
+/* Sums float rows using v4flt additions whose result is also an operand.  */
+typedef float v4flt __attribute__ ((vector_size (16)));  /* 16-byte vector of floats.  */
+
+void __attribute__ ((noinline)) foo (float *dst, float **src, int a, int n)
+{  /* Store in dst[j] the sum of src[0][j] .. src[a-1][j], for j in [0, n).  */
+  int i, j;
+  int z = sizeof (v4flt) / sizeof (float);  /* Floats per v4flt.  */
+  unsigned m = sizeof (v4flt) - 1;  /* Low-bit mask for the vector size in bytes.  */
+  /* Scalar head loop; note j is added to dst's integer value unscaled.  */
+  for (j = 0; j < n && (((unsigned long) dst + j) & m); ++j)
+    {
+      float t = src[0][j];
+      for (i = 1; i < a; ++i)
+	t += src[i][j];
+      dst[j] = t;
+    }
+  /* Main loop: four v4flt vectors, i.e. 4 * z floats, per iteration.  */
+  for (; j < (n - (4 * z - 1)); j += 4 * z)
+    {
+      v4flt t0 = *(v4flt *) (src[0] + j + 0 * z);
+      v4flt t1 = *(v4flt *) (src[0] + j + 1 * z);
+      v4flt t2 = *(v4flt *) (src[0] + j + 2 * z);
+      v4flt t3 = *(v4flt *) (src[0] + j + 3 * z);
+      for (i = 1; i < a; ++i)
+	{
+	  t0 += *(v4flt *) (src[i] + j + 0 * z);  /* Result is also the first operand.  */
+	  t1 += *(v4flt *) (src[i] + j + 1 * z);
+	  t2 += *(v4flt *) (src[i] + j + 2 * z);
+	  t3 += *(v4flt *) (src[i] + j + 3 * z);
+	}
+      *(v4flt *) (dst + j + 0 * z) = t0;
+      *(v4flt *) (dst + j + 1 * z) = t1;
+      *(v4flt *) (dst + j + 2 * z) = t2;
+      *(v4flt *) (dst + j + 3 * z) = t3;
+    }
+  for (; j < n; ++j)  /* Scalar loop for the remaining elements.  */
+    {
+      float t = src[0][j];
+      for (i = 1; i < a; ++i)
+	t += src[i][j];
+      dst[j] = t;
+    }
+}
+
+float buffer[64];
+
+int
+main (void)
+{
+  int i;
+  float *dst, *src[2];
+  /* Align dst within buffer, fill two input rows, then check foo's sums.  */
+  dst = buffer;  /* Next line rounds dst up to a 16 * sizeof (float) byte boundary.  */
+  dst += (-(long int) buffer & (16 * sizeof (float) - 1)) / sizeof (float);
+  src[0] = dst + 16;  /* Input rows follow dst in buffer, 16 floats apart.  */
+  src[1] = dst + 32;
+  for (i = 0; i < 16; ++i)
+    {
+      src[0][i] = (float) i + 11 * (float) i;
+      src[1][i] = (float) i + 12 * (float) i;
+    }
+  foo (dst, src, 2, 16);  /* Sum the two 16-element rows into dst.  */
+  for (i = 0; i < 16; ++i)
+    {
+      float e = (float) i + 11 * (float) i + (float) i + 12 * (float) i;  /* src[0][i] + src[1][i].  */
+      if (dst[i] != e)
+	abort ();  /* Computed sum differs from the expected value.  */
+    }
+  return 0;
+}