Blame SOURCES/0001-gallivm-Fix-Altivec-pack-intrinsics-for-little-endia.patch

ede8cb
From 0feb977bbfb0d6bb2c8d3178246acb035a739f37 Mon Sep 17 00:00:00 2001
ede8cb
From: Ulrich Weigand <uweigand@de.ibm.com>
ede8cb
Date: Mon, 4 Aug 2014 18:41:00 +0200
ede8cb
Subject: [PATCH] gallivm: Fix Altivec pack intrinsics for little-endian
ede8cb
ede8cb
This patch fixes use of Altivec pack intrinsics on little-endian PowerPC
ede8cb
systems.  Since little-endian operation only affects the load and store
ede8cb
instructions, the semantics of pack (and other) instructions that take
ede8cb
two input vectors implicitly change: the pack instructions still fill
ede8cb
a register placing values from the first operand into the "high" parts
ede8cb
of the register, and values from the second operand into the "low" parts
ede8cb
of the register, but since vector loads and stores perform an endian swap,
ede8cb
the high parts end up at high memory addresses.
ede8cb
ede8cb
To still achieve the desired effect, we have to swap the two inputs to
ede8cb
the pack instruction on little-endian systems.  This is done automatically
ede8cb
by the back-end for instructions generated by LLVM, but needs to be done
ede8cb
manually when emitting intrinsics (which still result in that instruction
ede8cb
being emitted directly).
ede8cb
ede8cb
Signed-off-by: Ulrich Weigand <ulrich.weigand@de.ibm.com>
ede8cb
Signed-off-by: Maarten Lankhorst <dev@mblankhorst.nl>
ede8cb
---
ede8cb
 src/gallium/auxiliary/gallivm/lp_bld_pack.c | 26 +++++++++++++++++++++-----
ede8cb
 1 file changed, 21 insertions(+), 5 deletions(-)
ede8cb
ede8cb
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
ede8cb
index a48a922..cdf6d80 100644
ede8cb
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
ede8cb
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
ede8cb
@@ -464,6 +464,7 @@ lp_build_pack2(struct gallivm_state *gallivm,
ede8cb
    if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
ede8cb
        src_type.width * src_type.length >= 128) {
ede8cb
       const char *intrinsic = NULL;
ede8cb
+      boolean swap_intrinsic_operands = FALSE;
ede8cb
 
ede8cb
       switch(src_type.width) {
ede8cb
       case 32:
ede8cb
@@ -482,6 +483,9 @@ lp_build_pack2(struct gallivm_state *gallivm,
ede8cb
            } else {
ede8cb
               intrinsic = "llvm.ppc.altivec.vpkuwus";
ede8cb
            }
ede8cb
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
ede8cb
+           swap_intrinsic_operands = TRUE;
ede8cb
+#endif
ede8cb
          }
ede8cb
          break;
ede8cb
       case 16:
ede8cb
@@ -490,12 +494,18 @@ lp_build_pack2(struct gallivm_state *gallivm,
ede8cb
               intrinsic = "llvm.x86.sse2.packsswb.128";
ede8cb
             } else if (util_cpu_caps.has_altivec) {
ede8cb
               intrinsic = "llvm.ppc.altivec.vpkshss";
ede8cb
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
ede8cb
+              swap_intrinsic_operands = TRUE;
ede8cb
+#endif
ede8cb
             }
ede8cb
          } else {
ede8cb
             if (util_cpu_caps.has_sse2) {
ede8cb
               intrinsic = "llvm.x86.sse2.packuswb.128";
ede8cb
             } else if (util_cpu_caps.has_altivec) {
ede8cb
 	      intrinsic = "llvm.ppc.altivec.vpkshus";
ede8cb
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
ede8cb
+              swap_intrinsic_operands = TRUE;
ede8cb
+#endif
ede8cb
             }
ede8cb
          }
ede8cb
          break;
ede8cb
@@ -504,7 +514,11 @@ lp_build_pack2(struct gallivm_state *gallivm,
ede8cb
       if (intrinsic) {
ede8cb
          if (src_type.width * src_type.length == 128) {
ede8cb
             LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
ede8cb
-            res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
ede8cb
+            if (swap_intrinsic_operands) {
ede8cb
+               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, hi, lo);
ede8cb
+            } else {
ede8cb
+               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
ede8cb
+            }
ede8cb
             if (dst_vec_type != intr_vec_type) {
ede8cb
                res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
ede8cb
             }
ede8cb
@@ -513,6 +527,8 @@ lp_build_pack2(struct gallivm_state *gallivm,
ede8cb
             int num_split = src_type.width * src_type.length / 128;
ede8cb
             int i;
ede8cb
             int nlen = 128 / src_type.width;
ede8cb
+            int lo_off = swap_intrinsic_operands ? nlen : 0;
ede8cb
+            int hi_off = swap_intrinsic_operands ? 0 : nlen;
ede8cb
             struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
ede8cb
             struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
ede8cb
             LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
ede8cb
@@ -524,9 +540,9 @@ lp_build_pack2(struct gallivm_state *gallivm,
ede8cb
 
ede8cb
             for (i = 0; i < num_split / 2; i++) {
ede8cb
                tmplo = lp_build_extract_range(gallivm,
ede8cb
-                                              lo, i*nlen*2, nlen);
ede8cb
+                                              lo, i*nlen*2 + lo_off, nlen);
ede8cb
                tmphi = lp_build_extract_range(gallivm,
ede8cb
-                                              lo, i*nlen*2 + nlen, nlen);
ede8cb
+                                              lo, i*nlen*2 + hi_off, nlen);
ede8cb
                tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
ede8cb
                                                      nintr_vec_type, tmplo, tmphi);
ede8cb
                if (ndst_vec_type != nintr_vec_type) {
ede8cb
@@ -535,9 +551,9 @@ lp_build_pack2(struct gallivm_state *gallivm,
ede8cb
             }
ede8cb
             for (i = 0; i < num_split / 2; i++) {
ede8cb
                tmplo = lp_build_extract_range(gallivm,
ede8cb
-                                              hi, i*nlen*2, nlen);
ede8cb
+                                              hi, i*nlen*2 + lo_off, nlen);
ede8cb
                tmphi = lp_build_extract_range(gallivm,
ede8cb
-                                              hi, i*nlen*2 + nlen, nlen);
ede8cb
+                                              hi, i*nlen*2 + hi_off, nlen);
ede8cb
                tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
ede8cb
                                                                  nintr_vec_type,
ede8cb
                                                                  tmplo, tmphi);
ede8cb
-- 
ede8cb
1.9.3
ede8cb