|
|
ede8cb |
From 0feb977bbfb0d6bb2c8d3178246acb035a739f37 Mon Sep 17 00:00:00 2001
|
|
|
ede8cb |
From: Ulrich Weigand <uweigand@de.ibm.com>
|
|
|
ede8cb |
Date: Mon, 4 Aug 2014 18:41:00 +0200
|
|
|
ede8cb |
Subject: [PATCH] gallivm: Fix Altivec pack intrinsics for little-endian
|
|
|
ede8cb |
|
|
|
ede8cb |
This patch fixes use of Altivec pack intrinsics on little-endian PowerPC
|
|
|
ede8cb |
systems. Since little-endian operation only affects the load and store
|
|
|
ede8cb |
instructions, the semantics of pack (and other) instructions that take
|
|
|
ede8cb |
two input vectors implicitly change: the pack instructions still fill
|
|
|
ede8cb |
a register placing values from the first operand into the "high" parts
|
|
|
ede8cb |
of the register, and values from the second operand into the "low" parts
|
|
|
ede8cb |
of the register, but since vector loads and stores perform an endian swap,
|
|
|
ede8cb |
the high parts end up at high memory addresses.
|
|
|
ede8cb |
|
|
|
ede8cb |
To still achieve the desired effect, we have to swap the two inputs to
|
|
|
ede8cb |
the pack instruction on little-endian systems. This is done automatically
|
|
|
ede8cb |
by the back-end for instructions generated by LLVM, but needs to be done
|
|
|
ede8cb |
manually when emitting intrinsics (which still result in that instruction
|
|
|
ede8cb |
being emitted directly).
|
|
|
ede8cb |
|
|
|
ede8cb |
Signed-off-by: Ulrich Weigand <ulrich.weigand@de.ibm.com>
|
|
|
ede8cb |
Signed-off-by: Maarten Lankhorst <dev@mblankhorst.nl>
|
|
|
ede8cb |
---
|
|
|
ede8cb |
src/gallium/auxiliary/gallivm/lp_bld_pack.c | 26 +++++++++++++++++++++-----
|
|
|
ede8cb |
1 file changed, 21 insertions(+), 5 deletions(-)
|
|
|
ede8cb |
|
|
|
ede8cb |
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
|
|
|
ede8cb |
index a48a922..cdf6d80 100644
|
|
|
ede8cb |
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
|
|
|
ede8cb |
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
|
|
|
ede8cb |
@@ -464,6 +464,7 @@ lp_build_pack2(struct gallivm_state *gallivm,
|
|
|
ede8cb |
if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
|
|
|
ede8cb |
src_type.width * src_type.length >= 128) {
|
|
|
ede8cb |
const char *intrinsic = NULL;
|
|
|
ede8cb |
+ boolean swap_intrinsic_operands = FALSE;
|
|
|
ede8cb |
|
|
|
ede8cb |
switch(src_type.width) {
|
|
|
ede8cb |
case 32:
|
|
|
ede8cb |
@@ -482,6 +483,9 @@ lp_build_pack2(struct gallivm_state *gallivm,
|
|
|
ede8cb |
} else {
|
|
|
ede8cb |
intrinsic = "llvm.ppc.altivec.vpkuwus";
|
|
|
ede8cb |
}
|
|
|
ede8cb |
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
|
|
|
ede8cb |
+ swap_intrinsic_operands = TRUE;
|
|
|
ede8cb |
+#endif
|
|
|
ede8cb |
}
|
|
|
ede8cb |
break;
|
|
|
ede8cb |
case 16:
|
|
|
ede8cb |
@@ -490,12 +494,18 @@ lp_build_pack2(struct gallivm_state *gallivm,
|
|
|
ede8cb |
intrinsic = "llvm.x86.sse2.packsswb.128";
|
|
|
ede8cb |
} else if (util_cpu_caps.has_altivec) {
|
|
|
ede8cb |
intrinsic = "llvm.ppc.altivec.vpkshss";
|
|
|
ede8cb |
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
|
|
|
ede8cb |
+ swap_intrinsic_operands = TRUE;
|
|
|
ede8cb |
+#endif
|
|
|
ede8cb |
}
|
|
|
ede8cb |
} else {
|
|
|
ede8cb |
if (util_cpu_caps.has_sse2) {
|
|
|
ede8cb |
intrinsic = "llvm.x86.sse2.packuswb.128";
|
|
|
ede8cb |
} else if (util_cpu_caps.has_altivec) {
|
|
|
ede8cb |
intrinsic = "llvm.ppc.altivec.vpkshus";
|
|
|
ede8cb |
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
|
|
|
ede8cb |
+ swap_intrinsic_operands = TRUE;
|
|
|
ede8cb |
+#endif
|
|
|
ede8cb |
}
|
|
|
ede8cb |
}
|
|
|
ede8cb |
break;
|
|
|
ede8cb |
@@ -504,7 +514,11 @@ lp_build_pack2(struct gallivm_state *gallivm,
|
|
|
ede8cb |
if (intrinsic) {
|
|
|
ede8cb |
if (src_type.width * src_type.length == 128) {
|
|
|
ede8cb |
LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
|
|
|
ede8cb |
- res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
|
|
|
ede8cb |
+ if (swap_intrinsic_operands) {
|
|
|
ede8cb |
+ res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, hi, lo);
|
|
|
ede8cb |
+ } else {
|
|
|
ede8cb |
+ res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
|
|
|
ede8cb |
+ }
|
|
|
ede8cb |
if (dst_vec_type != intr_vec_type) {
|
|
|
ede8cb |
res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
|
|
|
ede8cb |
}
|
|
|
ede8cb |
@@ -513,6 +527,8 @@ lp_build_pack2(struct gallivm_state *gallivm,
|
|
|
ede8cb |
int num_split = src_type.width * src_type.length / 128;
|
|
|
ede8cb |
int i;
|
|
|
ede8cb |
int nlen = 128 / src_type.width;
|
|
|
ede8cb |
+ int lo_off = swap_intrinsic_operands ? nlen : 0;
|
|
|
ede8cb |
+ int hi_off = swap_intrinsic_operands ? 0 : nlen;
|
|
|
ede8cb |
struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
|
|
|
ede8cb |
struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
|
|
|
ede8cb |
LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
|
|
|
ede8cb |
@@ -524,9 +540,9 @@ lp_build_pack2(struct gallivm_state *gallivm,
|
|
|
ede8cb |
|
|
|
ede8cb |
for (i = 0; i < num_split / 2; i++) {
|
|
|
ede8cb |
tmplo = lp_build_extract_range(gallivm,
|
|
|
ede8cb |
- lo, i*nlen*2, nlen);
|
|
|
ede8cb |
+ lo, i*nlen*2 + lo_off, nlen);
|
|
|
ede8cb |
tmphi = lp_build_extract_range(gallivm,
|
|
|
ede8cb |
- lo, i*nlen*2 + nlen, nlen);
|
|
|
ede8cb |
+ lo, i*nlen*2 + hi_off, nlen);
|
|
|
ede8cb |
tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
|
|
|
ede8cb |
nintr_vec_type, tmplo, tmphi);
|
|
|
ede8cb |
if (ndst_vec_type != nintr_vec_type) {
|
|
|
ede8cb |
@@ -535,9 +551,9 @@ lp_build_pack2(struct gallivm_state *gallivm,
|
|
|
ede8cb |
}
|
|
|
ede8cb |
for (i = 0; i < num_split / 2; i++) {
|
|
|
ede8cb |
tmplo = lp_build_extract_range(gallivm,
|
|
|
ede8cb |
- hi, i*nlen*2, nlen);
|
|
|
ede8cb |
+ hi, i*nlen*2 + lo_off, nlen);
|
|
|
ede8cb |
tmphi = lp_build_extract_range(gallivm,
|
|
|
ede8cb |
- hi, i*nlen*2 + nlen, nlen);
|
|
|
ede8cb |
+ hi, i*nlen*2 + hi_off, nlen);
|
|
|
ede8cb |
tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
|
|
|
ede8cb |
nintr_vec_type,
|
|
|
ede8cb |
tmplo, tmphi);
|
|
|
ede8cb |
--
|
|
|
ede8cb |
1.9.3
|
|
|
ede8cb |
|