diff --git a/.gitignore b/.gitignore
index f1fe27b..94c9bc6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1 @@
-SOURCES/mesa-20.1.2.tar.xz
+SOURCES/mesa-20.1.4.tar.xz
diff --git a/.mesa.metadata b/.mesa.metadata
index 87f9f7f..ca12abc 100644
--- a/.mesa.metadata
+++ b/.mesa.metadata
@@ -1 +1 @@
-b90fe9ca8c3bdad043e86cd1af93bcf83e1da3fb SOURCES/mesa-20.1.2.tar.xz
+78243cd7152a8ba759f8f2bdfcf0a877b455e351 SOURCES/mesa-20.1.4.tar.xz
diff --git a/SOURCES/0001-gallivm-nir-fix-big-endian-64-bit-splitting-merging.patch b/SOURCES/0001-gallivm-nir-fix-big-endian-64-bit-splitting-merging.patch
deleted file mode 100644
index 30fc63d..0000000
--- a/SOURCES/0001-gallivm-nir-fix-big-endian-64-bit-splitting-merging.patch
+++ /dev/null
@@ -1,45 +0,0 @@
-From fcf3f45728a22250ad15db7e230545147fc28c2e Mon Sep 17 00:00:00 2001
-From: Dave Airlie <airlied@redhat.com>
-Date: Mon, 29 Jun 2020 14:59:20 +1000
-Subject: [PATCH] gallivm/nir: fix big-endian 64-bit splitting/merging.
-
-The shuffles need to be swapped to do this properly on big-endian
----
- src/gallium/auxiliary/gallivm/lp_bld_nir.c | 10 ++++++++++
- 1 file changed, 10 insertions(+)
-
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir.c b/src/gallium/auxiliary/gallivm/lp_bld_nir.c
-index f14475e839d..2c4135ccc05 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_nir.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_nir.c
-@@ -353,8 +353,13 @@ static LLVMValueRef split_64bit(struct lp_build_nir_context *bld_base,
-    LLVMValueRef shuffles2[LP_MAX_VECTOR_WIDTH/32];
-    int len = bld_base->base.type.length * 2;
-    for (unsigned i = 0; i < bld_base->base.type.length; i++) {
-+#if UTIL_ARCH_LITTLE_ENDIAN
-       shuffles[i] = lp_build_const_int32(gallivm, i * 2);
-       shuffles2[i] = lp_build_const_int32(gallivm, (i * 2) + 1);
-+#else
-+      shuffles[i] = lp_build_const_int32(gallivm, (i * 2) + 1);
-+      shuffles2[i] = lp_build_const_int32(gallivm, (i * 2));
-+#endif
-    }
- 
-    src = LLVMBuildBitCast(gallivm->builder, src, LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), len), "");
-@@ -378,8 +383,13 @@ merge_64bit(struct lp_build_nir_context *bld_base,
-    assert(len <= (2 * (LP_MAX_VECTOR_WIDTH/32)));
- 
-    for (i = 0; i < bld_base->base.type.length * 2; i+=2) {
-+#if UTIL_ARCH_LITTLE_ENDIAN
-       shuffles[i] = lp_build_const_int32(gallivm, i / 2);
-       shuffles[i + 1] = lp_build_const_int32(gallivm, i / 2 + bld_base->base.type.length);
-+#else
-+      shuffles[i] = lp_build_const_int32(gallivm, i / 2 + bld_base->base.type.length);
-+      shuffles[i + 1] = lp_build_const_int32(gallivm, i / 2);
-+#endif
-    }
-    return LLVMBuildShuffleVector(builder, input, input2, LLVMConstVector(shuffles, len), "");
- }
--- 
-2.26.2
-
diff --git a/SOURCES/0001-gallivm-nir-fix-const-loading-on-big-endian-systems.patch b/SOURCES/0001-gallivm-nir-fix-const-loading-on-big-endian-systems.patch
deleted file mode 100644
index 33c573f..0000000
--- a/SOURCES/0001-gallivm-nir-fix-const-loading-on-big-endian-systems.patch
+++ /dev/null
@@ -1,33 +0,0 @@
-From ea7bf3941eeef8320c711a6f66b5e73077cc6e6b Mon Sep 17 00:00:00 2001
-From: Dave Airlie <airlied@redhat.com>
-Date: Mon, 29 Jun 2020 07:40:13 +1000
-Subject: [PATCH] gallivm/nir: fix const loading on big endian systems
-
-The code was expecting the lower 32-bits of the 64-bit to be
-what it wanted, don't be implicit, pull the value from the union.
-
-This should fix rendering on big endian systems since NIR was
-introduced.
-
-Fixes: 44a6b0107b37 ("gallivm: add nir->llvm translation (v2)")
-Reviewed-by: Timothy Arceri <tarceri@itsqueeze.com>
----
- src/gallium/auxiliary/gallivm/lp_bld_nir.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir.c b/src/gallium/auxiliary/gallivm/lp_bld_nir.c
-index 9aa582a0e8a..f14475e839d 100644
---- a/src/gallium/auxiliary/gallivm/lp_bld_nir.c
-+++ b/src/gallium/auxiliary/gallivm/lp_bld_nir.c
-@@ -865,7 +865,7 @@ static void visit_load_const(struct lp_build_nir_context *bld_base,
-    LLVMValueRef result[NIR_MAX_VEC_COMPONENTS];
-    struct lp_build_context *int_bld = get_int_bld(bld_base, true, instr->def.bit_size);
-    for (unsigned i = 0; i < instr->def.num_components; i++)
--      result[i] = lp_build_const_int_vec(bld_base->base.gallivm, int_bld->type, instr->value[i].u64);
-+      result[i] = lp_build_const_int_vec(bld_base->base.gallivm, int_bld->type, instr->def.bit_size == 32 ? instr->value[i].u32 : instr->value[i].u64);
-    assign_ssa_dest(bld_base, &instr->def, result);
- }
- 
--- 
-2.26.2
-
diff --git a/SOURCES/0001-glsl-fix-constant-packing-for-64-bit-big-endian.patch b/SOURCES/0001-glsl-fix-constant-packing-for-64-bit-big-endian.patch
deleted file mode 100644
index 4e37ce3..0000000
--- a/SOURCES/0001-glsl-fix-constant-packing-for-64-bit-big-endian.patch
+++ /dev/null
@@ -1,81 +0,0 @@
-From 5fc0b580cecb1529659d5d3719412fb7cbffac0d Mon Sep 17 00:00:00 2001
-From: Dave Airlie <airlied@redhat.com>
-Date: Mon, 29 Jun 2020 13:26:56 +1000
-Subject: [PATCH] glsl: fix constant packing for 64-bit big endian.
-
-In a piglit run on s390 a lot of double tests fail, explicitly
-packing/shifting things rather than using memcpy seems to help
----
- src/compiler/glsl/ir_constant_expression.cpp | 15 +++++++++++++++
- src/compiler/glsl/ir_expression_operation.py | 20 ++++++++++----------
- 2 files changed, 25 insertions(+), 10 deletions(-)
-
-diff --git a/src/compiler/glsl/ir_constant_expression.cpp b/src/compiler/glsl/ir_constant_expression.cpp
-index 636196886b3..595cc821797 100644
---- a/src/compiler/glsl/ir_constant_expression.cpp
-+++ b/src/compiler/glsl/ir_constant_expression.cpp
-@@ -452,6 +452,21 @@ isub64_saturate(int64_t a, int64_t b)
-    return a - b;
- }
- 
-+static uint64_t
-+pack_2x32(uint32_t a, uint32_t b)
-+{
-+   uint64_t v = a;
-+   v |= (uint64_t)b << 32;
-+   return v;
-+}
-+
-+static void
-+unpack_2x32(uint64_t p, uint32_t *a, uint32_t *b)
-+{
-+   *a = p & 0xffffffff;
-+   *b = (p >> 32);
-+}
-+
- /**
-  * Get the constant that is ultimately referenced by an r-value, in a constant
-  * expression evaluation context.
-diff --git a/src/compiler/glsl/ir_expression_operation.py b/src/compiler/glsl/ir_expression_operation.py
-index d2c4d41024f..1c4e6b358e1 100644
---- a/src/compiler/glsl/ir_expression_operation.py
-+++ b/src/compiler/glsl/ir_expression_operation.py
-@@ -560,14 +560,14 @@ ir_expression_operation = [
-    operation("saturate", 1, printable_name="sat", source_types=(float_type,), c_expression="CLAMP({src0}, 0.0f, 1.0f)"),
- 
-    # Double packing, part of ARB_gpu_shader_fp64.
--   operation("pack_double_2x32", 1, printable_name="packDouble2x32", source_types=(uint_type,), dest_type=double_type, c_expression="memcpy(&data.d[0], &op[0]->value.u[0], sizeof(double))", flags=frozenset((horizontal_operation, non_assign_operation))),
--   operation("unpack_double_2x32", 1, printable_name="unpackDouble2x32", source_types=(double_type,), dest_type=uint_type, c_expression="memcpy(&data.u[0], &op[0]->value.d[0], sizeof(double))", flags=frozenset((horizontal_operation, non_assign_operation))),
-+   operation("pack_double_2x32", 1, printable_name="packDouble2x32", source_types=(uint_type,), dest_type=double_type, c_expression="data.u64[0] = pack_2x32(op[0]->value.u[0], op[0]->value.u[1])", flags=frozenset((horizontal_operation, non_assign_operation))),
-+   operation("unpack_double_2x32", 1, printable_name="unpackDouble2x32", source_types=(double_type,), dest_type=uint_type, c_expression="unpack_2x32(op[0]->value.u64[0], &data.u[0], &data.u[1])", flags=frozenset((horizontal_operation, non_assign_operation))),
- 
-    # Sampler/Image packing, part of ARB_bindless_texture.
--   operation("pack_sampler_2x32", 1, printable_name="packSampler2x32", source_types=(uint_type,), dest_type=uint64_type, c_expression="memcpy(&data.u64[0], &op[0]->value.u[0], sizeof(uint64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
--   operation("pack_image_2x32", 1, printable_name="packImage2x32", source_types=(uint_type,), dest_type=uint64_type, c_expression="memcpy(&data.u64[0], &op[0]->value.u[0], sizeof(uint64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
--   operation("unpack_sampler_2x32", 1, printable_name="unpackSampler2x32", source_types=(uint64_type,), dest_type=uint_type, c_expression="memcpy(&data.u[0], &op[0]->value.u64[0], sizeof(uint64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
--   operation("unpack_image_2x32", 1, printable_name="unpackImage2x32", source_types=(uint64_type,), dest_type=uint_type, c_expression="memcpy(&data.u[0], &op[0]->value.u64[0], sizeof(uint64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
-+   operation("pack_sampler_2x32", 1, printable_name="packSampler2x32", source_types=(uint_type,), dest_type=uint64_type, c_expression="data.u64[0] = pack_2x32(op[0]->value.u[0], op[0]->value.u[1])", flags=frozenset((horizontal_operation, non_assign_operation))),
-+   operation("pack_image_2x32", 1, printable_name="packImage2x32", source_types=(uint_type,), dest_type=uint64_type, c_expression="data.u64[0] = pack_2x32(op[0]->value.u[0], op[0]->value.u[1])", flags=frozenset((horizontal_operation, non_assign_operation))),
-+   operation("unpack_sampler_2x32", 1, printable_name="unpackSampler2x32", source_types=(uint64_type,), dest_type=uint_type, c_expression="unpack_2x32(op[0]->value.u64[0], &data.u[0], &data.u[1])", flags=frozenset((horizontal_operation, non_assign_operation))),
-+   operation("unpack_image_2x32", 1, printable_name="unpackImage2x32", source_types=(uint64_type,), dest_type=uint_type, c_expression="unpack_2x32(op[0]->value.u64[0], &data.u[0], &data.u[1])", flags=frozenset((horizontal_operation, non_assign_operation))),
- 
-    operation("frexp_sig", 1),
-    operation("frexp_exp", 1),
-@@ -592,10 +592,10 @@ ir_expression_operation = [
-    operation("ssbo_unsized_array_length", 1),
- 
-    # 64-bit integer packing ops.
--   operation("pack_int_2x32", 1, printable_name="packInt2x32", source_types=(int_type,), dest_type=int64_type, c_expression="memcpy(&data.i64[0], &op[0]->value.i[0], sizeof(int64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
--   operation("pack_uint_2x32", 1, printable_name="packUint2x32", source_types=(uint_type,), dest_type=uint64_type, c_expression="memcpy(&data.u64[0], &op[0]->value.u[0], sizeof(uint64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
--   operation("unpack_int_2x32", 1, printable_name="unpackInt2x32", source_types=(int64_type,), dest_type=int_type, c_expression="memcpy(&data.i[0], &op[0]->value.i64[0], sizeof(int64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
--   operation("unpack_uint_2x32", 1, printable_name="unpackUint2x32", source_types=(uint64_type,), dest_type=uint_type, c_expression="memcpy(&data.u[0], &op[0]->value.u64[0], sizeof(uint64_t))", flags=frozenset((horizontal_operation, non_assign_operation))),
-+   operation("pack_int_2x32", 1, printable_name="packInt2x32", source_types=(int_type,), dest_type=int64_type, c_expression="data.u64[0] = pack_2x32(op[0]->value.u[0], op[0]->value.u[1])", flags=frozenset((horizontal_operation, non_assign_operation))),
-+   operation("pack_uint_2x32", 1, printable_name="packUint2x32", source_types=(uint_type,), dest_type=uint64_type, c_expression="data.u64[0] = pack_2x32(op[0]->value.u[0], op[0]->value.u[1])", flags=frozenset((horizontal_operation, non_assign_operation))),
-+   operation("unpack_int_2x32", 1, printable_name="unpackInt2x32", source_types=(int64_type,), dest_type=int_type, c_expression="unpack_2x32(op[0]->value.u64[0], &data.u[0], &data.u[1])", flags=frozenset((horizontal_operation, non_assign_operation))),
-+   operation("unpack_uint_2x32", 1, printable_name="unpackUint2x32", source_types=(uint64_type,), dest_type=uint_type, c_expression="unpack_2x32(op[0]->value.u64[0], &data.u[0], &data.u[1])", flags=frozenset((horizontal_operation, non_assign_operation))),
- 
-    operation("add", 2, printable_name="+", source_types=numeric_types, c_expression="{src0} + {src1}", flags=vector_scalar_operation),
-    operation("sub", 2, printable_name="-", source_types=numeric_types, c_expression="{src0} - {src1}", flags=vector_scalar_operation),
--- 
-2.26.2
-
diff --git a/SOURCES/Makefile b/SOURCES/Makefile
index c431c49..8396596 100644
--- a/SOURCES/Makefile
+++ b/SOURCES/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= 20.1.2
+VERSION ?= 20.1.4
 SANITIZE ?= 1
 
 DIRNAME = mesa-${VERSION}
diff --git a/SOURCES/nouveau-tu1xx-support.patch b/SOURCES/nouveau-tu1xx-support.patch
index 1134f43..3254466 100644
--- a/SOURCES/nouveau-tu1xx-support.patch
+++ b/SOURCES/nouveau-tu1xx-support.patch
@@ -357,10 +357,10 @@ index e244bd0d610..dd8e1ab86c4 100644
  {
 diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.cpp
 new file mode 100644
-index 00000000000..0fbd47ccf88
+index 00000000000..ef33743e610
 --- /dev/null
 +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.cpp
-@@ -0,0 +1,2011 @@
+@@ -0,0 +1,2052 @@
 +/*
 + * Copyright 2020 Red Hat Inc.
 + *
@@ -1221,6 +1221,7 @@ index 00000000000..0fbd47ccf88
 +         break;
 +      }
 +      emitField(73, 3, dType);
++      emitGPR  (64, insn->src(2));
 +   }
 +
 +   emitPRED (81);
@@ -1272,6 +1273,40 @@ index 00000000000..0fbd47ccf88
 +   emitGPR  (16, insn->def(0));
 +}
 +
++static void
++interpApply(const FixupEntry *entry, uint32_t *code, const FixupData& data)
++{
++   int ipa = entry->ipa;
++   int loc = entry->loc;
++
++   if (data.force_persample_interp &&
++       (ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
++       (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
++      ipa |= NV50_IR_INTERP_CENTROID;
++   }
++
++   int sample;
++   switch (ipa & NV50_IR_INTERP_SAMPLE_MASK) {
++   case NV50_IR_INTERP_DEFAULT : sample = 0; break;
++   case NV50_IR_INTERP_CENTROID: sample = 1; break;
++   case NV50_IR_INTERP_OFFSET  : sample = 2; break;
++   default: assert(!"invalid sample mode");
++   }
++
++   int interp;
++   switch (ipa & NV50_IR_INTERP_MODE_MASK) {
++   case NV50_IR_INTERP_LINEAR     :
++   case NV50_IR_INTERP_PERSPECTIVE: interp = 0; break;
++   case NV50_IR_INTERP_FLAT       : interp = 1; break;
++   case NV50_IR_INTERP_SC         : interp = 2; break;
++   default: assert(!"invalid ipa mode");
++   }
++
++   code[loc + 2] &= ~(0xf << 12);
++   code[loc + 2] |= sample << 12;
++   code[loc + 2] |= interp << 14;
++}
++
 +void
 +CodeEmitterGV100::emitIPA()
 +{
@@ -1288,17 +1323,21 @@ index 00000000000..0fbd47ccf88
 +      break;
 +   }
 +
++   switch (insn->getSampleMode()) {
++   case NV50_IR_INTERP_DEFAULT : emitField(76, 2, 0); break;
++   case NV50_IR_INTERP_CENTROID: emitField(76, 2, 1); break;
++   case NV50_IR_INTERP_OFFSET  : emitField(76, 2, 2); break;
++   default:
++      assert(!"invalid sample mode");
++      break;
++   }
++
 +   if (insn->getSampleMode() != NV50_IR_INTERP_OFFSET) {
-+      switch (insn->getSampleMode()) {
-+      case NV50_IR_INTERP_DEFAULT : emitField(76, 2, 0); break;
-+      case NV50_IR_INTERP_CENTROID: emitField(76, 2, 1); break;
-+      default:
-+         break;
-+      }
 +      emitGPR  (32);
++      addInterp(insn->ipa, 0xff, interpApply);
 +   } else {
-+      emitField(76, 2, 2);
 +      emitGPR  (32, insn->src(1));
++      addInterp(insn->ipa, insn->getSrc(1)->reg.data.id, interpApply);
 +   }
 +
 +   assert(!insn->src(0).isIndirect(0));
@@ -1315,21 +1354,22 @@ index 00000000000..0fbd47ccf88
 +}
 +
 +void
-+CodeEmitterGV100::emitLDSTc(int pos)
++CodeEmitterGV100::emitLDSTc(int posm, int poso)
 +{
 +   int mode = 0;
++   int order = 1;
 +
 +   switch (insn->cache) {
-+   case CACHE_CA: mode = 0; break;
-+   case CACHE_CG: mode = 1; break;
-+   case CACHE_CS: mode = 2; break;
-+   case CACHE_CV: mode = 3; break;
++   case CACHE_CA: mode = 0; order = 1; break;
++   case CACHE_CG: mode = 2; order = 2; break;
++   case CACHE_CV: mode = 3; order = 2; break;
 +   default:
 +      assert(!"invalid caching mode");
 +      break;
 +   }
 +
-+   emitField(pos, 2, mode);
++   emitField(poso, 2, order);
++   emitField(posm, 2, mode);
 +}
 +
 +void
@@ -1552,6 +1592,14 @@ index 00000000000..0fbd47ccf88
 +{
 +   const TexInstruction *insn = this->insn->asTex();
 +
++   int offsets = 0;
++   switch (insn->tex.useOffsets) {
++   case 4: offsets = 2; break;
++   case 1: offsets = 1; break;
++   case 0: offsets = 0; break;
++   default: assert(!"invalid offsets count"); break;
++   }
++
 +   if (insn->tex.rIndirectSrc < 0) {
 +      emitInsn (0xb63);
 +      emitField(54, 5, prog->driver->io.auxCBSlot);
@@ -1565,8 +1613,7 @@ index 00000000000..0fbd47ccf88
 +   emitField(84, 1, 1); // !.EF
 +   emitPRED (81);
 +   emitField(78, 1, insn->tex.target.isShadow());
-+   emitField(77, 2, insn->tex.useOffsets == 4);
-+   emitField(76, 2, insn->tex.useOffsets == 1);
++   emitField(76, 2, offsets);
 +   emitField(72, 4, insn->tex.mask);
 +   emitGPR  (64, insn->def(1));
 +   emitField(63, 1, insn->tex.target.isArray());
@@ -1776,7 +1823,6 @@ index 00000000000..0fbd47ccf88
 +         assert(0);
 +         break;
 +      }
-+   //   emitLDSTc(0x18);
 +      emitField(73, 3, type);
 +   } else {
 +      emitInsn(0x998);
@@ -1785,7 +1831,7 @@ index 00000000000..0fbd47ccf88
 +   }
 +
 +   emitPRED (81);
-+   emitField(79, 2, 1);
++   emitLDSTc(77, 79);
 +
 +   emitGPR  (16, insn->def(0));
 +   emitGPR  (24, insn->src(0));
@@ -1805,12 +1851,7 @@ index 00000000000..0fbd47ccf88
 +#endif
 +   emitSUTarget();
 +
-+
-+#if 0
-+   emitLDSTc(0x18);
-+#endif
-+
-+   emitField(79, 2, 1);
++   emitLDSTc(77, 79);
 +   emitField(72, 4, 0xf); // rgba
 +   emitGPR(32, insn->src(1));
 +   emitGPR(24, insn->src(0));
@@ -2374,7 +2415,7 @@ index 00000000000..0fbd47ccf88
 +};
 diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.h
 new file mode 100644
-index 00000000000..e97bf6580a1
+index 00000000000..15ab717e460
 --- /dev/null
 +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.h
 @@ -0,0 +1,403 @@
@@ -2744,7 +2785,7 @@ index 00000000000..e97bf6580a1
 +   void emitATOMS();
 +   void emitIPA();
 +   void emitISBERD();
-+   void emitLDSTc(int);
++   void emitLDSTc(int, int);
 +   void emitLDSTs(int, DataType);
 +   void emitLD();
 +   void emitLDC();
@@ -2782,10 +2823,26 @@ index 00000000000..e97bf6580a1
 +};
 +#endif
 diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
-index bd78b76f384..69f9cfad0d6 100644
+index bd78b76f384..eee9aa67256 100644
 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
 +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
-@@ -571,6 +571,10 @@ Converter::getSubOp(nir_op op)
+@@ -170,6 +170,7 @@ private:
+    NirArrayLMemOffsets regToLmemOffset;
+    NirBlockMap blocks;
+    unsigned int curLoopDepth;
++   unsigned int curIfDepth;
+ 
+    BasicBlock *exit;
+    Value *zero;
+@@ -188,6 +189,7 @@ Converter::Converter(Program *prog, nir_shader *nir, nv50_ir_prog_info *info)
+    : ConverterCommon(prog, info),
+      nir(nir),
+      curLoopDepth(0),
++     curIfDepth(0),
+      clipVertexOutput(-1)
+ {
+    zero = mkImm((uint32_t)0);
+@@ -571,6 +573,10 @@ Converter::getSubOp(nir_op op)
     case nir_op_imul_high:
     case nir_op_umul_high:
        return NV50_IR_SUBOP_MUL_HIGH;
@@ -2796,7 +2853,120 @@ index bd78b76f384..69f9cfad0d6 100644
     default:
        return 0;
     }
-@@ -1067,7 +1071,11 @@ bool Converter::assignSlots() {
+@@ -909,7 +915,7 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
+    uint16_t slots;
+    switch (stage) {
+    case Program::TYPE_GEOMETRY:
+-      slots = type->uniform_locations();
++      slots = type->count_attribute_slots(false);
+       if (input)
+          slots /= info.gs.vertices_in;
+       break;
+@@ -917,9 +923,9 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
+    case Program::TYPE_TESSELLATION_EVAL:
+       // remove first dimension
+       if (var->data.patch || (!input && stage == Program::TYPE_TESSELLATION_EVAL))
+-         slots = type->uniform_locations();
++         slots = type->count_attribute_slots(false);
+       else
+-         slots = type->fields.array->uniform_locations();
++         slots = type->fields.array->count_attribute_slots(false);
+       break;
+    default:
+       slots = type->count_attribute_slots(false);
+@@ -929,6 +935,24 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
+    return slots;
+ }
+ 
++static uint8_t
++getMaskForType(const glsl_type *type, uint8_t slot) {
++   uint16_t comp = type->without_array()->components();
++   comp = comp ? comp : 4;
++
++   if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
++      comp *= 2;
++      if (comp > 4) {
++         if (slot % 2)
++            comp -= 4;
++         else
++            comp = 4;
++      }
++   }
++
++   return (1 << comp) - 1;
++}
++
+ bool Converter::assignSlots() {
+    unsigned name;
+    unsigned index;
+@@ -981,16 +1005,8 @@ bool Converter::assignSlots() {
+       const glsl_type *type = var->type;
+       int slot = var->data.location;
+       uint16_t slots = calcSlots(type, prog->getType(), nir->info, true, var);
+-      uint32_t comp = type->is_array() ? type->without_array()->component_slots()
+-                                       : type->component_slots();
+-      uint32_t frac = var->data.location_frac;
+       uint32_t vary = var->data.driver_location;
+ 
+-      if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
+-         if (comp > 2)
+-            slots *= 2;
+-      }
+-
+       assert(vary + slots <= PIPE_MAX_SHADER_INPUTS);
+ 
+       switch(prog->getType()) {
+@@ -1014,6 +1030,8 @@ bool Converter::assignSlots() {
+             info->numPatchConstants = MAX2(info->numPatchConstants, index + slots);
+          break;
+       case Program::TYPE_VERTEX:
++         if (slot >= VERT_ATTRIB_GENERIC0)
++            slot = VERT_ATTRIB_GENERIC0 + vary;
+          vert_attrib_to_tgsi_semantic((gl_vert_attrib)slot, &name, &index);
+          switch (name) {
+          case TGSI_SEMANTIC_EDGEFLAG:
+@@ -1029,17 +1047,12 @@ bool Converter::assignSlots() {
+       }
+ 
+       for (uint16_t i = 0u; i < slots; ++i, ++vary) {
+-         info->in[vary].id = vary;
+-         info->in[vary].patch = var->data.patch;
+-         info->in[vary].sn = name;
+-         info->in[vary].si = index + i;
+-         if (glsl_base_type_is_64bit(type->without_array()->base_type))
+-            if (i & 0x1)
+-               info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4);
+-            else
+-               info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf);
+-         else
+-            info->in[vary].mask |= ((1 << comp) - 1) << frac;
++         nv50_ir_varying *v = &info->in[vary];
++
++         v->patch = var->data.patch;
++         v->sn = name;
++         v->si = index + i;
++         v->mask |= getMaskForType(type, i) << var->data.location_frac;
+       }
+       info->numInputs = std::max<uint8_t>(info->numInputs, vary);
+    }
+@@ -1048,16 +1061,8 @@ bool Converter::assignSlots() {
+       const glsl_type *type = var->type;
+       int slot = var->data.location;
+       uint16_t slots = calcSlots(type, prog->getType(), nir->info, false, var);
+-      uint32_t comp = type->is_array() ? type->without_array()->component_slots()
+-                                       : type->component_slots();
+-      uint32_t frac = var->data.location_frac;
+       uint32_t vary = var->data.driver_location;
+ 
+-      if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
+-         if (comp > 2)
+-            slots *= 2;
+-      }
+-
+       assert(vary < PIPE_MAX_SHADER_OUTPUTS);
+ 
+       switch(prog->getType()) {
+@@ -1067,7 +1072,11 @@ bool Converter::assignSlots() {
           case TGSI_SEMANTIC_COLOR:
              if (!var->data.fb_fetch_output)
                 info->prop.fp.numColourResults++;
@@ -2809,7 +2979,185 @@ index bd78b76f384..69f9cfad0d6 100644
              // sometimes we get FRAG_RESULT_DATAX with data.index 0
              // sometimes we get FRAG_RESULT_DATA0 with data.index X
              index = index == 0 ? var->data.index : index;
-@@ -1617,6 +1625,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -1118,20 +1127,14 @@ bool Converter::assignSlots() {
+       }
+ 
+       for (uint16_t i = 0u; i < slots; ++i, ++vary) {
+-         info->out[vary].id = vary;
+-         info->out[vary].patch = var->data.patch;
+-         info->out[vary].sn = name;
+-         info->out[vary].si = index + i;
+-         if (glsl_base_type_is_64bit(type->without_array()->base_type))
+-            if (i & 0x1)
+-               info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4);
+-            else
+-               info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf);
+-         else
+-            info->out[vary].mask |= ((1 << comp) - 1) << frac;
++         nv50_ir_varying *v = &info->out[vary];
++         v->patch = var->data.patch;
++         v->sn = name;
++         v->si = index + i;
++         v->mask |= getMaskForType(type, i) << var->data.location_frac;
+ 
+          if (nir->info.outputs_read & 1ull << slot)
+-            info->out[vary].oread = 1;
++            v->oread = 1;
+       }
+       info->numOutputs = std::max<uint8_t>(info->numOutputs, vary);
+    }
+@@ -1275,6 +1278,7 @@ Converter::parseNIR()
+    info->bin.tlsSpace = 0;
+    info->io.clipDistances = nir->info.clip_distance_array_size;
+    info->io.cullDistances = nir->info.cull_distance_array_size;
++   info->io.layer_viewport_relative = nir->info.layer_viewport_relative;
+ 
+    switch(prog->getType()) {
+    case Program::TYPE_COMPUTE:
+@@ -1291,7 +1295,7 @@ Converter::parseNIR()
+       info->prop.fp.postDepthCoverage = nir->info.fs.post_depth_coverage;
+       info->prop.fp.readsSampleLocations =
+          (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);
+-      info->prop.fp.usesDiscard = nir->info.fs.uses_discard;
++      info->prop.fp.usesDiscard = nir->info.fs.uses_discard || nir->info.fs.uses_demote;
+       info->prop.fp.usesSampleMaskIn =
+          !!(nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN);
+       break;
+@@ -1426,64 +1430,69 @@ Converter::visit(nir_block *block)
+ bool
+ Converter::visit(nir_if *nif)
+ {
++   curIfDepth++;
++
+    DataType sType = getSType(nif->condition, false, false);
+    Value *src = getSrc(&nif->condition, 0);
+ 
+    nir_block *lastThen = nir_if_last_then_block(nif);
+    nir_block *lastElse = nir_if_last_else_block(nif);
+ 
+-   assert(!lastThen->successors[1]);
+-   assert(!lastElse->successors[1]);
+-
++   BasicBlock *headBB = bb;
+    BasicBlock *ifBB = convert(nir_if_first_then_block(nif));
+    BasicBlock *elseBB = convert(nir_if_first_else_block(nif));
+ 
+    bb->cfg.attach(&ifBB->cfg, Graph::Edge::TREE);
+    bb->cfg.attach(&elseBB->cfg, Graph::Edge::TREE);
+ 
+-   // we only insert joinats, if both nodes end up at the end of the if again.
+-   // the reason for this to not happens are breaks/continues/ret/... which
+-   // have their own handling
+-   if (lastThen->successors[0] == lastElse->successors[0])
+-      bb->joinAt = mkFlow(OP_JOINAT, convert(lastThen->successors[0]),
+-                          CC_ALWAYS, NULL);
+-
++   bool insertJoins = lastThen->successors[0] == lastElse->successors[0];
+    mkFlow(OP_BRA, elseBB, CC_EQ, src)->setType(sType);
+ 
+    foreach_list_typed(nir_cf_node, node, node, &nif->then_list) {
+       if (!visit(node))
+          return false;
+    }
++
+    setPosition(convert(lastThen), true);
+-   if (!bb->getExit() ||
+-       !bb->getExit()->asFlow() ||
+-        bb->getExit()->asFlow()->op == OP_JOIN) {
++   if (!bb->isTerminated()) {
+       BasicBlock *tailBB = convert(lastThen->successors[0]);
+       mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL);
+       bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD);
++   } else {
++      insertJoins = insertJoins && bb->getExit()->op == OP_BRA;
+    }
+ 
+    foreach_list_typed(nir_cf_node, node, node, &nif->else_list) {
+       if (!visit(node))
+          return false;
+    }
++
+    setPosition(convert(lastElse), true);
+-   if (!bb->getExit() ||
+-       !bb->getExit()->asFlow() ||
+-        bb->getExit()->asFlow()->op == OP_JOIN) {
++   if (!bb->isTerminated()) {
+       BasicBlock *tailBB = convert(lastElse->successors[0]);
+       mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL);
+       bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD);
++   } else {
++      insertJoins = insertJoins && bb->getExit()->op == OP_BRA;
+    }
+ 
+-   if (lastThen->successors[0] == lastElse->successors[0]) {
+-      setPosition(convert(lastThen->successors[0]), true);
++   /* only insert joins for the most outer if */
++   if (--curIfDepth)
++      insertJoins = false;
++
++   /* we made sure that all threads would converge at the same block */
++   if (insertJoins) {
++      BasicBlock *conv = convert(lastThen->successors[0]);
++      setPosition(headBB->getExit(), false);
++      headBB->joinAt = mkFlow(OP_JOINAT, conv, CC_ALWAYS, NULL);
++      setPosition(conv, false);
+       mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
+    }
+ 
+    return true;
+ }
+ 
++// TODO: add convergency
+ bool
+ Converter::visit(nir_loop *loop)
+ {
+@@ -1491,8 +1500,8 @@ Converter::visit(nir_loop *loop)
+    func->loopNestingBound = std::max(func->loopNestingBound, curLoopDepth);
+ 
+    BasicBlock *loopBB = convert(nir_loop_first_block(loop));
+-   BasicBlock *tailBB =
+-      convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
++   BasicBlock *tailBB = convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
++
+    bb->cfg.attach(&loopBB->cfg, Graph::Edge::TREE);
+ 
+    mkFlow(OP_PREBREAK, tailBB, CC_ALWAYS, NULL);
+@@ -1503,19 +1512,15 @@ Converter::visit(nir_loop *loop)
+       if (!visit(node))
+          return false;
+    }
+-   Instruction *insn = bb->getExit();
+-   if (bb->cfg.incidentCount() != 0) {
+-      if (!insn || !insn->asFlow()) {
+-         mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
+-         bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
+-      } else if (insn && insn->op == OP_BRA && !insn->getPredicate() &&
+-                 tailBB->cfg.incidentCount() == 0) {
+-         // RA doesn't like having blocks around with no incident edge,
+-         // so we create a fake one to make it happy
+-         bb->cfg.attach(&tailBB->cfg, Graph::Edge::TREE);
+-      }
++
++   if (!bb->isTerminated()) {
++      mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
++      bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
+    }
+ 
++   if (tailBB->cfg.incidentCount() == 0)
++      loopBB->cfg.attach(&tailBB->cfg, Graph::Edge::TREE);
++
+    curLoopDepth -= 1;
+ 
+    return true;
+@@ -1560,6 +1565,7 @@ Converter::convert(nir_intrinsic_op intr)
+       return SV_DRAWID;
+    case nir_intrinsic_load_front_face:
+       return SV_FACE;
++   case nir_intrinsic_is_helper_invocation:
+    case nir_intrinsic_load_helper_invocation:
+       return SV_THREAD_KILL;
+    case nir_intrinsic_load_instance_id:
+@@ -1617,6 +1623,7 @@ Converter::visit(nir_intrinsic_instr *insn)
  {
     nir_intrinsic_op op = insn->intrinsic;
     const nir_intrinsic_info &opInfo = nir_intrinsic_infos[op];
@@ -2817,7 +3165,7 @@ index bd78b76f384..69f9cfad0d6 100644
  
     switch (op) {
     case nir_intrinsic_load_uniform: {
-@@ -1624,7 +1633,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -1624,7 +1631,7 @@ Converter::visit(nir_intrinsic_instr *insn)
        const DataType dType = getDType(insn);
        Value *indirect;
        uint32_t coffset = getIndirect(insn, 0, 0, indirect);
@@ -2826,7 +3174,7 @@ index bd78b76f384..69f9cfad0d6 100644
           loadFrom(FILE_MEMORY_CONST, 0, dType, newDefs[i], 16 * coffset, i, indirect);
        }
        break;
-@@ -1635,7 +1644,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -1635,7 +1642,7 @@ Converter::visit(nir_intrinsic_instr *insn)
        DataType dType = getSType(insn->src[0], false, false);
        uint32_t idx = getIndirect(insn, op == nir_intrinsic_store_output ? 1 : 2, 0, indirect);
  
@@ -2835,7 +3183,15 @@ index bd78b76f384..69f9cfad0d6 100644
           if (!((1u << i) & nir_intrinsic_write_mask(insn)))
              continue;
  
-@@ -1688,7 +1697,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -1652,6 +1659,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+             break;
+          }
+          case Program::TYPE_GEOMETRY:
++         case Program::TYPE_TESSELLATION_EVAL:
+          case Program::TYPE_VERTEX: {
+             if (info->io.genUserClip > 0 && idx == (uint32_t)clipVertexOutput) {
+                mkMov(clipVtx[i], src);
+@@ -1688,7 +1696,7 @@ Converter::visit(nir_intrinsic_instr *insn)
           srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LAYER, 0)));
           srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_SAMPLE_INDEX, 0)));
  
@@ -2844,7 +3200,27 @@ index bd78b76f384..69f9cfad0d6 100644
              defs.push_back(newDefs[i]);
              mask |= 1 << i;
           }
-@@ -1723,7 +1732,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -1715,15 +1723,25 @@ Converter::visit(nir_intrinsic_instr *insn)
+ 
+       // see load_barycentric_* handling
+       if (prog->getType() == Program::TYPE_FRAGMENT) {
+-         mode = translateInterpMode(&vary, nvirOp);
+          if (op == nir_intrinsic_load_interpolated_input) {
+             ImmediateValue immMode;
+             if (getSrc(&insn->src[0], 1)->getUniqueInsn()->src(0).getImmediate(immMode))
+-               mode |= immMode.reg.data.u32;
++               mode = immMode.reg.data.u32;
++         }
++         if (mode == NV50_IR_INTERP_DEFAULT)
++            mode |= translateInterpMode(&vary, nvirOp);
++         else {
++            if (vary.linear) {
++               nvirOp = OP_LINTERP;
++               mode |= NV50_IR_INTERP_LINEAR;
++            } else {
++               nvirOp = OP_PINTERP;
++               mode |= NV50_IR_INTERP_PERSPECTIVE;
++            }
           }
        }
  
@@ -2853,7 +3229,27 @@ index bd78b76f384..69f9cfad0d6 100644
           uint32_t address = getSlotAddress(insn, idx, i);
           Symbol *sym = mkSymbol(input ? FILE_SHADER_INPUT : FILE_SHADER_OUTPUT, 0, dType, address);
           if (prog->getType() == Program::TYPE_FRAGMENT) {
-@@ -1858,7 +1867,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -1814,9 +1832,11 @@ Converter::visit(nir_intrinsic_instr *insn)
+       loadImm(newDefs[1], mode);
+       break;
+    }
++   case nir_intrinsic_demote:
+    case nir_intrinsic_discard:
+       mkOp(OP_DISCARD, TYPE_NONE, NULL);
+       break;
++   case nir_intrinsic_demote_if:
+    case nir_intrinsic_discard_if: {
+       Value *pred = getSSA(1, FILE_PREDICATE);
+       if (insn->num_components > 1) {
+@@ -1832,6 +1852,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+    case nir_intrinsic_load_base_instance:
+    case nir_intrinsic_load_draw_id:
+    case nir_intrinsic_load_front_face:
++   case nir_intrinsic_is_helper_invocation:
+    case nir_intrinsic_load_helper_invocation:
+    case nir_intrinsic_load_instance_id:
+    case nir_intrinsic_load_invocation_id:
+@@ -1858,7 +1879,7 @@ Converter::visit(nir_intrinsic_instr *insn)
        SVSemantic sv = convert(op);
        LValues &newDefs = convert(&insn->dest);
  
@@ -2862,7 +3258,7 @@ index bd78b76f384..69f9cfad0d6 100644
           Value *def;
           if (typeSizeof(dType) == 8)
              def = getSSA();
-@@ -1910,12 +1919,12 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -1910,12 +1931,12 @@ Converter::visit(nir_intrinsic_instr *insn)
  
        if (op == nir_intrinsic_read_first_invocation) {
           mkOp1(OP_VOTE, TYPE_U32, tmp, mkImm(1))->subOp = NV50_IR_SUBOP_VOTE_ANY;
@@ -2877,7 +3273,7 @@ index bd78b76f384..69f9cfad0d6 100644
           mkOp3(OP_SHFL, dType, newDefs[i], getSrc(&insn->src[0], i), tmp, mkImm(0x1f))
              ->subOp = NV50_IR_SUBOP_SHFL_IDX;
        }
-@@ -1931,7 +1940,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -1931,7 +1952,7 @@ Converter::visit(nir_intrinsic_instr *insn)
  
        Value *vtxBase = mkOp2v(OP_PFETCH, TYPE_U32, getSSA(4, FILE_ADDRESS),
                                mkImm(baseVertex), indirectVertex);
@@ -2886,7 +3282,7 @@ index bd78b76f384..69f9cfad0d6 100644
           uint32_t address = getSlotAddress(insn, idx, i);
           loadFrom(FILE_SHADER_INPUT, 0, dType, newDefs[i], address, 0,
                    indirectOffset, vtxBase, info->in[idx].patch);
-@@ -1954,7 +1963,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -1954,19 +1975,24 @@ Converter::visit(nir_intrinsic_instr *insn)
  
        vtxBase = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, FILE_ADDRESS), outBase, vtxBase);
  
@@ -2895,7 +3291,26 @@ index bd78b76f384..69f9cfad0d6 100644
           uint32_t address = getSlotAddress(insn, idx, i);
           loadFrom(FILE_SHADER_OUTPUT, 0, dType, newDefs[i], address, 0,
                    indirectOffset, vtxBase, info->in[idx].patch);
-@@ -1978,7 +1987,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+       }
+       break;
+    }
+-   case nir_intrinsic_emit_vertex:
++   case nir_intrinsic_emit_vertex: {
+       if (info->io.genUserClip > 0)
+          handleUserClipPlanes();
+-      // fallthrough
++      uint32_t idx = nir_intrinsic_stream_id(insn);
++      mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1;
++      break;
++   }
+    case nir_intrinsic_end_primitive: {
+       uint32_t idx = nir_intrinsic_stream_id(insn);
++      if (idx)
++         break;
+       mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1;
+       break;
+    }
+@@ -1978,7 +2004,7 @@ Converter::visit(nir_intrinsic_instr *insn)
        uint32_t index = getIndirect(&insn->src[0], 0, indirectIndex) + 1;
        uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
  
@@ -2904,7 +3319,7 @@ index bd78b76f384..69f9cfad0d6 100644
           loadFrom(FILE_MEMORY_CONST, index, dType, newDefs[i], offset, i,
                    indirectOffset, indirectIndex);
        }
-@@ -2001,7 +2010,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -2001,7 +2027,7 @@ Converter::visit(nir_intrinsic_instr *insn)
        uint32_t buffer = getIndirect(&insn->src[1], 0, indirectBuffer);
        uint32_t offset = getIndirect(&insn->src[2], 0, indirectOffset);
  
@@ -2913,7 +3328,7 @@ index bd78b76f384..69f9cfad0d6 100644
           if (!((1u << i) & nir_intrinsic_write_mask(insn)))
              continue;
           Symbol *sym = mkSymbol(FILE_MEMORY_BUFFER, buffer, sType,
-@@ -2020,7 +2029,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -2020,7 +2046,7 @@ Converter::visit(nir_intrinsic_instr *insn)
        uint32_t buffer = getIndirect(&insn->src[0], 0, indirectBuffer);
        uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
  
@@ -2922,7 +3337,7 @@ index bd78b76f384..69f9cfad0d6 100644
           loadFrom(FILE_MEMORY_BUFFER, buffer, dType, newDefs[i], offset, i,
                    indirectOffset, indirectBuffer);
  
-@@ -2314,7 +2323,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -2314,7 +2340,7 @@ Converter::visit(nir_intrinsic_instr *insn)
        Value *indirectOffset;
        uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
  
@@ -2931,7 +3346,7 @@ index bd78b76f384..69f9cfad0d6 100644
           if (!((1u << i) & nir_intrinsic_write_mask(insn)))
              continue;
           Symbol *sym = mkSymbol(FILE_MEMORY_SHARED, 0, sType, offset + i * typeSizeof(sType));
-@@ -2328,7 +2337,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -2328,7 +2354,7 @@ Converter::visit(nir_intrinsic_instr *insn)
        Value *indirectOffset;
        uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset);
  
@@ -2940,7 +3355,7 @@ index bd78b76f384..69f9cfad0d6 100644
           loadFrom(FILE_MEMORY_SHARED, 0, dType, newDefs[i], offset, i, indirectOffset);
  
        break;
-@@ -2367,7 +2376,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -2367,7 +2393,7 @@ Converter::visit(nir_intrinsic_instr *insn)
        Value *indirectOffset;
        uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset);
  
@@ -2949,7 +3364,7 @@ index bd78b76f384..69f9cfad0d6 100644
           loadFrom(FILE_MEMORY_GLOBAL, 0, dType, newDefs[i], offset, i, indirectOffset);
  
        info->io.globalAccess |= 0x1;
-@@ -2376,7 +2385,7 @@ Converter::visit(nir_intrinsic_instr *insn)
+@@ -2376,7 +2402,7 @@ Converter::visit(nir_intrinsic_instr *insn)
     case nir_intrinsic_store_global: {
        DataType sType = getSType(insn->src[0], false, false);
  
@@ -2958,7 +3373,15 @@ index bd78b76f384..69f9cfad0d6 100644
           if (!((1u << i) & nir_intrinsic_write_mask(insn)))
              continue;
           if (typeSizeof(sType) == 8) {
-@@ -2774,7 +2783,7 @@ Converter::visit(nir_alu_instr *insn)
+@@ -2418,7 +2444,6 @@ Converter::visit(nir_jump_instr *insn)
+    case nir_jump_continue: {
+       bool isBreak = insn->type == nir_jump_break;
+       nir_block *block = insn->instr.block;
+-      assert(!block->successors[1]);
+       BasicBlock *target = convert(block->successors[0]);
+       mkFlow(isBreak ? OP_BREAK : OP_CONT, target, CC_ALWAYS, NULL);
+       bb->cfg.attach(&target->cfg, isBreak ? Graph::Edge::CROSS : Graph::Edge::BACK);
+@@ -2774,7 +2799,7 @@ Converter::visit(nir_alu_instr *insn)
     case nir_op_bfm: {
        DEFAULT_CHECKS;
        LValues &newDefs = convert(&insn->dest);
@@ -2967,7 +3390,7 @@ index bd78b76f384..69f9cfad0d6 100644
        break;
     }
     case nir_op_bitfield_insert: {
-@@ -2794,17 +2803,69 @@ Converter::visit(nir_alu_instr *insn)
+@@ -2794,17 +2819,69 @@ Converter::visit(nir_alu_instr *insn)
     case nir_op_bitfield_reverse: {
        DEFAULT_CHECKS;
        LValues &newDefs = convert(&insn->dest);
@@ -3039,7 +3462,25 @@ index bd78b76f384..69f9cfad0d6 100644
     // boolean conversions
     case nir_op_b2f32: {
        DEFAULT_CHECKS;
-@@ -3224,6 +3285,11 @@ Converter::run()
+@@ -2990,14 +3067,11 @@ Converter::handleDeref(nir_deref_instr *deref, Value * &indirect, const nir_vari
+ CacheMode
+ Converter::convert(enum gl_access_qualifier access)
+ {
+-   switch (access) {
+-   case ACCESS_VOLATILE:
++   if (access & ACCESS_VOLATILE)
+       return CACHE_CV;
+-   case ACCESS_COHERENT:
++   if (access & ACCESS_COHERENT)
+       return CACHE_CG;
+-   default:
+-      return CACHE_CA;
+-   }
++   return CACHE_CA;
+ }
+ 
+ CacheMode
+@@ -3224,6 +3298,11 @@ Converter::run()
     NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
     NIR_PASS_V(nir, nir_lower_phis_to_scalar);
  
@@ -3051,7 +3492,7 @@ index bd78b76f384..69f9cfad0d6 100644
     do {
        progress = false;
        NIR_PASS(progress, nir, nir_copy_prop);
-@@ -3285,3 +3351,125 @@ Program::makeFromNIR(struct nv50_ir_prog_info *info)
+@@ -3285,3 +3364,125 @@ Program::makeFromNIR(struct nv50_ir_prog_info *info)
  }
  
  } // namespace nv50_ir
@@ -3059,106 +3500,106 @@ index bd78b76f384..69f9cfad0d6 100644
 +static nir_shader_compiler_options
 +nvir_nir_shader_compiler_options(int chipset)
 +{
-+   return {
-+      .lower_fdiv = (chipset >= NVISA_GV100_CHIPSET),
-+      .lower_ffma = false,
-+      .fuse_ffma = false, /* nir doesn't track mad vs fma */
-+      .lower_flrp16 = (chipset >= NVISA_GV100_CHIPSET),
-+      .lower_flrp32 = true,
-+      .lower_flrp64 = true,
-+      .lower_fpow = false, // TODO: nir's lowering is broken, or we could use it
-+      .lower_fsat = false,
-+      .lower_fsqrt = false, // TODO: only before gm200
-+      .lower_sincos = false,
-+      .lower_fmod = true,
-+      .lower_bitfield_extract = false,
-+      .lower_bitfield_extract_to_shifts = (chipset >= NVISA_GV100_CHIPSET),
-+      .lower_bitfield_insert = false,
-+      .lower_bitfield_insert_to_shifts = (chipset >= NVISA_GV100_CHIPSET),
-+      .lower_bitfield_insert_to_bitfield_select = false,
-+      .lower_bitfield_reverse = false,
-+      .lower_bit_count = false,
-+      .lower_ifind_msb = false,
-+      .lower_find_lsb = false,
-+      .lower_uadd_carry = true, // TODO
-+      .lower_usub_borrow = true, // TODO
-+      .lower_mul_high = false,
-+      .lower_negate = false,
-+      .lower_sub = true,
-+      .lower_scmp = true, // TODO: not implemented yet
-+      .lower_vector_cmp = false,
-+      .lower_idiv = true,
-+      .lower_bitops = false,
-+      .lower_isign = (chipset >= NVISA_GV100_CHIPSET),
-+      .lower_fsign = (chipset >= NVISA_GV100_CHIPSET),
-+      .lower_fdph = false,
-+      .lower_fdot = false,
-+      .fdot_replicates = false, // TODO
-+      .lower_ffloor = false, // TODO
-+      .lower_ffract = true,
-+      .lower_fceil = false, // TODO
-+      .lower_ftrunc = false,
-+      .lower_ldexp = true,
-+      .lower_pack_half_2x16 = true,
-+      .lower_pack_unorm_2x16 = true,
-+      .lower_pack_snorm_2x16 = true,
-+      .lower_pack_unorm_4x8 = true,
-+      .lower_pack_snorm_4x8 = true,
-+      .lower_unpack_half_2x16 = true,
-+      .lower_unpack_unorm_2x16 = true,
-+      .lower_unpack_snorm_2x16 = true,
-+      .lower_unpack_unorm_4x8 = true,
-+      .lower_unpack_snorm_4x8 = true,
-+      .lower_pack_split = false,
-+      .lower_extract_byte = (chipset < NVISA_GM107_CHIPSET),
-+      .lower_extract_word = (chipset < NVISA_GM107_CHIPSET),
-+      .lower_all_io_to_temps = false,
-+      .lower_all_io_to_elements = false,
-+      .vertex_id_zero_based = false,
-+      .lower_base_vertex = false,
-+      .lower_helper_invocation = false,
-+      .optimize_sample_mask_in = false,
-+      .lower_cs_local_index_from_id = true,
-+      .lower_cs_local_id_from_index = false,
-+      .lower_device_index_to_zero = false, // TODO
-+      .lower_wpos_pntc = false, // TODO
-+      .lower_hadd = true, // TODO
-+      .lower_add_sat = true, // TODO
-+      .vectorize_io = false,
-+      .lower_to_scalar = true,
-+      .unify_interfaces = false,
-+      .use_interpolated_input_intrinsics = true,
-+      .lower_mul_2x32_64 = true, // TODO
-+      .lower_rotate = (chipset < NVISA_GV100_CHIPSET),
-+      .has_imul24 = false,
-+      .intel_vec4 = false,
-+      .max_unroll_iterations = 32,
-+      .lower_int64_options = (nir_lower_int64_options) (
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul64 : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_isign64 : 0) |
-+            nir_lower_divmod64 |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_high64 : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_mov64 : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_icmp64 : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_iabs64 : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ineg64 : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_logic64 : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_minmax64 : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_shift64 : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_2x32_64 : 0) |
-+            ((chipset >= NVISA_GM107_CHIPSET) ? nir_lower_extract64 : 0) |
-+            nir_lower_ufind_msb64
-+      ),
-+      .lower_doubles_options = (nir_lower_doubles_options) (
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drcp : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsqrt : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drsq : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dfract : 0) |
-+            nir_lower_dmod |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsub : 0) |
-+            ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ddiv : 0)
-+      )
-+   };
++   nir_shader_compiler_options op = {};
++   op.lower_fdiv = (chipset >= NVISA_GV100_CHIPSET);
++   op.lower_ffma = false;
++   op.fuse_ffma = false; /* nir doesn't track mad vs fma */
++   op.lower_flrp16 = (chipset >= NVISA_GV100_CHIPSET);
++   op.lower_flrp32 = true;
++   op.lower_flrp64 = true;
++   op.lower_fpow = false; // TODO: nir's lowering is broken, or we could use it
++   op.lower_fsat = false;
++   op.lower_fsqrt = false; // TODO: only before gm200
++   op.lower_sincos = false;
++   op.lower_fmod = true;
++   op.lower_bitfield_extract = false;
++   op.lower_bitfield_extract_to_shifts = (chipset >= NVISA_GV100_CHIPSET);
++   op.lower_bitfield_insert = false;
++   op.lower_bitfield_insert_to_shifts = (chipset >= NVISA_GV100_CHIPSET);
++   op.lower_bitfield_insert_to_bitfield_select = false;
++   op.lower_bitfield_reverse = false;
++   op.lower_bit_count = false;
++   op.lower_ifind_msb = false;
++   op.lower_find_lsb = false;
++   op.lower_uadd_carry = true; // TODO
++   op.lower_usub_borrow = true; // TODO
++   op.lower_mul_high = false;
++   op.lower_negate = false;
++   op.lower_sub = true;
++   op.lower_scmp = true; // TODO: not implemented yet
++   op.lower_vector_cmp = false;
++   op.lower_idiv = true;
++   op.lower_bitops = false;
++   op.lower_isign = (chipset >= NVISA_GV100_CHIPSET);
++   op.lower_fsign = (chipset >= NVISA_GV100_CHIPSET);
++   op.lower_fdph = false;
++   op.lower_fdot = false;
++   op.fdot_replicates = false; // TODO
++   op.lower_ffloor = false; // TODO
++   op.lower_ffract = true;
++   op.lower_fceil = false; // TODO
++   op.lower_ftrunc = false;
++   op.lower_ldexp = true;
++   op.lower_pack_half_2x16 = true;
++   op.lower_pack_unorm_2x16 = true;
++   op.lower_pack_snorm_2x16 = true;
++   op.lower_pack_unorm_4x8 = true;
++   op.lower_pack_snorm_4x8 = true;
++   op.lower_unpack_half_2x16 = true;
++   op.lower_unpack_unorm_2x16 = true;
++   op.lower_unpack_snorm_2x16 = true;
++   op.lower_unpack_unorm_4x8 = true;
++   op.lower_unpack_snorm_4x8 = true;
++   op.lower_pack_split = false;
++   op.lower_extract_byte = (chipset < NVISA_GM107_CHIPSET);
++   op.lower_extract_word = (chipset < NVISA_GM107_CHIPSET);
++   op.lower_all_io_to_temps = false;
++   op.lower_all_io_to_elements = false;
++   op.vertex_id_zero_based = false;
++   op.lower_base_vertex = false;
++   op.lower_helper_invocation = false;
++   op.optimize_sample_mask_in = false;
++   op.lower_cs_local_index_from_id = true;
++   op.lower_cs_local_id_from_index = false;
++   op.lower_device_index_to_zero = false; // TODO
++   op.lower_wpos_pntc = false; // TODO
++   op.lower_hadd = true; // TODO
++   op.lower_add_sat = true; // TODO
++   op.vectorize_io = false;
++   op.lower_to_scalar = false;
++   op.unify_interfaces = false;
++   op.use_interpolated_input_intrinsics = true;
++   op.lower_mul_2x32_64 = true; // TODO
++   op.lower_rotate = (chipset < NVISA_GV100_CHIPSET);
++   op.has_imul24 = false;
++   op.intel_vec4 = false;
++   op.max_unroll_iterations = 32;
++   op.lower_int64_options = (nir_lower_int64_options) (
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul64 : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_isign64 : 0) |
++      nir_lower_divmod64 |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_high64 : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_mov64 : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_icmp64 : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_iabs64 : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ineg64 : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_logic64 : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_minmax64 : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_shift64 : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_2x32_64 : 0) |
++      ((chipset >= NVISA_GM107_CHIPSET) ? nir_lower_extract64 : 0) |
++      nir_lower_ufind_msb64
++   );
++   op.lower_doubles_options = (nir_lower_doubles_options) (
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drcp : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsqrt : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drsq : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dfract : 0) |
++      nir_lower_dmod |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsub : 0) |
++      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ddiv : 0)
++   );
++   return op;
 +}
 +
 +static const nir_shader_compiler_options gf100_nir_shader_compiler_options =
@@ -3241,10 +3682,10 @@ index 71e5ea6417a..dfa1d035dac 100644
  };
 diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.cpp
 new file mode 100644
-index 00000000000..4b6df0db588
+index 00000000000..644d4928327
 --- /dev/null
 +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.cpp
-@@ -0,0 +1,477 @@
+@@ -0,0 +1,481 @@
 +/*
 + * Copyright 2020 Red Hat Inc.
 + *
@@ -3282,7 +3723,7 @@ index 00000000000..4b6df0db588
 +   Value *pred = bld.getSSA(1, FILE_PREDICATE);
 +
 +   bld.mkCmp(OP_SET, reverseCondCode(i->asCmp()->setCond), TYPE_U8, pred,
-+             i->sType, bld.mkImm(0), i->getSrc(2));
++             i->sType, bld.mkImm(0), i->getSrc(2))->ftz = i->ftz;
 +   bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), i->getSrc(0), i->getSrc(1), pred);
 +   return true;
 +}
@@ -3436,6 +3877,7 @@ index 00000000000..4b6df0db588
 +   xsetp->src(0).mod = i->src(0).mod;
 +   xsetp->src(1).mod = i->src(1).mod;
 +   xsetp->setSrc(2, src2);
++   xsetp->ftz = i->ftz;
 +
 +   i = bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), bld.mkImm(0), met, pred);
 +   i->src(2).mod = Modifier(NV50_IR_MOD_NOT);
@@ -3453,24 +3895,25 @@ index 00000000000..4b6df0db588
 +}
 +
 +bool
-+GV100LegalizeSSA::handleSHL(Instruction *i)
++GV100LegalizeSSA::handleShift(Instruction *i)
 +{
-+   if (i->src(0).getFile() != FILE_GPR) {
-+      bld.mkOp3(OP_SHF, i->dType, i->getDef(0), bld.mkImm(0), i->getSrc(1),
-+                i->getSrc(0))->subOp = NV50_IR_SUBOP_SHF_L |
-+                                       NV50_IR_SUBOP_SHF_HI;
++   Value *zero = bld.mkImm(0);
++   Value *src1 = i->getSrc(1);
++   Value *src0, *src2;
++   uint8_t subOp = i->op == OP_SHL ? NV50_IR_SUBOP_SHF_L : NV50_IR_SUBOP_SHF_R;
++
++   if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR) {
++      src0 = i->getSrc(0);
++      src2 = zero;
 +   } else {
-+      bld.mkOp3(OP_SHF, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1),
-+                bld.mkImm(0))->subOp = NV50_IR_SUBOP_SHF_L;
++      src0 = zero;
++      src2 = i->getSrc(0);
++      subOp |= NV50_IR_SUBOP_SHF_HI;
 +   }
-+   return true;
-+}
++   if (i->subOp & NV50_IR_SUBOP_SHIFT_WRAP)
++      subOp |= NV50_IR_SUBOP_SHF_W;
 +
-+bool
-+GV100LegalizeSSA::handleSHR(Instruction *i)
-+{
-+   bld.mkOp3(OP_SHF, i->dType, i->getDef(0), bld.mkImm(0), i->getSrc(1),
-+             i->getSrc(0))->subOp = NV50_IR_SUBOP_SHF_R | NV50_IR_SUBOP_SHF_HI;
++   bld.mkOp3(OP_SHF, i->dType, i->getDef(0), src0, src1, src2)->subOp = subOp;
 +   return true;
 +}
 +
@@ -3481,6 +3924,7 @@ index 00000000000..4b6df0db588
 +      bld.mkOp2(OP_ADD, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1));
 +   xadd->src(0).mod = i->src(0).mod;
 +   xadd->src(1).mod = i->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
++   xadd->ftz = i->ftz;
 +   return true;
 +}
 +
@@ -3490,6 +3934,9 @@ index 00000000000..4b6df0db588
 +   bool lowered = false;
 +
 +   bld.setPosition(i, false);
++   if (i->sType == TYPE_F32 && i->dType != TYPE_F16 &&
++       prog->getType() != Program::TYPE_COMPUTE)
++      handleFTZ(i);
 +
 +   switch (i->op) {
 +   case OP_AND:
@@ -3502,10 +3949,8 @@ index 00000000000..4b6df0db588
 +      lowered = handleNOT(i);
 +      break;
 +   case OP_SHL:
-+      lowered = handleSHL(i);
-+      break;
 +   case OP_SHR:
-+      lowered = handleSHR(i);
++      lowered = handleShift(i);
 +      break;
 +   case OP_SET:
 +   case OP_SET_AND:
@@ -3724,10 +4169,10 @@ index 00000000000..4b6df0db588
 +} // namespace nv50_ir
 diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.h
 new file mode 100644
-index 00000000000..92fdb938244
+index 00000000000..d918c6e83eb
 --- /dev/null
 +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.h
-@@ -0,0 +1,79 @@
+@@ -0,0 +1,78 @@
 +/*
 + * Copyright 2020 Red Hat Inc.
 + *
@@ -3801,14 +4246,13 @@ index 00000000000..92fdb938244
 +   bool handleQUADPOP(Instruction *);
 +   bool handleSET(Instruction *);
 +   bool handleSHFL(Instruction *);
-+   bool handleSHL(Instruction *);
-+   bool handleSHR(Instruction *);
++   bool handleShift(Instruction *);
 +   bool handleSUB(Instruction *);
 +};
 +}
 +#endif
 diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
-index a60881000fe..f100445e9d0 100644
+index a60881000fe..067f9abaca8 100644
 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
 +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
 @@ -310,6 +310,14 @@ NVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
@@ -3860,18 +4304,36 @@ index a60881000fe..f100445e9d0 100644
           bld.mkMovToReg(0, gpEmitAddress);
        }
     }
+@@ -1714,7 +1727,8 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
+          cctl->setPredicate(cas->cc, cas->getPredicate());
+    }
+ 
+-   if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
++   if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS &&
++       targ->getChipset() < NVISA_GV100_CHIPSET) {
+       // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
+       // should be set to the high part of the double reg or bad things will
+       // happen elsewhere in the universe.
 diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
-index b4c405a9ea5..a4925013ee4 100644
+index b4c405a9ea5..8c99427d3c0 100644
 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
 +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
-@@ -68,6 +68,7 @@ private:
+@@ -64,12 +64,14 @@ private:
+    void handleDIV(Instruction *); // integer division, modulus
+    void handleRCPRSQLib(Instruction *, Value *[]);
+    void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
+-   void handleFTZ(Instruction *);
     void handleSET(CmpInstruction *);
     void handleTEXLOD(TexInstruction *);
     void handleShift(Instruction *);
 +   void handleBREV(Instruction *);
  
  protected:
++   void handleFTZ(Instruction *);
++
     BuildUtil bld;
+ };
+ 
 diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
 index 2f46b0e886a..3a4ec3ca561 100644
 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -8661,7 +9123,7 @@ index 8aa7088dfec..d49a5dfd2cf 100644
     PUSH_DATA (push, va + info->count * index_size - 1);
  
 diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
-index 146eeb35f85..d4687b652ba 100644
+index 146eeb35f85..ebbc410184b 100644
 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
 +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
 @@ -27,11 +27,18 @@
@@ -8792,7 +9254,7 @@ index 146eeb35f85..d4687b652ba 100644
  }
  
  static void
-@@ -577,92 +622,182 @@ nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
+@@ -577,92 +622,186 @@ nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
  }
  
  static void
@@ -8972,9 +9434,13 @@ index 146eeb35f85..d4687b652ba 100644
 +
 +   NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
 +   NVC3C0_QMDV02_02_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
-+   NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, VIA_HEADER_INDEX);
++   NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, INDEPENDENTLY);
 +   NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE,
 +                                  align(cp->cp.smem_size, 0x100));
++   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
++                                 (cp->hdr[1] & 0xfffff0) +
++                                 align(cp->cp.lmem_size, 0x10));
++   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
 +   NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
 +                                  gv100_sm_config_smem_size(8 * 1024));
 +   NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE,
@@ -9017,7 +9483,7 @@ index 146eeb35f85..d4687b652ba 100644
  }
  
  static inline void *
-@@ -677,6 +812,7 @@ nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
+@@ -677,6 +816,7 @@ nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
        ptr += adj;
        *pgpuaddr += adj;
     }
@@ -9025,7 +9491,7 @@ index 146eeb35f85..d4687b652ba 100644
     return ptr;
  }
  
-@@ -734,6 +870,9 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
+@@ -734,6 +874,9 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
     if (ret)
        goto out;
  
@@ -9035,7 +9501,7 @@ index 146eeb35f85..d4687b652ba 100644
     if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
        gp100_compute_setup_launch_desc(nvc0, desc, info);
     else
-@@ -743,10 +882,14 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
+@@ -743,10 +886,14 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
  
  #ifndef NDEBUG
     if (debug_get_num_option("NV50_PROG_DEBUG", 0)) {
@@ -9052,7 +9518,7 @@ index 146eeb35f85..d4687b652ba 100644
     }
  #endif
  
-@@ -877,115 +1020,6 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
+@@ -877,115 +1024,6 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
     nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES;
  }
  
diff --git a/SPECS/mesa.spec b/SPECS/mesa.spec
index 4e6c79d..177ac45 100644
--- a/SPECS/mesa.spec
+++ b/SPECS/mesa.spec
@@ -41,8 +41,8 @@
 
 Name:           mesa
 Summary:        Mesa graphics libraries
-Version:        20.1.2
-Release:        3%{?rctag:.%{rctag}}%{?dist}
+Version:        20.1.4
+Release:        1%{?rctag:.%{rctag}}%{?dist}
 
 License:        MIT
 URL:            http://www.mesa3d.org
@@ -56,11 +56,6 @@ Source3:        Makefile
 # Fedora opts to ignore the optional part of clause 2 and treat that code as 2 clause BSD.
 Source4:        Mesa-MLAA-License-Clarification-Email.txt
 
-# fix llvmpipe big-endian (#1847064)
-Patch1: 0001-gallivm-nir-fix-const-loading-on-big-endian-systems.patch
-Patch2: 0001-glsl-fix-constant-packing-for-64-bit-big-endian.patch
-Patch3: 0001-gallivm-nir-fix-big-endian-64-bit-splitting-merging.patch
-
 # Add support for TU11x nvidia
 Patch10: 0001-nir-use-bitfield_insert-instead-of-bfi-in-nir_lower_.patch
 Patch11: nouveau-tu1xx-support.patch
@@ -551,6 +546,10 @@ done
 %endif
 
 %changelog
+* Wed Aug 05 2020 Dave Airlie <airlied@redhat.com> - 20.1.4-1
+- Update to 20.1.4
+- Update nouveau tu1xx support patch (Karol)
+
 * Mon Jun 29 2020 Dave Airlie <airlied@redhat.com> - 20.1.2-3
 - a fix on top of the big-endian fix (#1847064)