diff --git a/.gitignore b/.gitignore
index 94c9bc6..a4e3e90 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1 @@
-SOURCES/mesa-20.1.4.tar.xz
+SOURCES/mesa-20.3.3.tar.xz
diff --git a/.mesa.metadata b/.mesa.metadata
index ca12abc..3d34979 100644
--- a/.mesa.metadata
+++ b/.mesa.metadata
@@ -1 +1 @@
-78243cd7152a8ba759f8f2bdfcf0a877b455e351 SOURCES/mesa-20.1.4.tar.xz
+c0e42fada2b306a6d9740376398c0d8b0a130427 SOURCES/mesa-20.3.3.tar.xz
diff --git a/SOURCES/0001-nir-use-bitfield_insert-instead-of-bfi-in-nir_lower_.patch b/SOURCES/0001-nir-use-bitfield_insert-instead-of-bfi-in-nir_lower_.patch
deleted file mode 100644
index 0daf825..0000000
--- a/SOURCES/0001-nir-use-bitfield_insert-instead-of-bfi-in-nir_lower_.patch
+++ /dev/null
@@ -1,34 +0,0 @@
-From d3ec950f0d8492b980a91844ffd744d7e7824277 Mon Sep 17 00:00:00 2001
-From: Ben Skeggs <bskeggs@redhat.com>
-Date: Sat, 6 Jun 2020 16:58:00 +1000
-Subject: [PATCH] nir: use bitfield_insert instead of bfi in
- nir_lower_double_ops
-
-NVIDIA hardware doesn't have an equivilant to bfi, but we do already have
-a lowering for bitfield_insert->bfi.
-
-Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
-Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
-Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5373>
----
- src/compiler/nir/nir_lower_double_ops.c | 4 +++-
- 1 file changed, 3 insertions(+), 1 deletion(-)
-
-diff --git a/src/compiler/nir/nir_lower_double_ops.c b/src/compiler/nir/nir_lower_double_ops.c
-index f9c93a910a5..73226fd62ef 100644
---- a/src/compiler/nir/nir_lower_double_ops.c
-+++ b/src/compiler/nir/nir_lower_double_ops.c
-@@ -49,7 +49,9 @@ set_exponent(nir_builder *b, nir_ssa_def *src, nir_ssa_def *exp)
-    /* The exponent is bits 52-62, or 20-30 of the high word, so set the exponent
-     * to 1023
-     */
--   nir_ssa_def *new_hi = nir_bfi(b, nir_imm_int(b, 0x7ff00000), exp, hi);
-+   nir_ssa_def *new_hi = nir_bitfield_insert(b, hi, exp,
-+                                             nir_imm_int(b, 20),
-+                                             nir_imm_int(b, 11));
-    /* recombine */
-    return nir_pack_64_2x32_split(b, lo, new_hi);
- }
--- 
-2.26.2
-
diff --git a/SOURCES/Makefile b/SOURCES/Makefile
index 8396596..eea9f33 100644
--- a/SOURCES/Makefile
+++ b/SOURCES/Makefile
@@ -1,4 +1,4 @@
-VERSION ?= 20.1.4
+VERSION ?= 20.3.3
 SANITIZE ?= 1
 
 DIRNAME = mesa-${VERSION}
diff --git a/SOURCES/anv-remove-warning.patch b/SOURCES/anv-remove-warning.patch
new file mode 100644
index 0000000..130a050
--- /dev/null
+++ b/SOURCES/anv-remove-warning.patch
@@ -0,0 +1,13 @@
+diff -up mesa-20.3.3/src/intel/vulkan/anv_perf.c.dma mesa-20.3.3/src/intel/vulkan/anv_perf.c
+--- mesa-20.3.3/src/intel/vulkan/anv_perf.c.dma	2021-02-16 12:56:09.881084752 +1000
++++ mesa-20.3.3/src/intel/vulkan/anv_perf.c	2021-02-16 12:56:14.626213956 +1000
+@@ -47,9 +47,6 @@ anv_get_perf(const struct gen_device_inf
+    gen_perf_init_metrics(perf, devinfo, fd, false /* pipeline statistics */);
+ 
+    if (!perf->n_queries) {
+-      if (perf->platform_supported)
+-         mesa_logw("Performance support disabled, "
+-                   "consider sysctl dev.i915.perf_stream_paranoid=0\n");
+       goto err;
+    }
+ 
diff --git a/SOURCES/cpu-affinity-fixes-20.3.3.patch b/SOURCES/cpu-affinity-fixes-20.3.3.patch
new file mode 100644
index 0000000..d11f5c4
--- /dev/null
+++ b/SOURCES/cpu-affinity-fixes-20.3.3.patch
@@ -0,0 +1,1583 @@
+diff --git a/src/amd/compiler/tests/main.cpp b/src/amd/compiler/tests/main.cpp
+index cb646e2dd30..eac0a244adf 100644
+--- a/src/amd/compiler/tests/main.cpp
++++ b/src/amd/compiler/tests/main.cpp
+@@ -34,6 +34,8 @@
+ #include "aco_ir.h"
+ #include "framework.h"
+ 
++#include "util/u_cpu_detect.h"
++
+ static const char *help_message =
+    "Usage: %s [-h] [-l --list] [--no-check] [TEST [TEST ...]]\n"
+    "\n"
+@@ -227,6 +229,8 @@ int main(int argc, char **argv)
+       return 99;
+    }
+ 
++   util_cpu_detect();
++
+    if (do_list) {
+       for (auto test : tests)
+          printf("%s\n", test.first.c_str());
+diff --git a/src/compiler/glsl/standalone.cpp b/src/compiler/glsl/standalone.cpp
+index ca187001186..2714d8b95ed 100644
+--- a/src/compiler/glsl/standalone.cpp
++++ b/src/compiler/glsl/standalone.cpp
+@@ -401,6 +401,8 @@ standalone_compile_shader(const struct standalone_options *_options,
+    int status = EXIT_SUCCESS;
+    bool glsl_es = false;
+ 
++   util_cpu_detect();
++
+    options = _options;
+ 
+    switch (options->glsl_version) {
+diff --git a/src/compiler/nir/tests/negative_equal_tests.cpp b/src/compiler/nir/tests/negative_equal_tests.cpp
+index f83041a4fbf..76472e48309 100644
+--- a/src/compiler/nir/tests/negative_equal_tests.cpp
++++ b/src/compiler/nir/tests/negative_equal_tests.cpp
+@@ -36,6 +36,7 @@ protected:
+    const_value_negative_equal_test()
+    {
+       glsl_type_singleton_init_or_ref();
++      util_cpu_detect();
+ 
+       memset(c1, 0, sizeof(c1));
+       memset(c2, 0, sizeof(c2));
+@@ -55,6 +56,7 @@ protected:
+    alu_srcs_negative_equal_test()
+    {
+       glsl_type_singleton_init_or_ref();
++      util_cpu_detect();
+ 
+       static const nir_shader_compiler_options options = { };
+       nir_builder_init_simple_shader(&bld, NULL, MESA_SHADER_VERTEX, &options);
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+index 165d73d94fc..33269e528fe 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
++++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+@@ -104,13 +104,13 @@ lp_build_min_simple(struct lp_build_context *bld,
+ 
+    /* TODO: optimize the constant case */
+ 
+-   if (type.floating && util_cpu_caps.has_sse) {
++   if (type.floating && util_get_cpu_caps()->has_sse) {
+       if (type.width == 32) {
+          if (type.length == 1) {
+             intrinsic = "llvm.x86.sse.min.ss";
+             intr_size = 128;
+          }
+-         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
++         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
+             intrinsic = "llvm.x86.sse.min.ps";
+             intr_size = 128;
+          }
+@@ -119,12 +119,12 @@ lp_build_min_simple(struct lp_build_context *bld,
+             intr_size = 256;
+          }
+       }
+-      if (type.width == 64 && util_cpu_caps.has_sse2) {
++      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
+          if (type.length == 1) {
+             intrinsic = "llvm.x86.sse2.min.sd";
+             intr_size = 128;
+          }
+-         else if (type.length == 2 || !util_cpu_caps.has_avx) {
++         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
+             intrinsic = "llvm.x86.sse2.min.pd";
+             intr_size = 128;
+          }
+@@ -134,7 +134,7 @@ lp_build_min_simple(struct lp_build_context *bld,
+          }
+       }
+    }
+-   else if (type.floating && util_cpu_caps.has_altivec) {
++   else if (type.floating && util_get_cpu_caps()->has_altivec) {
+       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
+           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
+          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
+@@ -144,7 +144,7 @@ lp_build_min_simple(struct lp_build_context *bld,
+          intrinsic = "llvm.ppc.altivec.vminfp";
+          intr_size = 128;
+       }
+-   } else if (util_cpu_caps.has_altivec) {
++   } else if (util_get_cpu_caps()->has_altivec) {
+       intr_size = 128;
+       if (type.width == 8) {
+          if (!type.sign) {
+@@ -174,7 +174,7 @@ lp_build_min_simple(struct lp_build_context *bld,
+        * The sse intrinsics return the second operator in case of nan by
+        * default so we need to special code to handle those.
+        */
+-      if (util_cpu_caps.has_sse && type.floating &&
++      if (util_get_cpu_caps()->has_sse && type.floating &&
+           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
+           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
+           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
+@@ -274,13 +274,13 @@ lp_build_max_simple(struct lp_build_context *bld,
+ 
+    /* TODO: optimize the constant case */
+ 
+-   if (type.floating && util_cpu_caps.has_sse) {
++   if (type.floating && util_get_cpu_caps()->has_sse) {
+       if (type.width == 32) {
+          if (type.length == 1) {
+             intrinsic = "llvm.x86.sse.max.ss";
+             intr_size = 128;
+          }
+-         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
++         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
+             intrinsic = "llvm.x86.sse.max.ps";
+             intr_size = 128;
+          }
+@@ -289,12 +289,12 @@ lp_build_max_simple(struct lp_build_context *bld,
+             intr_size = 256;
+          }
+       }
+-      if (type.width == 64 && util_cpu_caps.has_sse2) {
++      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
+          if (type.length == 1) {
+             intrinsic = "llvm.x86.sse2.max.sd";
+             intr_size = 128;
+          }
+-         else if (type.length == 2 || !util_cpu_caps.has_avx) {
++         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
+             intrinsic = "llvm.x86.sse2.max.pd";
+             intr_size = 128;
+          }
+@@ -304,7 +304,7 @@ lp_build_max_simple(struct lp_build_context *bld,
+          }
+       }
+    }
+-   else if (type.floating && util_cpu_caps.has_altivec) {
++   else if (type.floating && util_get_cpu_caps()->has_altivec) {
+       if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
+           nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
+          debug_printf("%s: altivec doesn't support nan return nan behavior\n",
+@@ -314,7 +314,7 @@ lp_build_max_simple(struct lp_build_context *bld,
+          intrinsic = "llvm.ppc.altivec.vmaxfp";
+          intr_size = 128;
+       }
+-   } else if (util_cpu_caps.has_altivec) {
++   } else if (util_get_cpu_caps()->has_altivec) {
+      intr_size = 128;
+      if (type.width == 8) {
+        if (!type.sign) {
+@@ -338,7 +338,7 @@ lp_build_max_simple(struct lp_build_context *bld,
+    }
+ 
+    if (intrinsic) {
+-      if (util_cpu_caps.has_sse && type.floating &&
++      if (util_get_cpu_caps()->has_sse && type.floating &&
+           nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
+           nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
+           nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
+@@ -472,12 +472,12 @@ lp_build_add(struct lp_build_context *bld,
+             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
+          }
+          if (type.width * type.length == 128) {
+-            if (util_cpu_caps.has_sse2) {
++            if (util_get_cpu_caps()->has_sse2) {
+                if (type.width == 8)
+                  intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
+                if (type.width == 16)
+                  intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
+-            } else if (util_cpu_caps.has_altivec) {
++            } else if (util_get_cpu_caps()->has_altivec) {
+                if (type.width == 8)
+                   intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
+                if (type.width == 16)
+@@ -485,7 +485,7 @@ lp_build_add(struct lp_build_context *bld,
+             }
+          }
+          if (type.width * type.length == 256) {
+-            if (util_cpu_caps.has_avx2) {
++            if (util_get_cpu_caps()->has_avx2) {
+                if (type.width == 8)
+                   intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
+                if (type.width == 16)
+@@ -713,11 +713,11 @@ lp_build_hadd_partial4(struct lp_build_context *bld,
+    tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
+    tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
+ 
+-   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
++   if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
+        bld->type.length == 4) {
+       intrinsic = "llvm.x86.sse3.hadd.ps";
+    }
+-   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
++   else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
+             bld->type.length == 8) {
+       intrinsic = "llvm.x86.avx.hadd.ps.256";
+    }
+@@ -796,12 +796,12 @@ lp_build_sub(struct lp_build_context *bld,
+             return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
+          }
+          if (type.width * type.length == 128) {
+-            if (util_cpu_caps.has_sse2) {
++            if (util_get_cpu_caps()->has_sse2) {
+                if (type.width == 8)
+                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
+                if (type.width == 16)
+                   intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
+-            } else if (util_cpu_caps.has_altivec) {
++            } else if (util_get_cpu_caps()->has_altivec) {
+                if (type.width == 8)
+                   intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
+                if (type.width == 16)
+@@ -809,7 +809,7 @@ lp_build_sub(struct lp_build_context *bld,
+             }
+          }
+          if (type.width * type.length == 256) {
+-            if (util_cpu_caps.has_avx2) {
++            if (util_get_cpu_caps()->has_avx2) {
+                if (type.width == 8)
+                   intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
+                if (type.width == 16)
+@@ -1078,8 +1078,8 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
+     */
+    if (LLVM_VERSION_MAJOR < 7 &&
+        (bld->type.length == 4 || bld->type.length == 8) &&
+-       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
+-        util_cpu_caps.has_sse4_1)) {
++       ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
++        util_get_cpu_caps()->has_sse4_1)) {
+       const char *intrinsic = NULL;
+       LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
+       LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
+@@ -1096,7 +1096,7 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
+       aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
+       bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
+ 
+-      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
++      if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
+          if (bld->type.sign) {
+             intrinsic = "llvm.x86.avx2.pmul.dq";
+          } else {
+@@ -1331,8 +1331,8 @@ lp_build_div(struct lp_build_context *bld,
+ 
+    /* fast rcp is disabled (just uses div), so makes no sense to try that */
+    if(FALSE &&
+-      ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+-       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
++      ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
++       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
+       type.floating)
+       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
+ 
+@@ -1745,7 +1745,7 @@ lp_build_abs(struct lp_build_context *bld,
+       return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
+    }
+ 
+-   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
++   if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
+       switch(type.width) {
+       case 8:
+          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
+@@ -1755,7 +1755,7 @@ lp_build_abs(struct lp_build_context *bld,
+          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
+       }
+    }
+-   else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
++   else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
+       switch(type.width) {
+       case 8:
+          return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
+@@ -1897,15 +1897,15 @@ lp_build_int_to_float(struct lp_build_context *bld,
+ static boolean
+ arch_rounding_available(const struct lp_type type)
+ {
+-   if ((util_cpu_caps.has_sse4_1 &&
++   if ((util_get_cpu_caps()->has_sse4_1 &&
+        (type.length == 1 || type.width*type.length == 128)) ||
+-       (util_cpu_caps.has_avx && type.width*type.length == 256) ||
+-       (util_cpu_caps.has_avx512f && type.width*type.length == 512))
++       (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
++       (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
+       return TRUE;
+-   else if ((util_cpu_caps.has_altivec &&
++   else if ((util_get_cpu_caps()->has_altivec &&
+             (type.width == 32 && type.length == 4)))
+       return TRUE;
+-   else if (util_cpu_caps.has_neon)
++   else if (util_get_cpu_caps()->has_neon)
+       return TRUE;
+ 
+    return FALSE;
+@@ -1935,7 +1935,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
+    assert(type.width == 32);
+ 
+    assert(lp_check_value(type, a));
+-   assert(util_cpu_caps.has_sse2);
++   assert(util_get_cpu_caps()->has_sse2);
+ 
+    /* This is relying on MXCSR rounding mode, which should always be nearest. */
+    if (type.length == 1) {
+@@ -1961,7 +1961,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
+       }
+       else {
+          assert(type.width*type.length == 256);
+-         assert(util_cpu_caps.has_avx);
++         assert(util_get_cpu_caps()->has_avx);
+ 
+          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
+       }
+@@ -1987,7 +1987,7 @@ lp_build_round_altivec(struct lp_build_context *bld,
+    assert(type.floating);
+ 
+    assert(lp_check_value(type, a));
+-   assert(util_cpu_caps.has_altivec);
++   assert(util_get_cpu_caps()->has_altivec);
+ 
+    (void)type;
+ 
+@@ -2014,7 +2014,7 @@ lp_build_round_arch(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     enum lp_build_round_mode mode)
+ {
+-   if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
++   if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) {
+       LLVMBuilderRef builder = bld->gallivm->builder;
+       const struct lp_type type = bld->type;
+       const char *intrinsic_root;
+@@ -2042,7 +2042,7 @@ lp_build_round_arch(struct lp_build_context *bld,
+       lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
+       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
+    }
+-   else /* (util_cpu_caps.has_altivec) */
++   else /* (util_get_cpu_caps()->has_altivec) */
+      return lp_build_round_altivec(bld, a, mode);
+ }
+ 
+@@ -2377,9 +2377,9 @@ lp_build_iround(struct lp_build_context *bld,
+ 
+    assert(lp_check_value(type, a));
+ 
+-   if ((util_cpu_caps.has_sse2 &&
++   if ((util_get_cpu_caps()->has_sse2 &&
+        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
+-       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
++       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
+       return lp_build_iround_nearest_sse2(bld, a);
+    }
+    if (arch_rounding_available(type)) {
+@@ -2664,8 +2664,8 @@ lp_build_rcp(struct lp_build_context *bld,
+     * particular uses that require less workarounds.
+     */
+ 
+-   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+-         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
++   if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
++         (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
+       const unsigned num_iterations = 0;
+       LLVMValueRef res;
+       unsigned i;
+@@ -2784,8 +2784,8 @@ lp_build_fast_rsqrt_available(struct lp_type type)
+ {
+    assert(type.floating);
+ 
+-   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+-       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
++   if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
++       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
+       return true;
+    }
+    return false;
+@@ -3694,7 +3694,7 @@ lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
+ LLVMValueRef
+ lp_build_fpstate_get(struct gallivm_state *gallivm)
+ {
+-   if (util_cpu_caps.has_sse) {
++   if (util_get_cpu_caps()->has_sse) {
+       LLVMBuilderRef builder = gallivm->builder;
+       LLVMValueRef mxcsr_ptr = lp_build_alloca(
+          gallivm,
+@@ -3715,7 +3715,7 @@ void
+ lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
+                                   boolean zero)
+ {
+-   if (util_cpu_caps.has_sse) {
++   if (util_get_cpu_caps()->has_sse) {
+       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
+       int daz_ftz = _MM_FLUSH_ZERO_MASK;
+ 
+@@ -3724,7 +3724,7 @@ lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
+       LLVMValueRef mxcsr =
+          LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
+ 
+-      if (util_cpu_caps.has_daz) {
++      if (util_get_cpu_caps()->has_daz) {
+          /* Enable denormals are zero mode */
+          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
+       }
+@@ -3745,7 +3745,7 @@ void
+ lp_build_fpstate_set(struct gallivm_state *gallivm,
+                      LLVMValueRef mxcsr_ptr)
+ {
+-   if (util_cpu_caps.has_sse) {
++   if (util_get_cpu_caps()->has_sse) {
+       LLVMBuilderRef builder = gallivm->builder;
+       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
+                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+index c68b8850473..af445b00c1a 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
++++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+@@ -101,7 +101,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
+    LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
+    LLVMValueRef h;
+ 
+-   if (util_cpu_caps.has_f16c &&
++   if (util_get_cpu_caps()->has_f16c &&
+        (src_length == 4 || src_length == 8)) {
+       if (LLVM_VERSION_MAJOR < 11) {
+          const char *intrinsic = NULL;
+@@ -167,7 +167,7 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
+     * useless.
+     */
+ 
+-   if (util_cpu_caps.has_f16c &&
++   if (util_get_cpu_caps()->has_f16c &&
+        (length == 4 || length == 8)) {
+       struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
+       unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
+@@ -489,7 +489,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
+ 
+       /* Special case 4x4x32 --> 1x16x8 */
+       if (src_type.length == 4 &&
+-            (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
++            (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
+       {
+          num_dsts = (num_srcs + 3) / 4;
+          dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
+@@ -500,7 +500,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
+ 
+       /* Special case 2x8x32 --> 1x16x8 */
+       if (src_type.length == 8 &&
+-          util_cpu_caps.has_avx)
++          util_get_cpu_caps()->has_avx)
+       {
+          num_dsts = (num_srcs + 1) / 2;
+          dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;
+@@ -597,7 +597,7 @@ lp_build_conv(struct gallivm_state *gallivm,
+        ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
+         (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
+ 
+-       (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
++       (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec))
+    {
+       struct lp_build_context bld;
+       struct lp_type int16_type, int32_type;
+@@ -710,7 +710,7 @@ lp_build_conv(struct gallivm_state *gallivm,
+       ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
+        (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
+ 
+-      util_cpu_caps.has_avx) {
++      util_get_cpu_caps()->has_avx) {
+ 
+       struct lp_build_context bld;
+       struct lp_type int16_type, int32_type;
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
+index 174857e06d9..e17c7881e7d 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
++++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
+@@ -642,8 +642,8 @@ s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
+        * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
+        * Much cheaper (but we don't care that much if n == 1).
+        */
+-      if ((util_cpu_caps.has_sse2 && n == 4) ||
+-          (util_cpu_caps.has_avx2 && n == 8)) {
++      if ((util_get_cpu_caps()->has_sse2 && n == 4) ||
++          (util_get_cpu_caps()->has_avx2 && n == 8)) {
+          color2_2 = lp_build_pavgb(&bld8, colors0, colors1);
+          color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
+       }
+@@ -1350,7 +1350,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
+    if (is_dxt1_variant) {
+       LLVMValueRef color23_2, color2_2;
+ 
+-      if (util_cpu_caps.has_sse2) {
++      if (util_get_cpu_caps()->has_sse2) {
+          LLVMValueRef intrargs[2];
+          intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
+          /* same interleave as for lerp23 - correct result in 2nd element */
+@@ -1389,7 +1389,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
+       color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
+    }
+ 
+-   if (util_cpu_caps.has_ssse3) {
++   if (util_get_cpu_caps()->has_ssse3) {
+       /*
+        * Use pshufb as mini-lut. (Only doable with intrinsics as the
+        * final shuffles are non-constant. pshufb is awesome!)
+@@ -1689,7 +1689,7 @@ s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
+    type16.sign = FALSE;
+    sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
+ 
+-   if (!util_cpu_caps.has_ssse3) {
++   if (!util_get_cpu_caps()->has_ssse3) {
+       LLVMValueRef acodeg, mask1, acode0, acode1;
+ 
+       /* extraction of the 3 bit values into something more useful is HARD */
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+index 121452d7596..97deffe1de0 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
++++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+@@ -90,7 +90,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm,
+     * per element. Didn't measure performance but cuts shader size
+     * by quite a bit (less difference if cpu has no sse4.1 support).
+     */
+-   if (util_cpu_caps.has_sse2 && n > 1) {
++   if (util_get_cpu_caps()->has_sse2 && n > 1) {
+       LLVMValueRef sel, tmp, tmp2;
+       struct lp_build_context bld32;
+ 
+@@ -174,7 +174,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm,
+     * per element. Didn't measure performance but cuts shader size
+     * by quite a bit (less difference if cpu has no sse4.1 support).
+     */
+-   if (util_cpu_caps.has_sse2 && n > 1) {
++   if (util_get_cpu_caps()->has_sse2 && n > 1) {
+       LLVMValueRef sel, tmp;
+       struct lp_build_context bld32;
+ 
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+index e991b0dc375..42cc17371a0 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
++++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+@@ -488,7 +488,7 @@ lp_build_gather(struct gallivm_state *gallivm,
+        * 32bit/64bit fetches you're doing it wrong (this is gather, not
+        * conversion) and it would be awkward for floats.
+        */
+-   } else if (util_cpu_caps.has_avx2 && !need_expansion &&
++   } else if (util_get_cpu_caps()->has_avx2 && !need_expansion &&
+               src_width == 32 && (length == 4 || length == 8)) {
+       return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+                                   base_ptr, offsets);
+@@ -500,7 +500,7 @@ lp_build_gather(struct gallivm_state *gallivm,
+     * (In general, should be more of a win if the fetch is 256bit wide -
+     * this is true for the 32bit case above too.)
+     */
+-   } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
++   } else if (0 && util_get_cpu_caps()->has_avx2 && !need_expansion &&
+               src_width == 64 && (length == 2 || length == 4)) {
+       return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
+                                   base_ptr, offsets);
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
+index 685ed0e58aa..dd428242cb9 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
++++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
+@@ -433,6 +433,7 @@ lp_build_init(void)
+    /* For simulating less capable machines */
+ #ifdef DEBUG
+    if (debug_get_bool_option("LP_FORCE_SSE2", FALSE)) {
++      extern struct util_cpu_caps_t util_cpu_caps;
+       assert(util_cpu_caps.has_sse2);
+       util_cpu_caps.has_sse3 = 0;
+       util_cpu_caps.has_ssse3 = 0;
+@@ -445,7 +446,7 @@ lp_build_init(void)
+    }
+ #endif
+ 
+-   if (util_cpu_caps.has_avx2 || util_cpu_caps.has_avx) {
++   if (util_get_cpu_caps()->has_avx2 || util_get_cpu_caps()->has_avx) {
+       lp_native_vector_width = 256;
+    } else {
+       /* Leave it at 128, even when no SIMD extensions are available.
+@@ -460,16 +461,16 @@ lp_build_init(void)
+ #if LLVM_VERSION_MAJOR < 4
+    if (lp_native_vector_width <= 128) {
+       /* Hide AVX support, as often LLVM AVX intrinsics are only guarded by
+-       * "util_cpu_caps.has_avx" predicate, and lack the
++       * "util_get_cpu_caps()->has_avx" predicate, and lack the
+        * "lp_native_vector_width > 128" predicate. And also to ensure a more
+        * consistent behavior, allowing one to test SSE2 on AVX machines.
+        * XXX: should not play games with util_cpu_caps directly as it might
+        * get used for other things outside llvm too.
+        */
+-      util_cpu_caps.has_avx = 0;
+-      util_cpu_caps.has_avx2 = 0;
+-      util_cpu_caps.has_f16c = 0;
+-      util_cpu_caps.has_fma = 0;
++      util_get_cpu_caps()->has_avx = 0;
++      util_get_cpu_caps()->has_avx2 = 0;
++      util_get_cpu_caps()->has_f16c = 0;
++      util_get_cpu_caps()->has_fma = 0;
+    }
+ #endif
+ 
+@@ -482,7 +483,7 @@ lp_build_init(void)
+     * Right now denorms get explicitly disabled (but elsewhere) for x86,
+     * whereas ppc64 explicitly enables them...
+     */
+-   if (util_cpu_caps.has_altivec) {
++   if (util_get_cpu_caps()->has_altivec) {
+       unsigned short mask[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+                                 0xFFFF, 0xFFFF, 0xFFFE, 0xFFFF };
+       __asm (
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+index 315977ae745..3ed3b5a74b1 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
++++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+@@ -196,7 +196,7 @@ lp_build_compare(struct gallivm_state *gallivm,
+ 
+    if (!type.floating && !type.sign &&
+        type.width * type.length == 128 &&
+-       util_cpu_caps.has_sse2 &&
++       util_get_cpu_caps()->has_sse2 &&
+        (func == PIPE_FUNC_LESS ||
+         func == PIPE_FUNC_LEQUAL ||
+         func == PIPE_FUNC_GREATER ||
+@@ -348,11 +348,11 @@ lp_build_select(struct lp_build_context *bld,
+ 
+       res = LLVMBuildSelect(builder, mask, a, b, "");
+    }
+-   else if (((util_cpu_caps.has_sse4_1 &&
++   else if (((util_get_cpu_caps()->has_sse4_1 &&
+               type.width * type.length == 128) ||
+-             (util_cpu_caps.has_avx &&
++             (util_get_cpu_caps()->has_avx &&
+               type.width * type.length == 256 && type.width >= 32) ||
+-             (util_cpu_caps.has_avx2 &&
++             (util_get_cpu_caps()->has_avx2 &&
+               type.width * type.length == 256)) &&
+             !LLVMIsConstant(a) &&
+             !LLVMIsConstant(b) &&
+@@ -379,7 +379,7 @@ lp_build_select(struct lp_build_context *bld,
+             intrinsic = "llvm.x86.avx.blendv.ps.256";
+             arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
+          } else {
+-            assert(util_cpu_caps.has_avx2);
++            assert(util_get_cpu_caps()->has_avx2);
+             intrinsic = "llvm.x86.avx2.pblendvb";
+             arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32);
+          }
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+index 9b75676a4e2..4f3e696816c 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
++++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+@@ -400,22 +400,22 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+     * http://llvm.org/PR19429
+     * http://llvm.org/PR16721
+     */
+-   MAttrs.push_back(util_cpu_caps.has_sse    ? "+sse"    : "-sse"   );
+-   MAttrs.push_back(util_cpu_caps.has_sse2   ? "+sse2"   : "-sse2"  );
+-   MAttrs.push_back(util_cpu_caps.has_sse3   ? "+sse3"   : "-sse3"  );
+-   MAttrs.push_back(util_cpu_caps.has_ssse3  ? "+ssse3"  : "-ssse3" );
+-   MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse4.1" : "-sse4.1");
+-   MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse4.2" : "-sse4.2");
++   MAttrs.push_back(util_get_cpu_caps()->has_sse    ? "+sse"    : "-sse"   );
++   MAttrs.push_back(util_get_cpu_caps()->has_sse2   ? "+sse2"   : "-sse2"  );
++   MAttrs.push_back(util_get_cpu_caps()->has_sse3   ? "+sse3"   : "-sse3"  );
++   MAttrs.push_back(util_get_cpu_caps()->has_ssse3  ? "+ssse3"  : "-ssse3" );
++   MAttrs.push_back(util_get_cpu_caps()->has_sse4_1 ? "+sse4.1" : "-sse4.1");
++   MAttrs.push_back(util_get_cpu_caps()->has_sse4_2 ? "+sse4.2" : "-sse4.2");
+    /*
+     * AVX feature is not automatically detected from CPUID by the X86 target
+     * yet, because the old (yet default) JIT engine is not capable of
+     * emitting the opcodes. On newer llvm versions it is and at least some
+     * versions (tested with 3.3) will emit avx opcodes without this anyway.
+     */
+-   MAttrs.push_back(util_cpu_caps.has_avx  ? "+avx"  : "-avx");
+-   MAttrs.push_back(util_cpu_caps.has_f16c ? "+f16c" : "-f16c");
+-   MAttrs.push_back(util_cpu_caps.has_fma  ? "+fma"  : "-fma");
+-   MAttrs.push_back(util_cpu_caps.has_avx2 ? "+avx2" : "-avx2");
++   MAttrs.push_back(util_get_cpu_caps()->has_avx  ? "+avx"  : "-avx");
++   MAttrs.push_back(util_get_cpu_caps()->has_f16c ? "+f16c" : "-f16c");
++   MAttrs.push_back(util_get_cpu_caps()->has_fma  ? "+fma"  : "-fma");
++   MAttrs.push_back(util_get_cpu_caps()->has_avx2 ? "+avx2" : "-avx2");
+    /* disable avx512 and all subvariants */
+    MAttrs.push_back("-avx512cd");
+    MAttrs.push_back("-avx512er");
+@@ -426,7 +426,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+    MAttrs.push_back("-avx512vl");
+ #endif
+ #if defined(PIPE_ARCH_ARM)
+-   if (!util_cpu_caps.has_neon) {
++   if (!util_get_cpu_caps()->has_neon) {
+       MAttrs.push_back("-neon");
+       MAttrs.push_back("-crypto");
+       MAttrs.push_back("-vfp2");
+@@ -434,7 +434,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+ #endif
+ 
+ #if defined(PIPE_ARCH_PPC)
+-   MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec");
++   MAttrs.push_back(util_get_cpu_caps()->has_altivec ? "+altivec" : "-altivec");
+ #if (LLVM_VERSION_MAJOR < 4)
+    /*
+     * Make sure VSX instructions are disabled
+@@ -444,7 +444,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+     * https://llvm.org/bugs/show_bug.cgi?id=33531 (fixed in 4.0)
+     * https://llvm.org/bugs/show_bug.cgi?id=34647 (llc performance on certain unusual shader IR; intro'd in 4.0, pending as of 5.0)
+     */
+-   if (util_cpu_caps.has_altivec) {
++   if (util_get_cpu_caps()->has_altivec) {
+       MAttrs.push_back("-vsx");
+    }
+ #else
+@@ -458,8 +458,8 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+     * Make sure VSX instructions are ENABLED (if supported), unless
+     * VSX instructions are explicitly enabled/disabled via GALLIVM_VSX=1 or 0.
+     */
+-   if (util_cpu_caps.has_altivec) {
+-      MAttrs.push_back(util_cpu_caps.has_vsx ? "+vsx" : "-vsx");
++   if (util_get_cpu_caps()->has_altivec) {
++      MAttrs.push_back(util_get_cpu_caps()->has_vsx ? "+vsx" : "-vsx");
+    }
+ #endif
+ #endif
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+index e1f652a9342..76e57c52f80 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
++++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+@@ -322,7 +322,7 @@ lp_build_interleave2(struct gallivm_state *gallivm,
+ {
+    LLVMValueRef shuffle;
+ 
+-   if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
++   if (type.length == 2 && type.width == 128 && util_get_cpu_caps()->has_avx) {
+       /*
+        * XXX: This is a workaround for llvm code generation deficiency. Strangely
+        * enough, while this needs vinsertf128/vextractf128 instructions (hence
+@@ -484,7 +484,7 @@ lp_build_unpack2_native(struct gallivm_state *gallivm,
+ 
+    /* Interleave bits */
+ #if UTIL_ARCH_LITTLE_ENDIAN
+-   if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) {
++   if (src_type.length * src_type.width == 256 && util_get_cpu_caps()->has_avx2) {
+       *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
+       *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
+    } else {
+@@ -585,22 +585,22 @@ lp_build_pack2(struct gallivm_state *gallivm,
+    assert(src_type.length * 2 == dst_type.length);
+ 
+    /* Check for special cases first */
+-   if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
++   if ((util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec) &&
+         src_type.width * src_type.length >= 128) {
+       const char *intrinsic = NULL;
+       boolean swap_intrinsic_operands = FALSE;
+ 
+       switch(src_type.width) {
+       case 32:
+-         if (util_cpu_caps.has_sse2) {
++         if (util_get_cpu_caps()->has_sse2) {
+            if (dst_type.sign) {
+               intrinsic = "llvm.x86.sse2.packssdw.128";
+            } else {
+-              if (util_cpu_caps.has_sse4_1) {
++              if (util_get_cpu_caps()->has_sse4_1) {
+                  intrinsic = "llvm.x86.sse41.packusdw";
+               }
+            }
+-         } else if (util_cpu_caps.has_altivec) {
++         } else if (util_get_cpu_caps()->has_altivec) {
+             if (dst_type.sign) {
+                intrinsic = "llvm.ppc.altivec.vpkswss";
+             } else {
+@@ -613,18 +613,18 @@ lp_build_pack2(struct gallivm_state *gallivm,
+          break;
+       case 16:
+          if (dst_type.sign) {
+-            if (util_cpu_caps.has_sse2) {
++            if (util_get_cpu_caps()->has_sse2) {
+                intrinsic = "llvm.x86.sse2.packsswb.128";
+-            } else if (util_cpu_caps.has_altivec) {
++            } else if (util_get_cpu_caps()->has_altivec) {
+                intrinsic = "llvm.ppc.altivec.vpkshss";
+ #if UTIL_ARCH_LITTLE_ENDIAN
+                swap_intrinsic_operands = TRUE;
+ #endif
+             }
+          } else {
+-            if (util_cpu_caps.has_sse2) {
++            if (util_get_cpu_caps()->has_sse2) {
+                intrinsic = "llvm.x86.sse2.packuswb.128";
+-            } else if (util_cpu_caps.has_altivec) {
++            } else if (util_get_cpu_caps()->has_altivec) {
+                intrinsic = "llvm.ppc.altivec.vpkshus";
+ #if UTIL_ARCH_LITTLE_ENDIAN
+                swap_intrinsic_operands = TRUE;
+@@ -740,7 +740,7 @@ lp_build_pack2_native(struct gallivm_state *gallivm,
+ 
+    /* At this point only have special case for avx2 */
+    if (src_type.length * src_type.width == 256 &&
+-       util_cpu_caps.has_avx2) {
++       util_get_cpu_caps()->has_avx2) {
+       switch(src_type.width) {
+       case 32:
+          if (dst_type.sign) {
+@@ -793,7 +793,7 @@ lp_build_packs2(struct gallivm_state *gallivm,
+ 
+    /* All X86 SSE non-interleaved pack instructions take signed inputs and
+     * saturate them, so no need to clamp for those cases. */
+-   if(util_cpu_caps.has_sse2 &&
++   if(util_get_cpu_caps()->has_sse2 &&
+       src_type.width * src_type.length >= 128 &&
+       src_type.sign &&
+       (src_type.width == 32 || src_type.width == 16))
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+index 686abc08620..98dcde912b5 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
++++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+@@ -1152,7 +1152,7 @@ lp_build_minify(struct lp_build_context *bld,
+       LLVMValueRef size;
+       assert(bld->type.sign);
+       if (lod_scalar ||
+-         (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) {
++         (util_get_cpu_caps()->has_avx2 || !util_get_cpu_caps()->has_sse)) {
+          size = LLVMBuildLShr(builder, base_size, level, "minify");
+          size = lp_build_max(bld, size, bld->one);
+       }
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+index 2b91edd37c7..6e47640e70d 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
++++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+@@ -3234,7 +3234,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
+        * as it appears to be a loss with just AVX)
+        */
+       if (num_quads == 1 || !use_aos ||
+-          (util_cpu_caps.has_avx2 &&
++          (util_get_cpu_caps()->has_avx2 &&
+            (bld.num_lods == 1 ||
+             derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
+          if (use_aos) {
+diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+index b1c8b990ef1..03b11f914b4 100644
+--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
++++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+@@ -35,10 +35,10 @@
+ 
+ DEBUG_GET_ONCE_BOOL_OPTION(nosse, "GALLIUM_NOSSE", false);
+ 
+-static struct util_cpu_caps *get_cpu_caps(void)
++static const struct util_cpu_caps_t *get_cpu_caps(void)
+ {
+    util_cpu_detect();
+-   return &util_cpu_caps;
++   return util_get_cpu_caps();
+ }
+ 
+ int rtasm_cpu_has_sse(void)
+diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+index ad687f32853..ddd65fb6a08 100644
+--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
++++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+@@ -2152,17 +2152,17 @@ static void x86_init_func_common( struct x86_function *p )
+ {
+    util_cpu_detect();
+    p->caps = 0;
+-   if(util_cpu_caps.has_mmx)
++   if(util_get_cpu_caps()->has_mmx)
+       p->caps |= X86_MMX;
+-   if(util_cpu_caps.has_mmx2)
++   if(util_get_cpu_caps()->has_mmx2)
+       p->caps |= X86_MMX2;
+-   if(util_cpu_caps.has_sse)
++   if(util_get_cpu_caps()->has_sse)
+       p->caps |= X86_SSE;
+-   if(util_cpu_caps.has_sse2)
++   if(util_get_cpu_caps()->has_sse2)
+       p->caps |= X86_SSE2;
+-   if(util_cpu_caps.has_sse3)
++   if(util_get_cpu_caps()->has_sse3)
+       p->caps |= X86_SSE3;
+-   if(util_cpu_caps.has_sse4_1)
++   if(util_get_cpu_caps()->has_sse4_1)
+       p->caps |= X86_SSE4_1;
+    p->csr = p->store;
+ #if defined(PIPE_ARCH_X86)
+diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
+index 1eaff77724e..bf56993db09 100644
+--- a/src/gallium/auxiliary/util/u_threaded_context.c
++++ b/src/gallium/auxiliary/util/u_threaded_context.c
+@@ -2071,8 +2071,8 @@ tc_set_context_param(struct pipe_context *_pipe,
+    if (param == PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE) {
+       /* Pin the gallium thread as requested. */
+       util_set_thread_affinity(tc->queue.threads[0],
+-                               util_cpu_caps.L3_affinity_mask[value],
+-                               NULL, UTIL_MAX_CPUS);
++                               util_get_cpu_caps()->L3_affinity_mask[value],
++                               NULL, util_get_cpu_caps()->num_cpu_mask_bits);
+ 
+       /* Execute this immediately (without enqueuing).
+        * It's required to be thread-safe.
+@@ -2720,7 +2720,7 @@ threaded_context_create(struct pipe_context *pipe,
+ 
+    util_cpu_detect();
+ 
+-   if (!debug_get_bool_option("GALLIUM_THREAD", util_cpu_caps.nr_cpus > 1))
++   if (!debug_get_bool_option("GALLIUM_THREAD", util_get_cpu_caps()->nr_cpus > 1))
+       return pipe;
+ 
+    tc = os_malloc_aligned(sizeof(struct threaded_context), 16);
+diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+index 64cf72ae101..913c1bd2462 100644
+--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
++++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+@@ -435,7 +435,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
+    assert(type.length <= 16);
+    assert(type.floating);
+ 
+-   if(util_cpu_caps.has_sse && type.length == 4) {
++   if(util_get_cpu_caps()->has_sse && type.length == 4) {
+       const char *movmskintr = "llvm.x86.sse.movmsk.ps";
+       const char *popcntintr = "llvm.ctpop.i32";
+       LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+@@ -446,7 +446,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
+                                        LLVMInt32TypeInContext(context), bits);
+       count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
+    }
+-   else if(util_cpu_caps.has_avx && type.length == 8) {
++   else if(util_get_cpu_caps()->has_avx && type.length == 8) {
+       const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
+       const char *popcntintr = "llvm.ctpop.i32";
+       LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
+index f133bbf8a4d..628a4338c1e 100644
+--- a/src/gallium/drivers/llvmpipe/lp_screen.c
++++ b/src/gallium/drivers/llvmpipe/lp_screen.c
+@@ -915,7 +915,7 @@ llvmpipe_create_screen(struct sw_winsys *winsys)
+ 
+    screen->allow_cl = !!getenv("LP_CL");
+    screen->use_tgsi = (LP_DEBUG & DEBUG_TGSI_IR);
+-   screen->num_threads = util_cpu_caps.nr_cpus > 1 ? util_cpu_caps.nr_cpus : 0;
++   screen->num_threads = util_get_cpu_caps()->nr_cpus > 1 ? util_get_cpu_caps()->nr_cpus : 0;
+ #ifdef EMBEDDED_DEVICE
+    screen->num_threads = 0;
+ #endif
+diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c
+index 873dcf37fac..725854cc25c 100644
+--- a/src/gallium/drivers/llvmpipe/lp_test_arit.c
++++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c
+@@ -382,7 +382,7 @@ flush_denorm_to_zero(float val)
+    fi_val.f = val;
+ 
+ #if defined(PIPE_ARCH_SSE)
+-   if (util_cpu_caps.has_sse) {
++   if (util_get_cpu_caps()->has_sse) {
+       if ((fi_val.ui & 0x7f800000) == 0) {
+          fi_val.ui &= 0xff800000;
+       }
+@@ -458,7 +458,7 @@ test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test, unsigned
+             continue;
+          }
+ 
+-         if (!util_cpu_caps.has_neon &&
++         if (!util_get_cpu_caps()->has_neon &&
+              test->ref == &nearbyintf && length == 2 &&
+              ref != roundf(testval)) {
+             /* FIXME: The generic (non SSE) path in lp_build_iround, which is
+diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
+index 2bf223d66f9..815736166d5 100644
+--- a/src/gallium/drivers/llvmpipe/lp_texture.c
++++ b/src/gallium/drivers/llvmpipe/lp_texture.c
+@@ -85,7 +85,7 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
+     * of a block for all formats) though this should not be strictly necessary
+     * neither. In any case it can only affect compressed or 1d textures.
+     */
+-   unsigned mip_align = MAX2(64, util_cpu_caps.cacheline);
++   unsigned mip_align = MAX2(64, util_get_cpu_caps()->cacheline);
+ 
+    assert(LP_MAX_TEXTURE_2D_LEVELS <= LP_MAX_TEXTURE_LEVELS);
+    assert(LP_MAX_TEXTURE_3D_LEVELS <= LP_MAX_TEXTURE_LEVELS);
+@@ -123,7 +123,7 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
+       if (util_format_is_compressed(pt->format))
+          lpr->row_stride[level] = nblocksx * block_size;
+       else
+-         lpr->row_stride[level] = align(nblocksx * block_size, util_cpu_caps.cacheline);
++         lpr->row_stride[level] = align(nblocksx * block_size, util_get_cpu_caps()->cacheline);
+ 
+       /* if row_stride * height > LP_MAX_TEXTURE_SIZE */
+       if ((uint64_t)lpr->row_stride[level] * nblocksy > LP_MAX_TEXTURE_SIZE) {
+diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp
+index 97db7ca3e8b..d891b6b14e8 100644
+--- a/src/gallium/drivers/swr/swr_loader.cpp
++++ b/src/gallium/drivers/swr/swr_loader.cpp
+@@ -91,7 +91,7 @@ swr_create_screen(struct sw_winsys *winsys)
+ 
+    util_cpu_detect();
+ 
+-   if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512er) {
++   if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512er) {
+       swr_print_info("SWR detected KNL instruction support ");
+ #ifndef HAVE_SWR_KNL
+       swr_print_info("(skipping: not built).\n");
+@@ -103,7 +103,7 @@ swr_create_screen(struct sw_winsys *winsys)
+ #endif
+    }
+ 
+-   if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512bw) {
++   if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512bw) {
+       swr_print_info("SWR detected SKX instruction support ");
+ #ifndef HAVE_SWR_SKX
+       swr_print_info("(skipping not built).\n");
+@@ -113,7 +113,7 @@ swr_create_screen(struct sw_winsys *winsys)
+ #endif
+    }
+ 
+-   if (util_cpu_caps.has_avx2) {
++   if (util_get_cpu_caps()->has_avx2) {
+       swr_print_info("SWR detected AVX2 instruction support ");
+ #ifndef HAVE_SWR_AVX2
+       swr_print_info("(skipping not built).\n");
+@@ -123,7 +123,7 @@ swr_create_screen(struct sw_winsys *winsys)
+ #endif
+    }
+ 
+-   if (util_cpu_caps.has_avx) {
++   if (util_get_cpu_caps()->has_avx) {
+       swr_print_info("SWR detected AVX instruction support ");
+ #ifndef HAVE_SWR_AVX
+       swr_print_info("(skipping not built).\n");
+diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h
+index 66767e7f1f8..5afe32939a8 100644
+--- a/src/gallium/drivers/vc4/vc4_tiling.h
++++ b/src/gallium/drivers/vc4/vc4_tiling.h
+@@ -90,7 +90,7 @@ vc4_load_lt_image(void *dst, uint32_t dst_stride,
+                   int cpp, const struct pipe_box *box)
+ {
+ #ifdef USE_ARM_ASM
+-        if (util_cpu_caps.has_neon) {
++        if (util_get_cpu_caps()->has_neon) {
+                 vc4_load_lt_image_neon(dst, dst_stride, src, src_stride,
+                                        cpp, box);
+                 return;
+@@ -106,7 +106,7 @@ vc4_store_lt_image(void *dst, uint32_t dst_stride,
+                    int cpp, const struct pipe_box *box)
+ {
+ #ifdef USE_ARM_ASM
+-        if (util_cpu_caps.has_neon) {
++        if (util_get_cpu_caps()->has_neon) {
+                 vc4_store_lt_image_neon(dst, dst_stride, src, src_stride,
+                                         cpp, box);
+                 return;
+diff --git a/src/gallium/tests/unit/translate_test.c b/src/gallium/tests/unit/translate_test.c
+index 4d9c4e27ebf..782f16e7f78 100644
+--- a/src/gallium/tests/unit/translate_test.c
++++ b/src/gallium/tests/unit/translate_test.c
+@@ -50,6 +50,7 @@ int main(int argc, char** argv)
+ {
+    struct translate *(*create_fn)(const struct translate_key *key) = 0;
+ 
++   extern struct util_cpu_caps_t util_cpu_caps;
+    struct translate_key key;
+    unsigned output_format;
+    unsigned input_format;
+@@ -87,7 +88,7 @@ int main(int argc, char** argv)
+    }
+    else if (!strcmp(argv[1], "sse"))
+    {
+-      if(!util_cpu_caps.has_sse || !rtasm_cpu_has_sse())
++      if(!util_get_cpu_caps()->has_sse || !rtasm_cpu_has_sse())
+       {
+          printf("Error: CPU doesn't support SSE (test with qemu)\n");
+          return 2;
+@@ -99,7 +100,7 @@ int main(int argc, char** argv)
+    }
+    else if (!strcmp(argv[1], "sse2"))
+    {
+-      if(!util_cpu_caps.has_sse2 || !rtasm_cpu_has_sse())
++      if(!util_get_cpu_caps()->has_sse2 || !rtasm_cpu_has_sse())
+       {
+          printf("Error: CPU doesn't support SSE2 (test with qemu)\n");
+          return 2;
+@@ -110,7 +111,7 @@ int main(int argc, char** argv)
+    }
+    else if (!strcmp(argv[1], "sse3"))
+    {
+-      if(!util_cpu_caps.has_sse3 || !rtasm_cpu_has_sse())
++      if(!util_get_cpu_caps()->has_sse3 || !rtasm_cpu_has_sse())
+       {
+          printf("Error: CPU doesn't support SSE3 (test with qemu)\n");
+          return 2;
+@@ -120,7 +121,7 @@ int main(int argc, char** argv)
+    }
+    else if (!strcmp(argv[1], "sse4.1"))
+    {
+-      if(!util_cpu_caps.has_sse4_1 || !rtasm_cpu_has_sse())
++      if(!util_get_cpu_caps()->has_sse4_1 || !rtasm_cpu_has_sse())
+       {
+          printf("Error: CPU doesn't support SSE4.1 (test with qemu)\n");
+          return 2;
+diff --git a/src/gallium/tests/unit/u_half_test.c b/src/gallium/tests/unit/u_half_test.c
+index 7f2eba9382b..4474cfb82b0 100644
+--- a/src/gallium/tests/unit/u_half_test.c
++++ b/src/gallium/tests/unit/u_half_test.c
+@@ -36,13 +36,14 @@ test(void)
+ int
+ main(int argc, char **argv)
+ {
+-   assert(!util_cpu_caps.has_f16c);
++   util_cpu_detect();
+    test();
+ 
+-   /* Test f16c. */
+-   util_cpu_detect();
+-   if (util_cpu_caps.has_f16c)
++   /* Test non-f16c. */
++   if (util_get_cpu_caps()->has_f16c) {
++      ((struct util_cpu_caps_t *)util_get_cpu_caps())->has_f16c = false;
+       test();
++   }
+ 
+    printf("Success!\n");
+    return 0;
+diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+index 8a0aedfed64..a18362ce6ea 100644
+--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
++++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+@@ -312,8 +312,8 @@ static void amdgpu_pin_threads_to_L3_cache(struct radeon_winsys *rws,
+    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
+ 
+    util_set_thread_affinity(ws->cs_queue.threads[0],
+-                            util_cpu_caps.L3_affinity_mask[cache],
+-                            NULL, UTIL_MAX_CPUS);
++                            util_get_cpu_caps()->L3_affinity_mask[cache],
++                            NULL, util_get_cpu_caps()->num_cpu_mask_bits);
+ }
+ 
+ static uint32_t kms_handle_hash(const void *key)
+diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+index f0e1b9f7df3..4430ce50466 100644
+--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
++++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+@@ -801,8 +801,8 @@ static void radeon_pin_threads_to_L3_cache(struct radeon_winsys *ws,
+ 
+    if (util_queue_is_initialized(&rws->cs_queue)) {
+       util_set_thread_affinity(rws->cs_queue.threads[0],
+-                               util_cpu_caps.L3_affinity_mask[cache],
+-                               NULL, UTIL_MAX_CPUS);
++                               util_get_cpu_caps()->L3_affinity_mask[cache],
++                               NULL, util_get_cpu_caps()->num_cpu_mask_bits);
+    }
+ }
+ 
+diff --git a/src/mesa/main/glthread.c b/src/mesa/main/glthread.c
+index eb8eb30cabc..c9dfef541fc 100644
+--- a/src/mesa/main/glthread.c
++++ b/src/mesa/main/glthread.c
+@@ -199,19 +199,20 @@ _mesa_glthread_flush_batch(struct gl_context *ctx)
+    /* Pin threads regularly to the same Zen CCX that the main thread is
+     * running on. The main thread can move between CCXs.
+     */
+-   if (util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 &&
++   if (util_get_cpu_caps()->nr_cpus != util_get_cpu_caps()->cores_per_L3 &&
+        /* driver support */
+        ctx->Driver.PinDriverToL3Cache &&
+        ++glthread->pin_thread_counter % 128 == 0) {
+       int cpu = util_get_current_cpu();
+ 
+       if (cpu >= 0) {
+-         unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu];
+-
+-         util_set_thread_affinity(glthread->queue.threads[0],
+-                                  util_cpu_caps.L3_affinity_mask[L3_cache],
+-                                  NULL, UTIL_MAX_CPUS);
+-         ctx->Driver.PinDriverToL3Cache(ctx, L3_cache);
++         uint16_t L3_cache = util_get_cpu_caps()->cpu_to_L3[cpu];
++         if (L3_cache != U_CPU_INVALID_L3) {
++            util_set_thread_affinity(glthread->queue.threads[0],
++                                     util_get_cpu_caps()->L3_affinity_mask[L3_cache],
++                                     NULL, util_get_cpu_caps()->num_cpu_mask_bits);
++            ctx->Driver.PinDriverToL3Cache(ctx, L3_cache);
++         }
+       }
+    }
+ 
+diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
+index 40364296664..f27fa7ff29c 100644
+--- a/src/mesa/state_tracker/st_context.c
++++ b/src/mesa/state_tracker/st_context.c
+@@ -815,6 +815,10 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
+          !st->lower_ucp;
+    st->shader_has_one_variant[MESA_SHADER_COMPUTE] = st->has_shareable_shaders;
+ 
++   if (util_get_cpu_caps()->cores_per_L3 == util_get_cpu_caps()->nr_cpus ||
++       !st->pipe->set_context_param)
++      st->pin_thread_counter = ST_L3_PINNING_DISABLED;
++
+    st->bitmap.cache.empty = true;
+ 
+    if (ctx->Const.ForceGLNamesReuse && ctx->Shared->RefCount == 1) {
+diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
+index b1fda06ff3e..9ab6969de62 100644
+--- a/src/mesa/state_tracker/st_context.h
++++ b/src/mesa/state_tracker/st_context.h
+@@ -55,6 +55,7 @@ struct st_program;
+ struct st_perf_monitor_group;
+ struct u_upload_mgr;
+ 
++#define ST_L3_PINNING_DISABLED 0xffffffff
+ 
+ struct st_bitmap_cache
+ {
+@@ -130,6 +131,9 @@ struct st_context
+    struct draw_stage *feedback_stage;  /**< For GL_FEEDBACK rendermode */
+    struct draw_stage *selection_stage;  /**< For GL_SELECT rendermode */
+    struct draw_stage *rastpos_stage;  /**< For glRasterPos */
++
++   unsigned pin_thread_counter; /* for L3 thread pinning on AMD Zen */
++
+    GLboolean clamp_frag_color_in_shader;
+    GLboolean clamp_vert_color_in_shader;
+    boolean clamp_frag_depth_in_shader;
+@@ -235,8 +239,6 @@ struct st_context
+    /** This masks out unused shader resources. Only valid in draw calls. */
+    uint64_t active_states;
+ 
+-   unsigned pin_thread_counter; /* for L3 thread pinning on AMD Zen */
+-
+    /* If true, further analysis of states is required to know if something
+     * has changed. Used mainly for shaders.
+     */
+diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
+index 996d985510c..159d7017b07 100644
+--- a/src/mesa/state_tracker/st_draw.c
++++ b/src/mesa/state_tracker/st_draw.c
+@@ -124,26 +124,26 @@ prepare_draw(struct st_context *st, struct gl_context *ctx)
+       st_validate_state(st, ST_PIPELINE_RENDER);
+    }
+ 
+-   struct pipe_context *pipe = st->pipe;
+-
+    /* Pin threads regularly to the same Zen CCX that the main thread is
+     * running on. The main thread can move between CCXs.
+     */
+-   if (unlikely(/* AMD Zen */
+-                util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 &&
++   if (unlikely(st->pin_thread_counter != ST_L3_PINNING_DISABLED &&
+                 /* no glthread */
+                 ctx->CurrentClientDispatch != ctx->MarshalExec &&
+-                /* driver support */
+-                pipe->set_context_param &&
+                 /* do it occasionally */
+                 ++st->pin_thread_counter % 512 == 0)) {
++      st->pin_thread_counter = 0;
++
+       int cpu = util_get_current_cpu();
+       if (cpu >= 0) {
+-         unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu];
+-
+-         pipe->set_context_param(pipe,
+-                                 PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
+-                                 L3_cache);
++         struct pipe_context *pipe = st->pipe;
++         uint16_t L3_cache = util_get_cpu_caps()->cpu_to_L3[cpu];
++
++         if (L3_cache != U_CPU_INVALID_L3) {
++            pipe->set_context_param(pipe,
++                                    PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE,
++                                    L3_cache);
++         }
+       }
+    }
+ }
+diff --git a/src/util/half_float.h b/src/util/half_float.h
+index c52bccf8d1e..8f1a1dbf11d 100644
+--- a/src/util/half_float.h
++++ b/src/util/half_float.h
+@@ -59,7 +59,7 @@ static inline uint16_t
+ _mesa_float_to_half(float val)
+ {
+ #if defined(USE_X86_64_ASM)
+-   if (util_cpu_caps.has_f16c) {
++   if (util_get_cpu_caps()->has_f16c) {
+       __m128 in = {val};
+       __m128i out;
+ 
+@@ -75,7 +75,7 @@ static inline float
+ _mesa_half_to_float(uint16_t val)
+ {
+ #if defined(USE_X86_64_ASM)
+-   if (util_cpu_caps.has_f16c) {
++   if (util_get_cpu_caps()->has_f16c) {
+       __m128i in = {val};
+       __m128 out;
+ 
+@@ -90,7 +90,7 @@ static inline uint16_t
+ _mesa_float_to_float16_rtz(float val)
+ {
+ #if defined(USE_X86_64_ASM)
+-   if (util_cpu_caps.has_f16c) {
++   if (util_get_cpu_caps()->has_f16c) {
+       __m128 in = {val};
+       __m128i out;
+ 
+diff --git a/src/util/tests/format/u_format_test.c b/src/util/tests/format/u_format_test.c
+index f4a62a5c6a8..e6473c2bf6d 100644
+--- a/src/util/tests/format/u_format_test.c
++++ b/src/util/tests/format/u_format_test.c
+@@ -850,6 +850,8 @@ int main(int argc, char **argv)
+ {
+    boolean success;
+ 
++   util_cpu_detect();
++
+    success = test_all();
+ 
+    return success ? 0 : 1;
+diff --git a/src/util/u_cpu_detect.c b/src/util/u_cpu_detect.c
+index 025f2f30156..4a4b06e1bc6 100644
+--- a/src/util/u_cpu_detect.c
++++ b/src/util/u_cpu_detect.c
+@@ -90,7 +90,7 @@
+ DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)
+ 
+ 
+-struct util_cpu_caps util_cpu_caps;
++struct util_cpu_caps_t util_cpu_caps;
+ 
+ #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+ static int has_cpuid(void);
+@@ -438,26 +438,22 @@ get_cpu_topology(void)
+    util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus;
+    util_cpu_caps.num_L3_caches = 1;
+ 
++   memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3));
++
+ #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+    /* AMD Zen */
+    if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 &&
+        util_cpu_caps.family < CPU_AMD_LAST) {
+       uint32_t regs[4];
+ 
+-      /* Query the L3 cache count. */
+-      cpuid_count(0x8000001D, 3, regs);
+-      unsigned cache_level = (regs[0] >> 5) & 0x7;
+-      unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
+-
+-      if (cache_level != 3 || cores_per_L3 == util_cpu_caps.nr_cpus)
+-         return;
+-
+       uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
+       uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
+-      uint32_t allowed_mask[UTIL_MAX_CPUS / 32] = {0};
+-      uint32_t apic_id[UTIL_MAX_CPUS];
+       bool saved = false;
+ 
++      uint32_t L3_found[UTIL_MAX_CPUS] = {0};
++      uint32_t num_L3_caches = 0;
++      util_affinity_mask *L3_affinity_masks = NULL;
++
+       /* Query APIC IDs from each CPU core.
+        *
+        * An APIC ID is a logical ID of the CPU with respect to the cache
+@@ -482,41 +478,60 @@ get_cpu_topology(void)
+ 
+          if (util_set_current_thread_affinity(mask,
+                                               !saved ? saved_mask : NULL,
+-                                              UTIL_MAX_CPUS)) {
++                                              util_cpu_caps.num_cpu_mask_bits)) {
+             saved = true;
+-            allowed_mask[i / 32] |= cpu_bit;
+ 
+             /* Query the APIC ID of the current core. */
+             cpuid(0x00000001, regs);
+-            apic_id[i] = regs[1] >> 24;
++            unsigned apic_id = regs[1] >> 24;
++
++            /* Query the total core count for the CPU */
++            uint32_t core_count = 1;
++            if (regs[3] & (1 << 28))
++               core_count = (regs[1] >> 16) & 0xff;
++
++            core_count = util_next_power_of_two(core_count);
++
++            /* Query the L3 cache count. */
++            cpuid_count(0x8000001D, 3, regs);
++            unsigned cache_level = (regs[0] >> 5) & 0x7;
++            unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
++
++            if (cache_level != 3)
++               continue;
++
++            unsigned local_core_id = apic_id & (core_count - 1);
++            unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count);
++            unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3);
++#define L3_ID(p, i) ((p) << 16 | (i) << 1 | 1) /* bit 0 is always set */
++
++            unsigned l3_id = L3_ID(phys_id, local_l3_cache_index);
++            int idx = -1;
++            for (unsigned c = 0; c < num_L3_caches; c++) {
++               if (L3_found[c] == l3_id) {
++                  idx = c;
++                  break;
++               }
++            }
++            if (idx == -1) {
++               idx = num_L3_caches;
++               L3_found[num_L3_caches++] = l3_id;
++               L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches);
++               if (!L3_affinity_masks)
++                  return;
++               memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask));
++            }
++            util_cpu_caps.cpu_to_L3[i] = idx;
++            L3_affinity_masks[idx][i / 32] |= cpu_bit;
++
+          }
+          mask[i / 32] = 0;
+       }
+ 
+-      if (saved) {
+-
+-         /* We succeeded in using at least one CPU. */
+-         util_cpu_caps.num_L3_caches = util_cpu_caps.nr_cpus / cores_per_L3;
+-         util_cpu_caps.cores_per_L3 = cores_per_L3;
+-         util_cpu_caps.L3_affinity_mask = calloc(sizeof(util_affinity_mask),
+-                                                 util_cpu_caps.num_L3_caches);
+-
+-         for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS;
+-              i++) {
+-            uint32_t cpu_bit = 1u << (i % 32);
+-
+-            if (allowed_mask[i / 32] & cpu_bit) {
+-               /* Each APIC ID bit represents a topology level, so we need
+-                * to round up to the next power of two.
+-                */
+-               unsigned L3_index = apic_id[i] /
+-                                   util_next_power_of_two(cores_per_L3);
+-
+-               util_cpu_caps.L3_affinity_mask[L3_index][i / 32] |= cpu_bit;
+-               util_cpu_caps.cpu_to_L3[i] = L3_index;
+-            }
+-         }
++      util_cpu_caps.num_L3_caches = num_L3_caches;
++      util_cpu_caps.L3_affinity_mask = L3_affinity_masks;
+ 
++      if (saved) {
+          if (debug_get_option_dump_cpu()) {
+             fprintf(stderr, "CPU <-> L3 cache mapping:\n");
+             for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
+@@ -528,7 +543,8 @@ get_cpu_topology(void)
+          }
+ 
+          /* Restore the original affinity mask. */
+-         util_set_current_thread_affinity(saved_mask, NULL, UTIL_MAX_CPUS);
++         util_set_current_thread_affinity(saved_mask, NULL,
++                                          util_cpu_caps.num_cpu_mask_bits);
+       } else {
+          if (debug_get_option_dump_cpu())
+             fprintf(stderr, "Cannot set thread affinity for any thread.\n");
+@@ -547,7 +563,7 @@ util_cpu_detect_once(void)
+    {
+       SYSTEM_INFO system_info;
+       GetSystemInfo(&system_info);
+-      util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors;
++      util_cpu_caps.nr_cpus = MAX2(1, system_info.dwNumberOfProcessors);
+    }
+ #elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN)
+    util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+@@ -569,6 +585,8 @@ util_cpu_detect_once(void)
+    util_cpu_caps.nr_cpus = 1;
+ #endif
+ 
++   util_cpu_caps.num_cpu_mask_bits = align(util_cpu_caps.nr_cpus, 32);
++
+    /* Make the fallback cacheline size nonzero so that it can be
+     * safely passed to align().
+     */
+diff --git a/src/util/u_cpu_detect.h b/src/util/u_cpu_detect.h
+index a76fd912910..1c7239b2ec7 100644
+--- a/src/util/u_cpu_detect.h
++++ b/src/util/u_cpu_detect.h
+@@ -55,7 +55,7 @@ enum cpu_family {
+ 
+ typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32];
+ 
+-struct util_cpu_caps {
++struct util_cpu_caps_t {
+    int nr_cpus;
+    enum cpu_family family;
+ 
+@@ -98,14 +98,27 @@ struct util_cpu_caps {
+ 
+    unsigned num_L3_caches;
+    unsigned cores_per_L3;
++   unsigned num_cpu_mask_bits;
+ 
+    uint16_t cpu_to_L3[UTIL_MAX_CPUS];
+    /* Affinity masks for each L3 cache. */
+    util_affinity_mask *L3_affinity_mask;
+ };
+ 
+-extern struct util_cpu_caps
+-util_cpu_caps;
++#define U_CPU_INVALID_L3 0xffff
++
++static inline const struct util_cpu_caps_t *
++util_get_cpu_caps(void)
++{
++	extern struct util_cpu_caps_t util_cpu_caps;
++
++	/* If you hit this assert, it means that something is using the
++	 * cpu-caps without having first called util_cpu_detect()
++	 */
++	assert(util_cpu_caps.nr_cpus >= 1);
++
++	return &util_cpu_caps;
++}
+ 
+ void util_cpu_detect(void);
+ 
+diff --git a/src/util/u_math.c b/src/util/u_math.c
+index 9a8a9ecbbde..41e7f599eb0 100644
+--- a/src/util/u_math.c
++++ b/src/util/u_math.c
+@@ -92,7 +92,7 @@ util_fpstate_get(void)
+    unsigned mxcsr = 0;
+ 
+ #if defined(PIPE_ARCH_SSE)
+-   if (util_cpu_caps.has_sse) {
++   if (util_get_cpu_caps()->has_sse) {
+       mxcsr = _mm_getcsr();
+    }
+ #endif
+@@ -110,10 +110,10 @@ unsigned
+ util_fpstate_set_denorms_to_zero(unsigned current_mxcsr)
+ {
+ #if defined(PIPE_ARCH_SSE)
+-   if (util_cpu_caps.has_sse) {
++   if (util_get_cpu_caps()->has_sse) {
+       /* Enable flush to zero mode */
+       current_mxcsr |= _MM_FLUSH_ZERO_MASK;
+-      if (util_cpu_caps.has_daz) {
++      if (util_get_cpu_caps()->has_daz) {
+          /* Enable denormals are zero mode */
+          current_mxcsr |= _MM_DENORMALS_ZERO_MASK;
+       }
+@@ -132,7 +132,7 @@ void
+ util_fpstate_set(unsigned mxcsr)
+ {
+ #if defined(PIPE_ARCH_SSE)
+-   if (util_cpu_caps.has_sse) {
++   if (util_get_cpu_caps()->has_sse) {
+       _mm_setcsr(mxcsr);
+    }
+ #endif
+diff --git a/src/util/u_queue.c b/src/util/u_queue.c
+index b11b297a45c..8f21f0667c6 100644
+--- a/src/util/u_queue.c
++++ b/src/util/u_queue.c
+@@ -27,7 +27,7 @@
+ #include "u_queue.h"
+ 
+ #include "c11/threads.h"
+-
++#include "util/u_cpu_detect.h"
+ #include "util/os_time.h"
+ #include "util/u_string.h"
+ #include "util/u_thread.h"
+@@ -258,7 +258,8 @@ util_queue_thread_func(void *input)
+       uint32_t mask[UTIL_MAX_CPUS / 32];
+ 
+       memset(mask, 0xff, sizeof(mask));
+-      util_set_current_thread_affinity(mask, NULL, UTIL_MAX_CPUS);
++      util_set_current_thread_affinity(mask, NULL,
++                                       util_get_cpu_caps()->num_cpu_mask_bits);
+    }
+ 
+ #if defined(__linux__)
diff --git a/SOURCES/lavapipe-disable-env-var.patch b/SOURCES/lavapipe-disable-env-var.patch
new file mode 100644
index 0000000..9b59577
--- /dev/null
+++ b/SOURCES/lavapipe-disable-env-var.patch
@@ -0,0 +1,13 @@
+diff -up mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c.dma mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c
+--- mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c.dma	2020-11-19 15:11:42.483134826 +1000
++++ mesa-20.3.0-rc1/src/gallium/frontends/lavapipe/lvp_device.c	2020-11-19 15:13:08.556425782 +1000
+@@ -118,6 +118,9 @@ VkResult lvp_CreateInstance(
+       client_version = VK_API_VERSION_1_0;
+    }
+ 
++   if (!getenv("RH_SW_VULKAN"))
++      return VK_ERROR_INITIALIZATION_FAILED;
++
+    instance = vk_zalloc2(&default_alloc, pAllocator, sizeof(*instance), 8,
+                          VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+    if (!instance)
diff --git a/SOURCES/mesa-20.3.3-stable-fixes.patch b/SOURCES/mesa-20.3.3-stable-fixes.patch
new file mode 100644
index 0000000..231e20b
--- /dev/null
+++ b/SOURCES/mesa-20.3.3-stable-fixes.patch
@@ -0,0 +1,930 @@
+diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
+index d49bc0f0564..90512d4f276 100644
+--- a/src/amd/vulkan/radv_query.c
++++ b/src/amd/vulkan/radv_query.c
+@@ -1679,13 +1679,14 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
+ 
+ 			va += 8 * idx;
+ 
+-			si_cs_emit_write_event_eop(cs,
+-						   cmd_buffer->device->physical_device->rad_info.chip_class,
+-						   radv_cmd_buffer_uses_mec(cmd_buffer),
+-						   V_028A90_PS_DONE, 0,
+-						   EOP_DST_SEL_TC_L2,
+-						   EOP_DATA_SEL_GDS,
+-						   va, EOP_DATA_GDS(0, 1), 0);
++			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
++			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) |
++					COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
++					COPY_DATA_WR_CONFIRM);
++			radeon_emit(cs, 0);
++			radeon_emit(cs, 0);
++			radeon_emit(cs, va);
++			radeon_emit(cs, va >> 32);
+ 
+ 			/* Record that the command buffer needs GDS. */
+ 			cmd_buffer->gds_needed = true;
+@@ -1769,13 +1770,14 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
+ 
+ 			va += 8 * idx;
+ 
+-			si_cs_emit_write_event_eop(cs,
+-						   cmd_buffer->device->physical_device->rad_info.chip_class,
+-						   radv_cmd_buffer_uses_mec(cmd_buffer),
+-						   V_028A90_PS_DONE, 0,
+-						   EOP_DST_SEL_TC_L2,
+-						   EOP_DATA_SEL_GDS,
+-						   va, EOP_DATA_GDS(0, 1), 0);
++			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
++			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) |
++					COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
++					COPY_DATA_WR_CONFIRM);
++			radeon_emit(cs, 0);
++			radeon_emit(cs, 0);
++			radeon_emit(cs, va);
++			radeon_emit(cs, va >> 32);
+ 
+ 			cmd_buffer->state.active_pipeline_gds_queries--;
+ 		}
+diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
+index 9d9491d4361..2eb3ba4e64e 100644
+--- a/src/amd/vulkan/radv_shader.h
++++ b/src/amd/vulkan/radv_shader.h
+@@ -573,9 +573,11 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices,
+ 	if (chip_class >= GFX7 && family != CHIP_STONEY)
+ 		hardware_lds_size = 65536;
+ 
+-	num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
++	if (input_patch_size + output_patch_size)
++		num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size));
+ 	/* Make sure the output data fits in the offchip buffer */
+-	num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / output_patch_size);
++	if (output_patch_size)
++		num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / output_patch_size);
+ 	/* Not necessary for correctness, but improves performance. The
+ 	 * specific value is taken from the proprietary driver.
+ 	 */
+diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
+index 1eef6aac70c..a6a663d97a6 100644
+--- a/src/gallium/auxiliary/cso_cache/cso_context.c
++++ b/src/gallium/auxiliary/cso_cache/cso_context.c
+@@ -402,10 +402,13 @@ void cso_destroy_context( struct cso_context *ctx )
+                                                 PIPE_SHADER_CAP_MAX_SHADER_BUFFERS);
+             int maxcb = scr->get_shader_param(scr, sh,
+                                               PIPE_SHADER_CAP_MAX_CONST_BUFFERS);
++            int maximg = scr->get_shader_param(scr, sh,
++                                              PIPE_SHADER_CAP_MAX_SHADER_IMAGES);
+             assert(maxsam <= PIPE_MAX_SAMPLERS);
+             assert(maxview <= PIPE_MAX_SHADER_SAMPLER_VIEWS);
+             assert(maxssbo <= PIPE_MAX_SHADER_BUFFERS);
+             assert(maxcb <= PIPE_MAX_CONSTANT_BUFFERS);
++            assert(maximg <= PIPE_MAX_SHADER_IMAGES);
+             if (maxsam > 0) {
+                ctx->pipe->bind_sampler_states(ctx->pipe, sh, 0, maxsam, zeros);
+             }
+@@ -415,6 +418,9 @@ void cso_destroy_context( struct cso_context *ctx )
+             if (maxssbo > 0) {
+                ctx->pipe->set_shader_buffers(ctx->pipe, sh, 0, maxssbo, ssbos, 0);
+             }
++            if (maximg > 0) {
++               ctx->pipe->set_shader_images(ctx->pipe, sh, 0, maximg, NULL);
++            }
+             for (int i = 0; i < maxcb; i++) {
+                ctx->pipe->set_constant_buffer(ctx->pipe, sh, i, NULL);
+             }
+diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c
+index 8157e921850..971fc80b5ac 100644
+--- a/src/gallium/drivers/iris/iris_program.c
++++ b/src/gallium/drivers/iris/iris_program.c
+@@ -2109,8 +2109,8 @@ iris_get_scratch_space(struct iris_context *ice,
+     * in the base configuration.
+     */
+    unsigned subslice_total = screen->subslice_total;
+-   if (devinfo->gen >= 12)
+-      subslice_total = devinfo->num_subslices[0];
++   if (devinfo->gen == 12)
++      subslice_total = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2);
+    else if (devinfo->gen == 11)
+       subslice_total = 8;
+    else if (devinfo->gen < 11)
+diff --git a/src/gallium/drivers/iris/iris_resolve.c b/src/gallium/drivers/iris/iris_resolve.c
+index 276ad62b1dd..045f43ed8c0 100644
+--- a/src/gallium/drivers/iris/iris_resolve.c
++++ b/src/gallium/drivers/iris/iris_resolve.c
+@@ -793,7 +793,9 @@ iris_resource_set_aux_state(struct iris_context *ice,
+       if (res->aux.state[level][start_layer + a] != aux_state) {
+          res->aux.state[level][start_layer + a] = aux_state;
+          /* XXX: Need to track which bindings to make dirty */
+-         ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER;
++         ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER |
++                             IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES |
++                             IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES;
+          ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_BINDINGS;
+       }
+    }
+diff --git a/src/gallium/drivers/iris/iris_resource.c b/src/gallium/drivers/iris/iris_resource.c
+index 8747ef4aa8a..3b34e32cd21 100644
+--- a/src/gallium/drivers/iris/iris_resource.c
++++ b/src/gallium/drivers/iris/iris_resource.c
+@@ -1125,6 +1125,20 @@ iris_flush_resource(struct pipe_context *ctx, struct pipe_resource *resource)
+                                 0, INTEL_REMAINING_LAYERS,
+                                 mod ? mod->aux_usage : ISL_AUX_USAGE_NONE,
+                                 mod ? mod->supports_clear_color : false);
++
++   if (!res->mod_info && res->aux.usage != ISL_AUX_USAGE_NONE) {
++      /* flush_resource may be used to prepare an image for sharing external
++       * to the driver (e.g. via eglCreateImage). To account for this, make
++       * sure to get rid of any compression that a consumer wouldn't know how
++       * to handle.
++       */
++      for (int i = 0; i < IRIS_BATCH_COUNT; i++) {
++         if (iris_batch_references(&ice->batches[i], res->bo))
++            iris_batch_flush(&ice->batches[i]);
++      }
++
++      iris_resource_disable_aux(res);
++   }
+ }
+ 
+ static void
+diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
+index 59a63f7bbab..b9ddb863a16 100644
+--- a/src/gallium/drivers/iris/iris_state.c
++++ b/src/gallium/drivers/iris/iris_state.c
+@@ -1666,6 +1666,8 @@ struct iris_rasterizer_state {
+    bool multisample;
+    bool force_persample_interp;
+    bool conservative_rasterization;
++   bool fill_mode_point;
++   bool fill_mode_line;
+    bool fill_mode_point_or_line;
+    enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */
+    uint16_t sprite_coord_enable;
+@@ -1729,11 +1731,15 @@ iris_create_rasterizer_state(struct pipe_context *ctx,
+    cso->conservative_rasterization =
+       state->conservative_raster_mode == PIPE_CONSERVATIVE_RASTER_POST_SNAP;
+ 
+-   cso->fill_mode_point_or_line =
+-      state->fill_front == PIPE_POLYGON_MODE_LINE ||
++   cso->fill_mode_point =
+       state->fill_front == PIPE_POLYGON_MODE_POINT ||
+-      state->fill_back == PIPE_POLYGON_MODE_LINE ||
+       state->fill_back == PIPE_POLYGON_MODE_POINT;
++   cso->fill_mode_line =
++      state->fill_front == PIPE_POLYGON_MODE_LINE ||
++      state->fill_back == PIPE_POLYGON_MODE_LINE;
++   cso->fill_mode_point_or_line =
++      cso->fill_mode_point ||
++      cso->fill_mode_line;
+ 
+    if (state->clip_plane_enable != 0)
+       cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
+@@ -4059,6 +4065,28 @@ iris_emit_sbe_swiz(struct iris_batch *batch,
+    }
+ }
+ 
++static bool
++iris_is_drawing_points(const struct iris_context *ice)
++{
++   const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
++
++   if (cso_rast->fill_mode_point) {
++      return true;
++   }
++
++   if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
++      const struct brw_gs_prog_data *gs_prog_data =
++         (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
++      return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
++   } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
++      const struct brw_tes_prog_data *tes_data =
++         (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
++      return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
++   } else {
++      return ice->state.prim_mode == PIPE_PRIM_POINTS;
++   }
++}
++
+ static unsigned
+ iris_calculate_point_sprite_overrides(const struct brw_wm_prog_data *prog_data,
+                                       const struct iris_rasterizer_state *cso)
+@@ -4093,7 +4121,8 @@ iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice)
+                                       &urb_read_offset, &urb_read_length);
+ 
+    unsigned sprite_coord_overrides =
+-      iris_calculate_point_sprite_overrides(wm_prog_data, cso_rast);
++      iris_is_drawing_points(ice) ?
++      iris_calculate_point_sprite_overrides(wm_prog_data, cso_rast) : 0;
+ 
+    iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
+       sbe.AttributeSwizzleEnable = true;
+diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
+index 8f688fa3650..ef35f86b05f 100644
+--- a/src/gallium/drivers/radeonsi/si_descriptors.c
++++ b/src/gallium/drivers/radeonsi/si_descriptors.c
+@@ -1482,11 +1482,12 @@ void si_update_needs_color_decompress_masks(struct si_context *sctx)
+ /* Reset descriptors of buffer resources after \p buf has been invalidated.
+  * If buf == NULL, reset all descriptors.
+  */
+-static void si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_resources *buffers,
++static bool si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_resources *buffers,
+                                       unsigned descriptors_idx, uint64_t slot_mask,
+                                       struct pipe_resource *buf, enum radeon_bo_priority priority)
+ {
+    struct si_descriptors *descs = &sctx->descriptors[descriptors_idx];
++   bool noop = true;
+    uint64_t mask = buffers->enabled_mask & slot_mask;
+ 
+    while (mask) {
+@@ -1501,8 +1502,10 @@ static void si_reset_buffer_resources(struct si_context *sctx, struct si_buffer_
+             sctx, si_resource(buffer),
+             buffers->writable_mask & (1llu << i) ? RADEON_USAGE_READWRITE : RADEON_USAGE_READ,
+             priority, true);
++         noop = false;
+       }
+    }
++   return !noop;
+ }
+ 
+ /* Update all buffer bindings where the buffer is bound, including
+@@ -1577,11 +1580,15 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
+    }
+ 
+    if (!buffer || buffer->bind_history & PIPE_BIND_SHADER_BUFFER) {
+-      for (shader = 0; shader < SI_NUM_SHADERS; shader++)
+-         si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
+-                                   si_const_and_shader_buffer_descriptors_idx(shader),
+-                                   u_bit_consecutive64(0, SI_NUM_SHADER_BUFFERS), buf,
+-                                   sctx->const_and_shader_buffers[shader].priority);
++      for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
++         if (si_reset_buffer_resources(sctx, &sctx->const_and_shader_buffers[shader],
++                                       si_const_and_shader_buffer_descriptors_idx(shader),
++                                       u_bit_consecutive64(0, SI_NUM_SHADER_BUFFERS), buf,
++                                       sctx->const_and_shader_buffers[shader].priority) &&
++             shader == PIPE_SHADER_COMPUTE) {
++            sctx->compute_shaderbuf_sgprs_dirty = true;
++         }
++      }
+    }
+ 
+    if (!buffer || buffer->bind_history & PIPE_BIND_SAMPLER_VIEW) {
+@@ -1633,6 +1640,9 @@ void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf)
+                radeon_add_to_gfx_buffer_list_check_mem(sctx, si_resource(buffer),
+                                                        RADEON_USAGE_READWRITE,
+                                                        RADEON_PRIO_SAMPLER_BUFFER, true);
++
++               if (shader == PIPE_SHADER_COMPUTE)
++                  sctx->compute_image_sgprs_dirty = true;
+             }
+          }
+       }
+diff --git a/src/gallium/frontends/dri/dri_helpers.c b/src/gallium/frontends/dri/dri_helpers.c
+index 01a1fb3d96c..5e87df35a55 100644
+--- a/src/gallium/frontends/dri/dri_helpers.c
++++ b/src/gallium/frontends/dri/dri_helpers.c
+@@ -258,7 +258,9 @@ dri2_create_image_from_renderbuffer2(__DRIcontext *context,
+ 				     int renderbuffer, void *loaderPrivate,
+                                      unsigned *error)
+ {
+-   struct gl_context *ctx = ((struct st_context *)dri_context(context)->st)->ctx;
++   struct st_context *st_ctx = (struct st_context *)dri_context(context)->st;
++   struct gl_context *ctx = st_ctx->ctx;
++   struct pipe_context *p_ctx = st_ctx->pipe;
+    struct gl_renderbuffer *rb;
+    struct pipe_resource *tex;
+    __DRIimage *img;
+@@ -299,6 +301,13 @@ dri2_create_image_from_renderbuffer2(__DRIcontext *context,
+ 
+    pipe_resource_reference(&img->texture, tex);
+ 
++   /* If the resource supports EGL_MESA_image_dma_buf_export, make sure that
++    * it's in a shareable state. Do this now while we still have the access to
++    * the context.
++    */
++   if (dri2_get_mapping_by_format(img->dri_format))
++      p_ctx->flush_resource(p_ctx, tex);
++
+    *error = __DRI_IMAGE_ERROR_SUCCESS;
+    return img;
+ }
+@@ -326,7 +335,9 @@ dri2_create_from_texture(__DRIcontext *context, int target, unsigned texture,
+                          void *loaderPrivate)
+ {
+    __DRIimage *img;
+-   struct gl_context *ctx = ((struct st_context *)dri_context(context)->st)->ctx;
++   struct st_context *st_ctx = (struct st_context *)dri_context(context)->st;
++   struct gl_context *ctx = st_ctx->ctx;
++   struct pipe_context *p_ctx = st_ctx->pipe;
+    struct gl_texture_object *obj;
+    struct pipe_resource *tex;
+    GLuint face = 0;
+@@ -376,6 +387,13 @@ dri2_create_from_texture(__DRIcontext *context, int target, unsigned texture,
+ 
+    pipe_resource_reference(&img->texture, tex);
+ 
++   /* If the resource supports EGL_MESA_image_dma_buf_export, make sure that
++    * it's in a shareable state. Do this now while we still have the access to
++    * the context.
++    */
++   if (dri2_get_mapping_by_format(img->dri_format))
++      p_ctx->flush_resource(p_ctx, tex);
++
+    *error = __DRI_IMAGE_ERROR_SUCCESS;
+    return img;
+ }
+@@ -547,6 +565,9 @@ dri2_get_mapping_by_fourcc(int fourcc)
+ const struct dri2_format_mapping *
+ dri2_get_mapping_by_format(int format)
+ {
++   if (format == __DRI_IMAGE_FORMAT_NONE)
++      return NULL;
++
+    for (unsigned i = 0; i < ARRAY_SIZE(dri2_format_table); i++) {
+       if (dri2_format_table[i].dri_format == format)
+          return &dri2_format_table[i];
+diff --git a/src/gallium/frontends/lavapipe/lvp_device.c b/src/gallium/frontends/lavapipe/lvp_device.c
+index 45734f95880..187aecde1f8 100644
+--- a/src/gallium/frontends/lavapipe/lvp_device.c
++++ b/src/gallium/frontends/lavapipe/lvp_device.c
+@@ -52,8 +52,6 @@ lvp_physical_device_init(struct lvp_physical_device *device,
+    if (!device->pscreen)
+       return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+ 
+-   fprintf(stderr, "WARNING: lavapipe is not a conformant vulkan implementation, testing use only.\n");
+-
+    device->max_images = device->pscreen->get_shader_param(device->pscreen, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_MAX_SHADER_IMAGES);
+    lvp_physical_device_get_supported_extensions(device, &device->supported_extensions);
+    result = lvp_init_wsi(device);
+@@ -575,6 +573,19 @@ void lvp_GetPhysicalDeviceProperties2(
+    }
+ }
+ 
++static void lvp_get_physical_device_queue_family_properties(
++   VkQueueFamilyProperties*                    pQueueFamilyProperties)
++{
++   *pQueueFamilyProperties = (VkQueueFamilyProperties) {
++      .queueFlags = VK_QUEUE_GRAPHICS_BIT |
++      VK_QUEUE_COMPUTE_BIT |
++      VK_QUEUE_TRANSFER_BIT,
++      .queueCount = 1,
++      .timestampValidBits = 64,
++      .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 },
++   };
++}
++
+ void lvp_GetPhysicalDeviceQueueFamilyProperties(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t*                                   pCount,
+@@ -586,15 +597,21 @@ void lvp_GetPhysicalDeviceQueueFamilyProperties(
+    }
+ 
+    assert(*pCount >= 1);
++   lvp_get_physical_device_queue_family_properties(pQueueFamilyProperties);
++}
+ 
+-   *pQueueFamilyProperties = (VkQueueFamilyProperties) {
+-      .queueFlags = VK_QUEUE_GRAPHICS_BIT |
+-      VK_QUEUE_COMPUTE_BIT |
+-      VK_QUEUE_TRANSFER_BIT,
+-      .queueCount = 1,
+-      .timestampValidBits = 64,
+-      .minImageTransferGranularity = (VkExtent3D) { 1, 1, 1 },
+-   };
++void lvp_GetPhysicalDeviceQueueFamilyProperties2(
++   VkPhysicalDevice                            physicalDevice,
++   uint32_t*                                   pCount,
++   VkQueueFamilyProperties2                   *pQueueFamilyProperties)
++{
++   if (pQueueFamilyProperties == NULL) {
++      *pCount = 1;
++      return;
++   }
++
++   assert(*pCount >= 1);
++   lvp_get_physical_device_queue_family_properties(&pQueueFamilyProperties->queueFamilyProperties);
+ }
+ 
+ void lvp_GetPhysicalDeviceMemoryProperties(
+@@ -617,6 +634,14 @@ void lvp_GetPhysicalDeviceMemoryProperties(
+    };
+ }
+ 
++void lvp_GetPhysicalDeviceMemoryProperties2(
++   VkPhysicalDevice                            physicalDevice,
++   VkPhysicalDeviceMemoryProperties2          *pMemoryProperties)
++{
++   lvp_GetPhysicalDeviceMemoryProperties(physicalDevice,
++                                         &pMemoryProperties->memoryProperties);
++}
++
+ PFN_vkVoidFunction lvp_GetInstanceProcAddr(
+    VkInstance                                  _instance,
+    const char*                                 pName)
+@@ -822,6 +847,8 @@ VkResult lvp_CreateDevice(
+    const VkAllocationCallbacks*                pAllocator,
+    VkDevice*                                   pDevice)
+ {
++   fprintf(stderr, "WARNING: lavapipe is not a conformant vulkan implementation, testing use only.\n");
++
+    LVP_FROM_HANDLE(lvp_physical_device, physical_device, physicalDevice);
+    struct lvp_device *device;
+ 
+diff --git a/src/glx/g_glxglvnddispatchfuncs.c b/src/glx/g_glxglvnddispatchfuncs.c
+index 0f02ed2d321..e0ea27c0b18 100644
+--- a/src/glx/g_glxglvnddispatchfuncs.c
++++ b/src/glx/g_glxglvnddispatchfuncs.c
+@@ -87,6 +87,7 @@ const char * const __glXDispatchTableStrings[DI_LAST_INDEX] = {
+     __ATTRIB(SelectEventSGIX),
+     // glXSwapBuffers implemented by libglvnd
+     __ATTRIB(SwapBuffersMscOML),
++    __ATTRIB(SwapIntervalEXT),
+     __ATTRIB(SwapIntervalMESA),
+     __ATTRIB(SwapIntervalSGI),
+     // glXUseXFont implemented by libglvnd
+@@ -893,6 +894,24 @@ static int dispatch_SwapIntervalMESA(unsigned int interval)
+ 
+ 
+ 
++static void dispatch_SwapIntervalEXT(Display *dpy, GLXDrawable drawable, int interval)
++{
++    PFNGLXSWAPINTERVALEXTPROC pSwapIntervalEXT;
++    __GLXvendorInfo *dd;
++
++    dd = GetDispatchFromDrawable(dpy, drawable);
++    if (dd == NULL)
++        return;
++
++    __FETCH_FUNCTION_PTR(SwapIntervalEXT);
++    if (pSwapIntervalEXT == NULL)
++        return;
++
++    pSwapIntervalEXT(dpy, drawable, interval);
++}
++
++
++
+ static Bool dispatch_WaitForMscOML(Display *dpy, GLXDrawable drawable,
+                                       int64_t target_msc, int64_t divisor,
+                                       int64_t remainder, int64_t *ust,
+@@ -974,6 +993,7 @@ const void * const __glXDispatchFunctions[DI_LAST_INDEX + 1] = {
+     __ATTRIB(ReleaseTexImageEXT),
+     __ATTRIB(SelectEventSGIX),
+     __ATTRIB(SwapBuffersMscOML),
++    __ATTRIB(SwapIntervalEXT),
+     __ATTRIB(SwapIntervalMESA),
+     __ATTRIB(SwapIntervalSGI),
+     __ATTRIB(WaitForMscOML),
+diff --git a/src/glx/g_glxglvnddispatchindices.h b/src/glx/g_glxglvnddispatchindices.h
+index 3ba50a74abb..b65d078098f 100644
+--- a/src/glx/g_glxglvnddispatchindices.h
++++ b/src/glx/g_glxglvnddispatchindices.h
+@@ -79,6 +79,7 @@ typedef enum __GLXdispatchIndex {
+     DI_SelectEventSGIX,
+     // SwapBuffers implemented by libglvnd
+     DI_SwapBuffersMscOML,
++    DI_SwapIntervalEXT,
+     DI_SwapIntervalMESA,
+     DI_SwapIntervalSGI,
+     // UseXFont implemented by libglvnd
+diff --git a/src/intel/common/gen_mi_builder.h b/src/intel/common/gen_mi_builder.h
+index ddd8459ef07..47fb98e99f7 100644
+--- a/src/intel/common/gen_mi_builder.h
++++ b/src/intel/common/gen_mi_builder.h
+@@ -932,6 +932,13 @@ gen_mi_store_address(struct gen_mi_builder *b,
+ static inline void
+ gen_mi_self_mod_barrier(struct gen_mi_builder *b)
+ {
++   /* First make sure all the memory writes from previous modifying commands
++    * have landed. We want to do this before going through the CS cache,
++    * otherwise we could be fetching memory that hasn't been written to yet.
++    */
++   gen_mi_builder_emit(b, GENX(PIPE_CONTROL), pc) {
++      pc.CommandStreamerStallEnable = true;
++   }
+    /* Documentation says Gen11+ should be able to invalidate the command cache
+     * but experiment show it doesn't work properly, so for now just get over
+     * the CS prefetch.
+diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
+index 917c3abfe9e..6896987055f 100644
+--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
++++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
+@@ -437,6 +437,7 @@ instruction_requires_packed_data(fs_inst *inst)
+    case FS_OPCODE_DDX_COARSE:
+    case FS_OPCODE_DDY_FINE:
+    case FS_OPCODE_DDY_COARSE:
++   case SHADER_OPCODE_QUAD_SWIZZLE:
+       return true;
+    default:
+       return false;
+diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
+index 6ba3a6ca97e..3a4acc1834a 100644
+--- a/src/intel/compiler/brw_ir_fs.h
++++ b/src/intel/compiler/brw_ir_fs.h
+@@ -451,13 +451,15 @@ regs_written(const fs_inst *inst)
+  * Return the number of dataflow registers read by the instruction (either
+  * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
+  * register_size)'.  The somewhat arbitrary register size unit is 4B for the
+- * UNIFORM and IMM files and 32B for all other files.
++ * UNIFORM files and 32B for all other files.
+  */
+ inline unsigned
+ regs_read(const fs_inst *inst, unsigned i)
+ {
+-   const unsigned reg_size =
+-      inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 4 : REG_SIZE;
++   if (inst->src[i].file == IMM)
++      return 1;
++
++   const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
+    return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
+                        inst->size_read(i) -
+                        MIN2(inst->size_read(i), reg_padding(inst->src[i])),
+diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
+index 9007cd00e85..48811912e95 100644
+--- a/src/intel/vulkan/anv_allocator.c
++++ b/src/intel/vulkan/anv_allocator.c
+@@ -1447,8 +1447,8 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
+     * For, Gen11+, scratch space allocation is based on the number of threads
+     * in the base configuration.
+     */
+-   if (devinfo->gen >= 12)
+-      subslices = devinfo->num_subslices[0];
++   if (devinfo->gen == 12)
++      subslices = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2);
+    else if (devinfo->gen == 11)
+       subslices = 8;
+    else if (devinfo->gen >= 9)
+diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c
+index 0290431f145..80307cd612f 100644
+--- a/src/intel/vulkan/anv_image.c
++++ b/src/intel/vulkan/anv_image.c
+@@ -684,6 +684,25 @@ choose_drm_format_mod(const struct anv_physical_device *device,
+       return NULL;
+ }
+ 
++static VkImageUsageFlags
++anv_image_create_usage(const VkImageCreateInfo *pCreateInfo,
++                       VkImageUsageFlags usage)
++{
++   /* Add TRANSFER_SRC usage for multisample attachment images. This is
++    * because we might internally use the TRANSFER_SRC layout on them for
++    * blorp operations associated with resolving those into other attachments
++    * at the end of a subpass.
++    *
++    * Without this additional usage, we compute an incorrect AUX state in
++    * anv_layout_to_aux_state().
++    */
++   if (pCreateInfo->samples > VK_SAMPLE_COUNT_1_BIT &&
++       (usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
++                 VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)))
++      usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
++   return usage;
++}
++
+ VkResult
+ anv_image_create(VkDevice _device,
+                  const struct anv_image_create_info *create_info,
+@@ -732,7 +751,7 @@ anv_image_create(VkDevice _device,
+    image->levels = pCreateInfo->mipLevels;
+    image->array_size = pCreateInfo->arrayLayers;
+    image->samples = pCreateInfo->samples;
+-   image->usage = pCreateInfo->usage;
++   image->usage = anv_image_create_usage(pCreateInfo, pCreateInfo->usage);
+    image->create_flags = pCreateInfo->flags;
+    image->tiling = pCreateInfo->tiling;
+    image->disjoint = pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT;
+@@ -745,8 +764,11 @@ anv_image_create(VkDevice _device,
+       const VkImageStencilUsageCreateInfoEXT *stencil_usage_info =
+          vk_find_struct_const(pCreateInfo->pNext,
+                               IMAGE_STENCIL_USAGE_CREATE_INFO_EXT);
+-      if (stencil_usage_info)
+-         image->stencil_usage = stencil_usage_info->stencilUsage;
++      if (stencil_usage_info) {
++         image->stencil_usage =
++            anv_image_create_usage(pCreateInfo,
++                                   stencil_usage_info->stencilUsage);
++      }
+    }
+ 
+    /* In case of external format, We don't know format yet,
+diff --git a/src/intel/vulkan/anv_pass.c b/src/intel/vulkan/anv_pass.c
+index af23b87969d..1818f6c587b 100644
+--- a/src/intel/vulkan/anv_pass.c
++++ b/src/intel/vulkan/anv_pass.c
+@@ -23,6 +23,7 @@
+ 
+ #include "anv_private.h"
+ 
++#include "vk_format_info.h"
+ #include "vk_util.h"
+ 
+ static void
+@@ -406,6 +407,70 @@ num_subpass_attachments2(const VkSubpassDescription2KHR *desc)
+           (ds_resolve && ds_resolve->pDepthStencilResolveAttachment);
+ }
+ 
++static bool
++vk_image_layout_depth_only(VkImageLayout layout)
++{
++   switch (layout) {
++   case VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL:
++   case VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL:
++      return true;
++
++   default:
++      return false;
++   }
++}
++
++/* From the Vulkan Specification 1.2.166 - VkAttachmentReference2:
++ *
++ *   "If layout only specifies the layout of the depth aspect of the
++ *    attachment, the layout of the stencil aspect is specified by the
++ *    stencilLayout member of a VkAttachmentReferenceStencilLayout structure
++ *    included in the pNext chain. Otherwise, layout describes the layout for
++ *    all relevant image aspects."
++ */
++static VkImageLayout
++stencil_ref_layout(const VkAttachmentReference2KHR *att_ref)
++{
++   if (!vk_image_layout_depth_only(att_ref->layout))
++      return att_ref->layout;
++
++   const VkAttachmentReferenceStencilLayoutKHR *stencil_ref =
++      vk_find_struct_const(att_ref->pNext,
++                           ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR);
++   if (!stencil_ref)
++      return VK_IMAGE_LAYOUT_UNDEFINED;
++   return stencil_ref->stencilLayout;
++}
++
++/* From the Vulkan Specification 1.2.166 - VkAttachmentDescription2:
++ *
++ *   "If format is a depth/stencil format, and initialLayout only specifies
++ *    the initial layout of the depth aspect of the attachment, the initial
++ *    layout of the stencil aspect is specified by the stencilInitialLayout
++ *    member of a VkAttachmentDescriptionStencilLayout structure included in
++ *    the pNext chain. Otherwise, initialLayout describes the initial layout
++ *    for all relevant image aspects."
++ */
++static VkImageLayout
++stencil_desc_layout(const VkAttachmentDescription2KHR *att_desc, bool final)
++{
++   if (!vk_format_has_stencil(att_desc->format))
++      return VK_IMAGE_LAYOUT_UNDEFINED;
++
++   const VkImageLayout main_layout =
++      final ? att_desc->finalLayout : att_desc->initialLayout;
++   if (!vk_image_layout_depth_only(main_layout))
++      return main_layout;
++
++   const VkAttachmentDescriptionStencilLayoutKHR *stencil_desc =
++      vk_find_struct_const(att_desc->pNext,
++                           ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT_KHR);
++   assert(stencil_desc);
++   return final ?
++      stencil_desc->stencilFinalLayout :
++      stencil_desc->stencilInitialLayout;
++}
++
+ VkResult anv_CreateRenderPass2(
+     VkDevice                                    _device,
+     const VkRenderPassCreateInfo2KHR*           pCreateInfo,
+@@ -450,10 +515,6 @@ VkResult anv_CreateRenderPass2(
+    pass->subpass_flushes = subpass_flushes;
+ 
+    for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
+-      const VkAttachmentDescriptionStencilLayoutKHR *stencil_layout =
+-         vk_find_struct_const(pCreateInfo->pAttachments[i].pNext,
+-                              ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT_KHR);
+-
+       pass->attachments[i] = (struct anv_render_pass_attachment) {
+          .format                 = pCreateInfo->pAttachments[i].format,
+          .samples                = pCreateInfo->pAttachments[i].samples,
+@@ -463,12 +524,10 @@ VkResult anv_CreateRenderPass2(
+          .initial_layout         = pCreateInfo->pAttachments[i].initialLayout,
+          .final_layout           = pCreateInfo->pAttachments[i].finalLayout,
+ 
+-         .stencil_initial_layout = (stencil_layout ?
+-                                    stencil_layout->stencilInitialLayout :
+-                                    pCreateInfo->pAttachments[i].initialLayout),
+-         .stencil_final_layout   = (stencil_layout ?
+-                                    stencil_layout->stencilFinalLayout :
+-                                    pCreateInfo->pAttachments[i].finalLayout),
++         .stencil_initial_layout = stencil_desc_layout(&pCreateInfo->pAttachments[i],
++                                                       false),
++         .stencil_final_layout   = stencil_desc_layout(&pCreateInfo->pAttachments[i],
++                                                       true),
+       };
+    }
+ 
+@@ -487,17 +546,11 @@ VkResult anv_CreateRenderPass2(
+          subpass_attachments += desc->inputAttachmentCount;
+ 
+          for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
+-            const VkAttachmentReferenceStencilLayoutKHR *stencil_layout =
+-               vk_find_struct_const(desc->pInputAttachments[j].pNext,
+-                                    ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR);
+-
+             subpass->input_attachments[j] = (struct anv_subpass_attachment) {
+                .usage =          VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
+                .attachment =     desc->pInputAttachments[j].attachment,
+                .layout =         desc->pInputAttachments[j].layout,
+-               .stencil_layout = (stencil_layout ?
+-                                  stencil_layout->stencilLayout :
+-                                  desc->pInputAttachments[j].layout),
++               .stencil_layout = stencil_ref_layout(&desc->pInputAttachments[j]),
+             };
+          }
+       }
+@@ -531,17 +584,11 @@ VkResult anv_CreateRenderPass2(
+       if (desc->pDepthStencilAttachment) {
+          subpass->depth_stencil_attachment = subpass_attachments++;
+ 
+-         const VkAttachmentReferenceStencilLayoutKHR *stencil_attachment =
+-            vk_find_struct_const(desc->pDepthStencilAttachment->pNext,
+-                                 ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR);
+-
+          *subpass->depth_stencil_attachment = (struct anv_subpass_attachment) {
+             .usage =          VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+             .attachment =     desc->pDepthStencilAttachment->attachment,
+             .layout =         desc->pDepthStencilAttachment->layout,
+-            .stencil_layout = stencil_attachment ?
+-                              stencil_attachment->stencilLayout :
+-                              desc->pDepthStencilAttachment->layout,
++            .stencil_layout = stencil_ref_layout(desc->pDepthStencilAttachment),
+          };
+       }
+ 
+@@ -552,17 +599,11 @@ VkResult anv_CreateRenderPass2(
+       if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment) {
+          subpass->ds_resolve_attachment = subpass_attachments++;
+ 
+-         const VkAttachmentReferenceStencilLayoutKHR *stencil_resolve_attachment =
+-            vk_find_struct_const(ds_resolve->pDepthStencilResolveAttachment->pNext,
+-                                 ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR);
+-
+          *subpass->ds_resolve_attachment = (struct anv_subpass_attachment) {
+             .usage =          VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+             .attachment =     ds_resolve->pDepthStencilResolveAttachment->attachment,
+             .layout =         ds_resolve->pDepthStencilResolveAttachment->layout,
+-            .stencil_layout = stencil_resolve_attachment ?
+-                              stencil_resolve_attachment->stencilLayout :
+-                              ds_resolve->pDepthStencilResolveAttachment->layout,
++            .stencil_layout = stencil_ref_layout(ds_resolve->pDepthStencilResolveAttachment),
+          };
+          subpass->depth_resolve_mode = ds_resolve->depthResolveMode;
+          subpass->stencil_resolve_mode = ds_resolve->stencilResolveMode;
+diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
+index a9c49e0f592..e3eb376fa5a 100644
+--- a/src/intel/vulkan/genX_cmd_buffer.c
++++ b/src/intel/vulkan/genX_cmd_buffer.c
+@@ -462,8 +462,10 @@ anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
+ {
+    uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);
+ 
++   const struct anv_surface *surface = &image->planes[plane].surface;
+    uint64_t base_address =
+-      anv_address_physical(image->planes[plane].address);
++      anv_address_physical(anv_address_add(image->planes[plane].address,
++                                           surface->offset));
+ 
+    const struct isl_surf *isl_surf = &image->planes[plane].surface.isl;
+    uint64_t format_bits = gen_aux_map_format_bits_for_isl_surf(isl_surf);
+@@ -1231,6 +1233,17 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
+             uint32_t level_layer_count =
+                MIN2(layer_count, aux_layers - base_layer);
+ 
++            /* If will_full_fast_clear is set, the caller promises to
++             * fast-clear the largest portion of the specified range as it can.
++             * For color images, that means only the first LOD and array slice.
++             */
++            if (level == 0 && base_layer == 0 && will_full_fast_clear) {
++               base_layer++;
++               level_layer_count--;
++               if (level_layer_count == 0)
++                  continue;
++            }
++
+             anv_image_ccs_op(cmd_buffer, image,
+                              image->planes[plane].surface.isl.format,
+                              ISL_SWIZZLE_IDENTITY,
+@@ -1250,6 +1263,12 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
+                           "define an MCS buffer.");
+          }
+ 
++         /* If will_full_fast_clear is set, the caller promises to fast-clear
++          * the largest portion of the specified range as it can.
++          */
++         if (will_full_fast_clear)
++            return;
++
+          assert(base_level == 0 && level_count == 1);
+          anv_image_mcs_op(cmd_buffer, image,
+                           image->planes[plane].surface.isl.format,
+diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
+index 205e8677f19..33f071019b7 100644
+--- a/src/intel/vulkan/genX_pipeline.c
++++ b/src/intel/vulkan/genX_pipeline.c
+@@ -1180,7 +1180,22 @@ emit_cb_state(struct anv_graphics_pipeline *pipeline,
+ #endif
+          .LogicOpEnable = info->logicOpEnable,
+          .LogicOpFunction = vk_to_gen_logic_op[info->logicOp],
+-         .ColorBufferBlendEnable = a->blendEnable,
++         /* Vulkan specification 1.2.168, VkLogicOp:
++          *
++          *   "Logical operations are controlled by the logicOpEnable and
++          *    logicOp members of VkPipelineColorBlendStateCreateInfo. If
++          *    logicOpEnable is VK_TRUE, then a logical operation selected by
++          *    logicOp is applied between each color attachment and the
++          *    fragment’s corresponding output value, and blending of all
++          *    attachments is treated as if it were disabled."
++          *
++          * From the Broadwell PRM Volume 2d: Command Reference: Structures:
++          * BLEND_STATE_ENTRY:
++          *
++          *   "Enabling LogicOp and Color Buffer Blending at the same time is
++          *    UNDEFINED"
++          */
++         .ColorBufferBlendEnable = !info->logicOpEnable && a->blendEnable,
+          .ColorClampRange = COLORCLAMP_RTFORMAT,
+          .PreBlendColorClampEnable = true,
+          .PostBlendColorClampEnable = true,
+diff --git a/src/intel/vulkan/vk_format_info.h b/src/intel/vulkan/vk_format_info.h
+index 006e1f4a6ad..4e72c244742 100644
+--- a/src/intel/vulkan/vk_format_info.h
++++ b/src/intel/vulkan/vk_format_info.h
+@@ -164,4 +164,11 @@ vk_format_has_depth(VkFormat format)
+    return aspects & VK_IMAGE_ASPECT_DEPTH_BIT;
+ }
+ 
++static inline bool
++vk_format_has_stencil(VkFormat format)
++{
++   const VkImageAspectFlags aspects = vk_format_aspects(format);
++   return aspects & VK_IMAGE_ASPECT_STENCIL_BIT;
++}
++
+ #endif /* VK_FORMAT_INFO_H */
+diff --git a/src/mesa/state_tracker/st_pbo.c b/src/mesa/state_tracker/st_pbo.c
+index 65a1ce8862a..b03921c1be6 100644
+--- a/src/mesa/state_tracker/st_pbo.c
++++ b/src/mesa/state_tracker/st_pbo.c
+@@ -431,16 +431,21 @@ create_fs(struct st_context *st, bool download,
+    nir_ssa_def *coord = nir_load_var(&b, fragcoord);
+ 
+    nir_ssa_def *layer = NULL;
+-   if (st->pbo.layers && need_layer && (!download || target == PIPE_TEXTURE_1D_ARRAY ||
+-                                                     target == PIPE_TEXTURE_2D_ARRAY ||
+-                                                     target == PIPE_TEXTURE_3D ||
+-                                                     target == PIPE_TEXTURE_CUBE ||
+-                                                     target == PIPE_TEXTURE_CUBE_ARRAY)) {
+-      nir_variable *var = nir_variable_create(b.shader, nir_var_shader_in,
+-                                              glsl_int_type(), "gl_Layer");
+-      var->data.location = VARYING_SLOT_LAYER;
+-      var->data.interpolation = INTERP_MODE_FLAT;
+-      layer = nir_load_var(&b, var);
++   if (st->pbo.layers && (!download || target == PIPE_TEXTURE_1D_ARRAY ||
++                                       target == PIPE_TEXTURE_2D_ARRAY ||
++                                       target == PIPE_TEXTURE_3D ||
++                                       target == PIPE_TEXTURE_CUBE ||
++                                       target == PIPE_TEXTURE_CUBE_ARRAY)) {
++      if (need_layer) {
++         nir_variable *var = nir_variable_create(b.shader, nir_var_shader_in,
++                                                glsl_int_type(), "gl_Layer");
++         var->data.location = VARYING_SLOT_LAYER;
++         var->data.interpolation = INTERP_MODE_FLAT;
++         layer = nir_load_var(&b, var);
++      }
++      else {
++         layer = zero;
++      }
+    }
+ 
+    /* offset_pos = param.xy + f2i(coord.xy) */
+diff --git a/src/util/format/u_format.csv b/src/util/format/u_format.csv
+index 8acfb869bdb..237c4c95475 100644
+--- a/src/util/format/u_format.csv
++++ b/src/util/format/u_format.csv
+@@ -500,7 +500,7 @@ PIPE_FORMAT_R4G4B4A4_UINT           , plain, 1, 1, 1, up4 , up4 , up4 , up4 , xy
+ PIPE_FORMAT_B4G4R4A4_UINT           , plain, 1, 1, 1, up4 , up4 , up4 , up4 , zyxw, rgb, up4 , up4 , up4 , up4 , yzwx
+ PIPE_FORMAT_A4R4G4B4_UINT           , plain, 1, 1, 1, up4 , up4 , up4 , up4 , yzwx, rgb, up4 , up4 , up4 , up4 , zyxw
+ PIPE_FORMAT_A4B4G4R4_UINT           , plain, 1, 1, 1, up4 , up4 , up4 , up4 , wzyx, rgb, up4 , up4 , up4 , up4 , xyzw
+-PIPE_FORMAT_A1R5G5B5_UINT           , plain, 1, 1, 1, up1 , up5 , up5 , up5 , wzyx, rgb, up5 , up5 , up5 , up1 , zyxw
++PIPE_FORMAT_A1R5G5B5_UINT           , plain, 1, 1, 1, up1 , up5 , up5 , up5 , yzwx, rgb, up5 , up5 , up5 , up1 , zyxw
+ PIPE_FORMAT_A1B5G5R5_UINT           , plain, 1, 1, 1, up1 , up5 , up5 , up5 , wzyx, rgb, up5 , up5 , up5 , up1 , xyzw
+ PIPE_FORMAT_R5G5B5A1_UINT           , plain, 1, 1, 1, up5 , up5 , up5 , up1 , xyzw, rgb, up5 , up5 , up5 , up1 , wzyx
+ PIPE_FORMAT_B5G5R5A1_UINT           , plain, 1, 1, 1, up5 , up5 , up5 , up1 , zyxw, rgb, up1 , up5 , up5 , up5 , yzwx
+diff --git a/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json b/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json
+index 1d5fffd0135..361ae9fe74e 100644
+--- a/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json
++++ b/src/vulkan/device-select-layer/VkLayer_MESA_device_select.json
+@@ -4,7 +4,7 @@
+     "name": "VK_LAYER_MESA_device_select",
+     "type": "GLOBAL",
+     "library_path": "libVkLayer_MESA_device_select.so",
+-    "api_version": "1.1.73",
++    "api_version": "1.2.73",
+     "implementation_version": "1",
+     "description": "Linux device selection layer",
+     "functions": {
diff --git a/SOURCES/nouveau-tu1xx-support.patch b/SOURCES/nouveau-tu1xx-support.patch
deleted file mode 100644
index 3254466..0000000
--- a/SOURCES/nouveau-tu1xx-support.patch
+++ /dev/null
@@ -1,10387 +0,0 @@
-diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources
-index 6c360992a53..9de8168fbd9 100644
---- a/src/gallium/drivers/nouveau/Makefile.sources
-+++ b/src/gallium/drivers/nouveau/Makefile.sources
-@@ -151,6 +151,14 @@ NVC0_CODEGEN_SOURCES := \
- 	codegen/nv50_ir_target_nvc0.h
- 
- NVC0_C_SOURCES := \
-+	nvc0/cla0c0qmd.h \
-+	nvc0/clc0c0qmd.h \
-+	nvc0/clc3c0qmd.h \
-+	nvc0/drf.h \
-+	nvc0/qmd.h \
-+	nvc0/qmda0c0.c \
-+	nvc0/qmdc0c0.c \
-+	nvc0/qmdc3c0.c \
- 	nvc0/gm107_texture.xml.h \
- 	nvc0/nvc0_3d.xml.h \
- 	nvc0/nvc0_compute.c \
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
-index 42ee969c66b..d58c0d206ec 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
-@@ -67,8 +67,10 @@ enum operation
-    OP_AND,
-    OP_OR,
-    OP_XOR,
-+   OP_LOP3_LUT,
-    OP_SHL,
-    OP_SHR,
-+   OP_SHF,
-    OP_MAX,
-    OP_MIN,
-    OP_SAT, // CLAMP(f32, 0.0, 1.0)
-@@ -116,6 +118,7 @@ enum operation
-    OP_PINTERP,
-    OP_EMIT,    // emit vertex
-    OP_RESTART, // restart primitive
-+   OP_FINAL, // finish emitting primitives
-    OP_TEX,
-    OP_TXB, // texture bias
-    OP_TXL, // texure lod
-@@ -151,7 +154,10 @@ enum operation
-    OP_INSBF,  // insert first src1[8:15] bits of src0 into src2 at src1[0:7]
-    OP_EXTBF,  // place bits [K,K+N) of src0 into dst, src1 = 0xNNKK
-    OP_BFIND,  // find highest/lowest set bit
-+   OP_BREV,   // bitfield reverse
-+   OP_BMSK,   // bitfield mask
-    OP_PERMT,  // dst = bytes from src2,src0 selected by src1 (nvc0's src order)
-+   OP_SGXT,
-    OP_ATOM,
-    OP_BAR,    // execution barrier, sources = { id, thread count, predicate }
-    OP_VADD,   // byte/word vector operations
-@@ -167,6 +173,7 @@ enum operation
-    OP_SHFL, // warp shuffle
-    OP_VOTE,
-    OP_BUFQ, // buffer query
-+   OP_WARPSYNC,
-    OP_LAST
- };
- 
-@@ -254,11 +261,29 @@ enum operation
- #define NV50_IR_SUBOP_VOTE_ALL 0
- #define NV50_IR_SUBOP_VOTE_ANY 1
- #define NV50_IR_SUBOP_VOTE_UNI 2
-+#define NV50_IR_SUBOP_LOP3_LUT_SRC0 0xf0
-+#define NV50_IR_SUBOP_LOP3_LUT_SRC1 0xcc
-+#define NV50_IR_SUBOP_LOP3_LUT_SRC2 0xaa
-+#define NV50_IR_SUBOP_LOP3_LUT(exp) ({         \
-+      uint8_t a = NV50_IR_SUBOP_LOP3_LUT_SRC0; \
-+      uint8_t b = NV50_IR_SUBOP_LOP3_LUT_SRC1; \
-+      uint8_t c = NV50_IR_SUBOP_LOP3_LUT_SRC2; \
-+      (uint8_t)(exp);                          \
-+})
-+#define NV50_IR_SUBOP_BMSK_C (0 << 0)
-+#define NV50_IR_SUBOP_BMSK_W (1 << 0)
- 
- #define NV50_IR_SUBOP_MINMAX_LOW  1
- #define NV50_IR_SUBOP_MINMAX_MED  2
- #define NV50_IR_SUBOP_MINMAX_HIGH 3
- 
-+#define NV50_IR_SUBOP_SHF_L  (0 << 0)
-+#define NV50_IR_SUBOP_SHF_R  (1 << 0)
-+#define NV50_IR_SUBOP_SHF_LO (0 << 1)
-+#define NV50_IR_SUBOP_SHF_HI (1 << 1)
-+#define NV50_IR_SUBOP_SHF_C  (0 << 2)
-+#define NV50_IR_SUBOP_SHF_W  (1 << 2)
-+
- // xmad(src0, src1, 0) << 16 + src2
- #define NV50_IR_SUBOP_XMAD_PSL (1 << 0)
- // (xmad(src0, src1, src2) & 0xffff) | (src1 << 16)
-@@ -900,7 +925,7 @@ public:
- 
-    uint16_t subOp; // quadop, 1 for mul-high, etc.
- 
--   unsigned encSize    : 4; // encoding size in bytes
-+   unsigned encSize    : 5; // encoding size in bytes
-    unsigned saturate   : 1; // to [0.0f, 1.0f]
-    unsigned join       : 1; // converge control flow (use OP_JOIN until end)
-    unsigned fixed      : 1; // prevent dead code elimination
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
-index 5dc0e24c5dc..63ea7f5e7e8 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
-@@ -29,6 +29,8 @@
- #include "tgsi/tgsi_parse.h"
- #include "tgsi/tgsi_scan.h"
- 
-+struct nir_shader_compiler_options;
-+
- /*
-  * This struct constitutes linkage information in TGSI terminology.
-  *
-@@ -70,10 +72,12 @@ struct nv50_ir_prog_symbol
-    uint32_t offset;
- };
- 
-+#define NVISA_GF100_CHIPSET    0xc0
- #define NVISA_GK104_CHIPSET    0xe0
- #define NVISA_GK20A_CHIPSET    0xea
- #define NVISA_GM107_CHIPSET    0x110
- #define NVISA_GM200_CHIPSET    0x120
-+#define NVISA_GV100_CHIPSET    0x140
- 
- struct nv50_ir_prog_info
- {
-@@ -200,6 +204,9 @@ struct nv50_ir_prog_info
- extern "C" {
- #endif
- 
-+const struct nir_shader_compiler_options *
-+nv50_ir_nir_shader_compiler_options(int chipset);
-+
- extern int nv50_ir_generate_code(struct nv50_ir_prog_info *);
- 
- extern void nv50_ir_relocate_code(void *relocData, uint32_t *code,
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
-index e244bd0d610..dd8e1ab86c4 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
-@@ -23,6 +23,7 @@
-  */
- 
- #include "codegen/nv50_ir_target_gm107.h"
-+#include "codegen/nv50_ir_sched_gm107.h"
- 
- //#define GM107_DEBUG_SCHED_DATA
- 
-@@ -170,6 +171,7 @@ private:
-    void emitBFI();
-    void emitBFE();
-    void emitFLO();
-+   void emitPRMT();
- 
-    void emitLDSTs(int, DataType);
-    void emitLDSTc(int);
-@@ -2371,6 +2373,33 @@ CodeEmitterGM107::emitFLO()
-    emitGPR  (0x00, insn->def(0));
- }
- 
-+void
-+CodeEmitterGM107::emitPRMT()
-+{
-+   switch (insn->src(1).getFile()) {
-+   case FILE_GPR:
-+      emitInsn(0x5bc00000);
-+      emitGPR (0x14, insn->src(1));
-+      break;
-+   case FILE_MEMORY_CONST:
-+      emitInsn(0x4bc00000);
-+      emitCBUF(0x22, -1, 0x14, 16, 2, insn->src(1));
-+      break;
-+   case FILE_IMMEDIATE:
-+      emitInsn(0x36c00000);
-+      emitIMMD(0x14, 19, insn->src(1));
-+      break;
-+   default:
-+      assert(!"bad src1 file");
-+      break;
-+   }
-+
-+   emitField(0x30, 3, insn->subOp);
-+   emitGPR  (0x27, insn->src(2));
-+   emitGPR  (0x08, insn->src(0));
-+   emitGPR  (0x00, insn->def(0));
-+}
-+
- /*******************************************************************************
-  * memory
-  ******************************************************************************/
-@@ -3537,6 +3566,9 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
-    case OP_BFIND:
-       emitFLO();
-       break;
-+   case OP_PERMT:
-+      emitPRMT();
-+      break;
-    case OP_SLCT:
-       if (isFloatType(insn->dType))
-          emitFCMP();
-@@ -3742,156 +3774,6 @@ CodeEmitterGM107::getMinEncodingSize(const Instruction *i) const
-  * sched data calculator
-  ******************************************************************************/
- 
--class SchedDataCalculatorGM107 : public Pass
--{
--public:
--   SchedDataCalculatorGM107(const TargetGM107 *targ) : targ(targ) {}
--
--private:
--   struct RegScores
--   {
--      struct ScoreData {
--         int r[256];
--         int p[8];
--         int c;
--      } rd, wr;
--      int base;
--
--      void rebase(const int base)
--      {
--         const int delta = this->base - base;
--         if (!delta)
--            return;
--         this->base = 0;
--
--         for (int i = 0; i < 256; ++i) {
--            rd.r[i] += delta;
--            wr.r[i] += delta;
--         }
--         for (int i = 0; i < 8; ++i) {
--            rd.p[i] += delta;
--            wr.p[i] += delta;
--         }
--         rd.c += delta;
--         wr.c += delta;
--      }
--      void wipe()
--      {
--         memset(&rd, 0, sizeof(rd));
--         memset(&wr, 0, sizeof(wr));
--      }
--      int getLatest(const ScoreData& d) const
--      {
--         int max = 0;
--         for (int i = 0; i < 256; ++i)
--            if (d.r[i] > max)
--               max = d.r[i];
--         for (int i = 0; i < 8; ++i)
--            if (d.p[i] > max)
--               max = d.p[i];
--         if (d.c > max)
--            max = d.c;
--         return max;
--      }
--      inline int getLatestRd() const
--      {
--         return getLatest(rd);
--      }
--      inline int getLatestWr() const
--      {
--         return getLatest(wr);
--      }
--      inline int getLatest() const
--      {
--         return MAX2(getLatestRd(), getLatestWr());
--      }
--      void setMax(const RegScores *that)
--      {
--         for (int i = 0; i < 256; ++i) {
--            rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
--            wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
--         }
--         for (int i = 0; i < 8; ++i) {
--            rd.p[i] = MAX2(rd.p[i], that->rd.p[i]);
--            wr.p[i] = MAX2(wr.p[i], that->wr.p[i]);
--         }
--         rd.c = MAX2(rd.c, that->rd.c);
--         wr.c = MAX2(wr.c, that->wr.c);
--      }
--      void print(int cycle)
--      {
--         for (int i = 0; i < 256; ++i) {
--            if (rd.r[i] > cycle)
--               INFO("rd $r%i @ %i\n", i, rd.r[i]);
--            if (wr.r[i] > cycle)
--               INFO("wr $r%i @ %i\n", i, wr.r[i]);
--         }
--         for (int i = 0; i < 8; ++i) {
--            if (rd.p[i] > cycle)
--               INFO("rd $p%i @ %i\n", i, rd.p[i]);
--            if (wr.p[i] > cycle)
--               INFO("wr $p%i @ %i\n", i, wr.p[i]);
--         }
--         if (rd.c > cycle)
--            INFO("rd $c @ %i\n", rd.c);
--         if (wr.c > cycle)
--            INFO("wr $c @ %i\n", wr.c);
--      }
--   };
--
--   RegScores *score; // for current BB
--   std::vector<RegScores> scoreBoards;
--
--   const TargetGM107 *targ;
--   bool visit(Function *);
--   bool visit(BasicBlock *);
--
--   void commitInsn(const Instruction *, int);
--   int calcDelay(const Instruction *, int) const;
--   void setDelay(Instruction *, int, const Instruction *);
--   void recordWr(const Value *, int, int);
--   void checkRd(const Value *, int, int&) const;
--
--   inline void emitYield(Instruction *);
--   inline void emitStall(Instruction *, uint8_t);
--   inline void emitReuse(Instruction *, uint8_t);
--   inline void emitWrDepBar(Instruction *, uint8_t);
--   inline void emitRdDepBar(Instruction *, uint8_t);
--   inline void emitWtDepBar(Instruction *, uint8_t);
--
--   inline int getStall(const Instruction *) const;
--   inline int getWrDepBar(const Instruction *) const;
--   inline int getRdDepBar(const Instruction *) const;
--   inline int getWtDepBar(const Instruction *) const;
--
--   void setReuseFlag(Instruction *);
--
--   inline void printSchedInfo(int, const Instruction *) const;
--
--   struct LiveBarUse {
--      LiveBarUse(Instruction *insn, Instruction *usei)
--         : insn(insn), usei(usei) { }
--      Instruction *insn;
--      Instruction *usei;
--   };
--
--   struct LiveBarDef {
--      LiveBarDef(Instruction *insn, Instruction *defi)
--         : insn(insn), defi(defi) { }
--      Instruction *insn;
--      Instruction *defi;
--   };
--
--   bool insertBarriers(BasicBlock *);
--
--   bool doesInsnWriteTo(const Instruction *insn, const Value *val) const;
--   Instruction *findFirstUse(const Instruction *) const;
--   Instruction *findFirstDef(const Instruction *) const;
--
--   bool needRdDepBar(const Instruction *) const;
--   bool needWrDepBar(const Instruction *) const;
--};
--
- inline void
- SchedDataCalculatorGM107::emitStall(Instruction *insn, uint8_t cnt)
- {
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.cpp
-new file mode 100644
-index 00000000000..ef33743e610
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.cpp
-@@ -0,0 +1,2052 @@
-+/*
-+ * Copyright 2020 Red Hat Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+#include "codegen/nv50_ir_emit_gv100.h"
-+#include "codegen/nv50_ir_sched_gm107.h"
-+
-+namespace nv50_ir {
-+
-+/*******************************************************************************
-+ * instruction format helpers
-+ ******************************************************************************/
-+
-+#define FA_NODEF (1 << 0)
-+#define FA_RRR   (1 << 1)
-+#define FA_RRI   (1 << 2)
-+#define FA_RRC   (1 << 3)
-+#define FA_RIR   (1 << 4)
-+#define FA_RCR   (1 << 5)
-+
-+#define FA_SRC_MASK 0x0ff
-+#define FA_SRC_NEG  0x100
-+#define FA_SRC_ABS  0x200
-+
-+#define EMPTY -1
-+#define __(a) (a) // no source modifiers
-+#define _A(a) ((a) | FA_SRC_ABS)
-+#define N_(a) ((a) | FA_SRC_NEG)
-+#define NA(a) ((a) | FA_SRC_NEG | FA_SRC_ABS)
-+
-+void
-+CodeEmitterGV100::emitFormA_I32(int src)
-+{
-+   emitIMMD(32, 32, insn->src(src));
-+   if (insn->src(src).mod.abs())
-+      code[1] &= 0x7fffffff;
-+   if (insn->src(src).mod.neg())
-+      code[1] ^= 0x80000000;
-+}
-+
-+void
-+CodeEmitterGV100::emitFormA_RRC(uint16_t op, int src1, int src2)
-+{
-+   emitInsn(op);
-+   if (src1 >= 0) {
-+      emitNEG (75, (src1 & FA_SRC_MASK), (src1 & FA_SRC_NEG));
-+      emitABS (74, (src1 & FA_SRC_MASK), (src1 & FA_SRC_ABS));
-+      emitGPR (64, insn->src(src1 & FA_SRC_MASK));
-+   }
-+   if (src2 >= 0) {
-+      emitNEG (63, (src2 & FA_SRC_MASK), (src2 & FA_SRC_NEG));
-+      emitABS (62, (src2 & FA_SRC_MASK), (src2 & FA_SRC_ABS));
-+      emitCBUF(54, -1, 38, 0, 2, insn->src(src2 & FA_SRC_MASK));
-+   }
-+}
-+
-+void
-+CodeEmitterGV100::emitFormA_RRI(uint16_t op, int src1, int src2)
-+{
-+   emitInsn(op);
-+   if (src1 >= 0) {
-+      emitNEG (75, (src1 & FA_SRC_MASK), (src1 & FA_SRC_NEG));
-+      emitABS (74, (src1 & FA_SRC_MASK), (src1 & FA_SRC_ABS));
-+      emitGPR (64, insn->src(src1 & FA_SRC_MASK));
-+   }
-+   if (src2 >= 0)
-+      emitFormA_I32(src2 & FA_SRC_MASK);
-+}
-+
-+void
-+CodeEmitterGV100::emitFormA_RRR(uint16_t op, int src1, int src2)
-+{
-+   emitInsn(op);
-+   if (src2 >= 0) {
-+      emitNEG (75, (src2 & FA_SRC_MASK), (src2 & FA_SRC_NEG));
-+      emitABS (74, (src2 & FA_SRC_MASK), (src2 & FA_SRC_ABS));
-+      emitGPR (64, insn->src(src2 & FA_SRC_MASK));
-+   }
-+
-+   if (src1 >= 0) {
-+      emitNEG (63, (src1 & FA_SRC_MASK), (src1 & FA_SRC_NEG));
-+      emitABS (62, (src1 & FA_SRC_MASK), (src1 & FA_SRC_ABS));
-+      emitGPR (32, insn->src(src1 & FA_SRC_MASK));
-+   }
-+}
-+
-+void
-+CodeEmitterGV100::emitFormA(uint16_t op, uint8_t forms,
-+                            int src0, int src1, int src2)
-+{
-+   switch ((src1 < 0) ? FILE_GPR : insn->src(src1 & FA_SRC_MASK).getFile()) {
-+   case FILE_GPR:
-+      switch ((src2 < 0) ? FILE_GPR : insn->src(src2 & FA_SRC_MASK).getFile()) {
-+      case FILE_GPR:
-+         assert(forms & FA_RRR);
-+         emitFormA_RRR((1 << 9) | op, src1, src2);
-+         break;
-+      case FILE_IMMEDIATE:
-+         assert(forms & FA_RRI);
-+         emitFormA_RRI((2 << 9) | op, src1, src2);
-+         break;
-+      case FILE_MEMORY_CONST:
-+         assert(forms & FA_RRC);
-+         emitFormA_RRC((3 << 9) | op, src1, src2);
-+         break;
-+      default:
-+         assert(!"bad src2 file");
-+         break;
-+      }
-+      break;
-+   case FILE_IMMEDIATE:
-+      assert((src2 < 0) || insn->src(src2 & FA_SRC_MASK).getFile() == FILE_GPR);
-+      assert(forms & FA_RIR);
-+      emitFormA_RRI((4 << 9) | op, src2, src1);
-+      break;
-+   case FILE_MEMORY_CONST:
-+      assert((src2 < 0) || insn->src(src2 & FA_SRC_MASK).getFile() == FILE_GPR);
-+      assert(forms & FA_RCR);
-+      emitFormA_RRC((5 << 9) | op, src2, src1);
-+      break;
-+   default:
-+      assert(!"bad src1 file");
-+      break;
-+   }
-+
-+   if (src0 >= 0) {
-+      assert(insn->src(src0 & FA_SRC_MASK).getFile() == FILE_GPR);
-+      emitABS(73, (src0 & FA_SRC_MASK), (src0 & FA_SRC_ABS));
-+      emitNEG(72, (src0 & FA_SRC_MASK), (src0 & FA_SRC_NEG));
-+      emitGPR(24, insn->src(src0 & FA_SRC_MASK));
-+   }
-+
-+   if (!(forms & FA_NODEF))
-+      emitGPR(16, insn->def(0));
-+}
-+
-+/*******************************************************************************
-+ * control
-+ ******************************************************************************/
-+
-+void
-+CodeEmitterGV100::emitBRA()
-+{
-+   const FlowInstruction *insn = this->insn->asFlow();
-+   int64_t target = ((int64_t)insn->target.bb->binPos - (codeSize + 0x10)) / 4;
-+
-+   assert(!insn->indirect && !insn->absolute);
-+
-+   emitInsn (0x947);
-+   emitField(34, 48, target);
-+   emitPRED (87);
-+   emitField(86, 2, 0); // ./.INC/.DEC
-+}
-+
-+void
-+CodeEmitterGV100::emitEXIT()
-+{
-+   emitInsn (0x94d);
-+   emitNOT  (90);
-+   emitPRED (87);
-+   emitField(85, 1, 0); // .NO_ATEXIT
-+   emitField(84, 2, 0); // ./.KEEPREFCOUNT/.PREEMPTED/.INVALID3
-+}
-+
-+void
-+CodeEmitterGV100::emitKILL()
-+{
-+   emitInsn(0x95b);
-+   emitPRED(87);
-+}
-+
-+void
-+CodeEmitterGV100::emitNOP()
-+{
-+   emitInsn(0x918);
-+}
-+
-+void
-+CodeEmitterGV100::emitWARPSYNC()
-+{
-+   emitFormA(0x148, FA_NODEF | FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY);
-+   emitNOT  (90);
-+   emitPRED (87);
-+}
-+
-+/*******************************************************************************
-+ * movement / conversion
-+ ******************************************************************************/
-+
-+void
-+CodeEmitterGV100::emitCS2R()
-+{
-+   emitInsn(0x805);
-+   emitSYS (72, insn->src(0));
-+   emitGPR (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitF2F()
-+{
-+   if (typeSizeof(insn->sType) != 8 && typeSizeof(insn->dType) != 8)
-+      emitFormA(0x104, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY);
-+   else
-+      emitFormA(0x110, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY);
-+   emitField(84, 2, util_logbase2(typeSizeof(insn->sType)));
-+   emitFMZ  (80, 1);
-+   emitRND  (78);
-+   emitField(75, 2, util_logbase2(typeSizeof(insn->dType)));
-+   emitField(60, 2, insn->subOp); // ./.H1/.INVALID2/.INVALID3
-+}
-+
-+void
-+CodeEmitterGV100::emitF2I()
-+{
-+   if (typeSizeof(insn->sType) != 8 && typeSizeof(insn->dType) != 8)
-+      emitFormA(0x105, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY);
-+   else
-+      emitFormA(0x111, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY);
-+   emitField(84, 2, util_logbase2(typeSizeof(insn->sType)));
-+   emitFMZ  (80, 1);
-+   emitRND  (78);
-+   emitField(77, 1, 0); // .NTZ
-+   emitField(75, 2, util_logbase2(typeSizeof(insn->dType)));
-+   emitField(72, 1, isSignedType(insn->dType));
-+}
-+
-+void
-+CodeEmitterGV100::emitFRND()
-+{
-+   int subop = 0;
-+
-+   switch (insn->op) {
-+   case OP_CVT:
-+      switch (insn->rnd) {
-+      case ROUND_NI: subop = 0; break;
-+      case ROUND_MI: subop = 1; break;
-+      case ROUND_PI: subop = 2; break;
-+      case ROUND_ZI: subop = 3; break;
-+      default:
-+         assert(!"invalid FRND mode");
-+         break;
-+      }
-+      break;
-+   case OP_FLOOR: subop = 1; break;
-+   case OP_CEIL : subop = 2; break;
-+   case OP_TRUNC: subop = 3; break;
-+   default:
-+      assert(!"invalid FRND opcode");
-+      break;
-+   }
-+
-+   if (typeSizeof(insn->sType) != 8 && typeSizeof(insn->dType) != 8)
-+      emitFormA(0x107, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY);
-+   else
-+      emitFormA(0x113, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY);
-+   emitField(84, 2, util_logbase2(typeSizeof(insn->sType)));
-+   emitFMZ  (80, 1);
-+   emitField(78, 2, subop);
-+   emitField(75, 2, util_logbase2(typeSizeof(insn->dType)));
-+}
-+
-+void
-+CodeEmitterGV100::emitI2F()
-+{
-+   if (typeSizeof(insn->sType) != 8 && typeSizeof(insn->dType) != 8)
-+      emitFormA(0x106, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY);
-+   else
-+      emitFormA(0x112, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY);
-+   emitField(84, 2, util_logbase2(typeSizeof(insn->sType)));
-+   emitRND  (78);
-+   emitField(75, 2, util_logbase2(typeSizeof(insn->dType)));
-+   emitField(74, 1, isSignedType(insn->sType));
-+   if (typeSizeof(insn->sType) == 2)
-+      emitField(60, 2, insn->subOp >> 1);
-+   else
-+      emitField(60, 2, insn->subOp); // ./.B1/.B2/.B3
-+}
-+
-+void
-+CodeEmitterGV100::emitMOV()
-+{
-+   switch (insn->def(0).getFile()) {
-+   case FILE_GPR:
-+      switch (insn->src(0).getFile()) {
-+      case FILE_GPR:
-+      case FILE_MEMORY_CONST:
-+      case FILE_IMMEDIATE:
-+         emitFormA(0x002, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY);
-+         emitField(72, 4, insn->lanes);
-+         break;
-+      case FILE_PREDICATE:
-+         emitInsn (0x807);
-+         emitGPR  (16, insn->def(0));
-+         emitGPR  (24);
-+         emitField(32, 32, 0xffffffff);
-+         emitField(90,  1, 1);
-+         emitPRED (87, insn->src(0));
-+         break;
-+      default:
-+         assert(!"bad src file");
-+         break;
-+      }
-+      break;
-+   case FILE_PREDICATE:
-+      emitInsn (0x20c);
-+      emitPRED (87);
-+      emitPRED (84);
-+      emitNOT  (71);
-+      emitPRED (68);
-+      emitPRED (81, insn->def(0));
-+      emitCond3(76, CC_NE);
-+      emitGPR  (24, insn->src(0));
-+      emitGPR  (32);
-+      break;
-+   default:
-+      assert(!"bad dst file");
-+      break;
-+   }
-+}
-+
-+void
-+CodeEmitterGV100::emitPRMT()
-+{
-+   emitFormA(0x016, FA_RRR | FA_RRI | FA_RRC | FA_RIR | FA_RCR, __(0), __(1), __(2));
-+   emitField(72, 3, insn->subOp);
-+}
-+
-+void
-+CodeEmitterGV100::emitS2R()
-+{
-+   emitInsn(0x919);
-+   emitSYS (72, insn->src(0));
-+   emitGPR (16, insn->def(0));
-+}
-+
-+static void
-+selpFlip(const FixupEntry *entry, uint32_t *code, const FixupData& data)
-+{
-+   int loc = entry->loc;
-+   if (data.force_persample_interp)
-+      code[loc + 2] |= 1 << 26;
-+   else
-+      code[loc + 2] &= ~(1 << 26);
-+}
-+
-+void
-+CodeEmitterGV100::emitSEL()
-+{
-+   emitFormA(0x007, FA_RRR | FA_RIR | FA_RCR, __(0), __(1), EMPTY);
-+   emitNOT  (90, insn->src(2));
-+   emitPRED (87, insn->src(2));
-+   if (insn->subOp == 1)
-+      addInterp(0, 0, selpFlip);
-+}
-+
-+void
-+CodeEmitterGV100::emitSHFL()
-+{
-+   switch (insn->src(1).getFile()) {
-+   case FILE_GPR:
-+      switch (insn->src(2).getFile()) {
-+      case FILE_GPR:
-+         emitInsn(0x389);
-+         emitGPR (64, insn->src(2));
-+         break;
-+      case FILE_IMMEDIATE:
-+         emitInsn(0x589);
-+         emitIMMD(40, 13, insn->src(2));
-+         break;
-+      default:
-+         assert(!"bad src2 file");
-+         break;
-+      }
-+      emitGPR(32, insn->src(1));
-+      break;
-+   case FILE_IMMEDIATE:
-+      switch (insn->src(2).getFile()) {
-+      case FILE_GPR:
-+         emitInsn(0x989);
-+         emitGPR (64, insn->src(2));
-+         break;
-+      case FILE_IMMEDIATE:
-+         emitInsn(0xf89);
-+         emitIMMD(40, 13, insn->src(2));
-+         break;
-+      default:
-+         assert(!"bad src2 file");
-+         break;
-+      }
-+      emitIMMD(53, 5, insn->src(1));
-+      break;
-+   default:
-+      assert(!"bad src1 file");
-+      break;
-+   }
-+
-+   if (insn->defExists(1))
-+      emitPRED(81, insn->def(1));
-+   else
-+      emitPRED(81);
-+
-+   emitField(58, 2, insn->subOp);
-+   emitGPR  (24, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+/*******************************************************************************
-+ * fp32
-+ ******************************************************************************/
-+
-+void
-+CodeEmitterGV100::emitFADD()
-+{
-+   if (insn->src(1).getFile() == FILE_GPR)
-+      emitFormA(0x021, FA_RRR         , NA(0), NA(1), EMPTY);
-+   else
-+      emitFormA(0x021, FA_RRI | FA_RRC, NA(0), EMPTY, NA(1));
-+   emitFMZ  (80, 1);
-+   emitRND  (78);
-+   emitSAT  (77);
-+}
-+
-+void
-+CodeEmitterGV100::emitFFMA()
-+{
-+   emitFormA(0x023, FA_RRR | FA_RRI | FA_RRC | FA_RIR | FA_RCR, NA(0), NA(1), NA(2));
-+   emitField(80, 1, insn->ftz);
-+   emitRND  (78);
-+   emitSAT  (77);
-+   emitField(76, 1, insn->dnz);
-+}
-+
-+void
-+CodeEmitterGV100::emitFMNMX()
-+{
-+   emitFormA(0x009, FA_RRR | FA_RIR | FA_RCR, NA(0), NA(1), EMPTY);
-+   emitField(90, 1, insn->op == OP_MAX);
-+   emitPRED (87);
-+   emitFMZ  (80, 1);
-+}
-+
-+void
-+CodeEmitterGV100::emitFMUL()
-+{
-+   emitFormA(0x020, FA_RRR | FA_RIR | FA_RCR, NA(0), NA(1), EMPTY);
-+   emitField(80, 1, insn->ftz);
-+   emitPDIV (84);
-+   emitRND  (78);
-+   emitSAT  (77);
-+   emitField(76, 1, insn->dnz);
-+}
-+
-+void
-+CodeEmitterGV100::emitFSET_BF()
-+{
-+   const CmpInstruction *insn = this->insn->asCmp();
-+
-+   emitFormA(0x00a, FA_RRR | FA_RIR | FA_RCR, NA(0), NA(1), EMPTY);
-+   emitFMZ  (80, 1);
-+   emitCond4(76, insn->setCond);
-+
-+   if (insn->op != OP_SET) {
-+      switch (insn->op) {
-+      case OP_SET_AND: emitField(74, 2, 0); break;
-+      case OP_SET_OR : emitField(74, 2, 1); break;
-+      case OP_SET_XOR: emitField(74, 2, 2); break;
-+      default:
-+         assert(!"invalid set op");
-+         break;
-+      }
-+      emitNOT (90, insn->src(2));
-+      emitPRED(87, insn->src(2));
-+   } else {
-+      emitPRED(87);
-+   }
-+}
-+
-+void
-+CodeEmitterGV100::emitFSETP()
-+{
-+   const CmpInstruction *insn = this->insn->asCmp();
-+
-+   emitFormA(0x00b, FA_NODEF | FA_RRR | FA_RIR | FA_RCR, NA(0), NA(1), EMPTY);
-+   emitFMZ  (80, 1);
-+   emitCond4(76, insn->setCond);
-+
-+   if (insn->op != OP_SET) {
-+      switch (insn->op) {
-+      case OP_SET_AND: emitField(74, 2, 0); break;
-+      case OP_SET_OR : emitField(74, 2, 1); break;
-+      case OP_SET_XOR: emitField(74, 2, 2); break;
-+      default:
-+         assert(!"invalid set op");
-+         break;
-+      }
-+      emitNOT (90, insn->src(2));
-+      emitPRED(87, insn->src(2));
-+   } else {
-+      emitPRED(87);
-+   }
-+
-+   if (insn->defExists(1))
-+      emitPRED(84, insn->def(1));
-+   else
-+      emitPRED(84);
-+   emitPRED(81, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitFSWZADD()
-+{
-+   uint8_t subOp = 0;
-+
-+   // NP/PN swapped vs SM60
-+   for (int i = 0; i < 4; i++) {
-+      uint8_t p = ((insn->subOp >> (i * 2)) & 3);
-+      if (p == 1 || p == 2)
-+         p ^= 3;
-+      subOp |= p << (i * 2);
-+   }
-+
-+   emitInsn (0x822);
-+   emitFMZ  (80, 1);
-+   emitRND  (78);
-+   emitField(77, 1, insn->lanes); /* abused for .ndv */
-+   emitGPR  (64, insn->src(1));
-+   emitField(32, 8, subOp);
-+   emitGPR  (24, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitMUFU()
-+{
-+   int mufu = 0;
-+
-+   switch (insn->op) {
-+   case OP_COS : mufu = 0; break;
-+   case OP_SIN : mufu = 1; break;
-+   case OP_EX2 : mufu = 2; break;
-+   case OP_LG2 : mufu = 3; break;
-+   case OP_RCP : mufu = 4 + 2 * insn->subOp; break;
-+   case OP_RSQ : mufu = 5 + 2 * insn->subOp; break;
-+   case OP_SQRT: mufu = 8; break;
-+   default:
-+      assert(!"invalid mufu");
-+      break;
-+   }
-+
-+   emitFormA(0x108, FA_RRR | FA_RIR | FA_RCR, EMPTY, NA(0), EMPTY);
-+   emitField(74, 4, mufu);
-+}
-+
-+/*******************************************************************************
-+ * fp64
-+ ******************************************************************************/
-+
-+void
-+CodeEmitterGV100::emitDADD()
-+{
-+   emitFormA(0x029, FA_RRR | FA_RRI | FA_RRC, NA(0), EMPTY, NA(1));
-+   emitRND(78);
-+}
-+
-+void
-+CodeEmitterGV100::emitDFMA()
-+{
-+   emitFormA(0x02b, FA_RRR | FA_RRI | FA_RRC | FA_RIR | FA_RCR, NA(0), NA(1), NA(2));
-+   emitRND(78);
-+}
-+
-+void
-+CodeEmitterGV100::emitDMUL()
-+{
-+   emitFormA(0x028, FA_RRR | FA_RIR | FA_RCR, NA(0), NA(1), EMPTY);
-+   emitRND(78);
-+}
-+
-+void
-+CodeEmitterGV100::emitDSETP()
-+{
-+   const CmpInstruction *insn = this->insn->asCmp();
-+
-+   if (insn->src(1).getFile() == FILE_GPR)
-+      emitFormA(0x02a, FA_NODEF | FA_RRR         , NA(0), NA(1), EMPTY);
-+   else
-+      emitFormA(0x02a, FA_NODEF | FA_RRI | FA_RRC, NA(0), EMPTY, NA(1));
-+
-+   if (insn->op != OP_SET) {
-+      switch (insn->op) {
-+      case OP_SET_AND: emitField(74, 2, 0); break;
-+      case OP_SET_OR : emitField(74, 2, 1); break;
-+      case OP_SET_XOR: emitField(74, 2, 2); break;
-+      default:
-+         assert(!"invalid set op");
-+         break;
-+      }
-+      emitNOT (90, insn->src(2));
-+      emitPRED(87, insn->src(2));
-+   } else {
-+      emitPRED(87);
-+   }
-+
-+   if (insn->defExists(1))
-+      emitPRED(84, insn->def(1));
-+   else
-+      emitPRED(84);
-+   emitPRED (81, insn->def(0));
-+   emitCond4(76, insn->setCond);
-+}
-+
-+/*******************************************************************************
-+ * integer
-+ ******************************************************************************/
-+
-+void
-+CodeEmitterGV100::emitBMSK()
-+{
-+   emitFormA(0x01b, FA_RRR | FA_RIR | FA_RCR, __(0), __(1), EMPTY);
-+   emitField(75, 1, insn->subOp); // .C/.W
-+}
-+
-+void
-+CodeEmitterGV100::emitBREV()
-+{
-+   emitFormA(0x101, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY);
-+}
-+
-+void
-+CodeEmitterGV100::emitFLO()
-+{
-+   emitFormA(0x100, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY);
-+   emitPRED (81);
-+   emitField(74, 1, insn->subOp == NV50_IR_SUBOP_BFIND_SAMT);
-+   emitField(73, 1, isSignedType(insn->dType));
-+   emitNOT  (63, insn->src(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitIABS()
-+{
-+   emitFormA(0x013, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY);
-+}
-+
-+void
-+CodeEmitterGV100::emitIADD3()
-+{
-+//   emitFormA(0x010, FA_RRR | FA_RIR | FA_RCR, N_(0), N_(1), N_(2));
-+   emitFormA(0x010, FA_RRR | FA_RIR | FA_RCR, N_(0), N_(1), EMPTY);
-+   emitGPR  (64); //XXX: fix when switching back to N_(2)
-+   emitPRED (84, NULL); // .CC1
-+   emitPRED (81, insn->flagsDef >= 0 ? insn->getDef(insn->flagsDef) : NULL);
-+   if (insn->flagsSrc >= 0) {
-+      emitField(74, 1, 1); // .X
-+      emitPRED (87, insn->getSrc(insn->flagsSrc));
-+      emitField(77, 4, 0xf); // .X1
-+   }
-+}
-+
-+void
-+CodeEmitterGV100::emitIMAD()
-+{
-+   emitFormA(0x024, FA_RRR | FA_RRI | FA_RRC | FA_RIR | FA_RCR, __(0), __(1), N_(2));
-+   emitField(73, 1, isSignedType(insn->sType));
-+}
-+
-+void
-+CodeEmitterGV100::emitIMAD_WIDE()
-+{
-+   emitFormA(0x025, FA_RRR |          FA_RRC | FA_RIR | FA_RCR, __(0), __(1), N_(2));
-+   emitPRED (81);
-+   emitField(73, 1, isSignedType(insn->sType));
-+}
-+
-+void
-+CodeEmitterGV100::emitISETP()
-+{
-+   const CmpInstruction *insn = this->insn->asCmp();
-+
-+   emitFormA(0x00c, FA_NODEF | FA_RRR | FA_RIR | FA_RCR, __(0), __(1), EMPTY);
-+
-+   if (insn->op != OP_SET) {
-+      switch (insn->op) {
-+      case OP_SET_AND: emitField(74, 2, 0); break;
-+      case OP_SET_OR : emitField(74, 2, 1); break;
-+      case OP_SET_XOR: emitField(74, 2, 2); break;
-+      default:
-+         assert(!"invalid set op");
-+         break;
-+      }
-+      emitNOT (90, insn->src(2));
-+      emitPRED(87, insn->src(2));
-+   } else {
-+      emitPRED(87);
-+   }
-+
-+   //XXX: CC->pred
-+   if (insn->flagsSrc >= 0) {
-+      assert(0);
-+      emitField(68, 4, 6);
-+   } else {
-+      emitNOT (71);
-+      if (!insn->subOp)
-+         emitPRED(68);
-+   }
-+
-+   if (insn->defExists(1))
-+      emitPRED(84, insn->def(1));
-+   else
-+      emitPRED(84);
-+   emitPRED (81, insn->def(0));
-+   emitCond3(76, insn->setCond);
-+   emitField(73, 1, isSignedType(insn->sType));
-+
-+   if (insn->subOp) { // .EX
-+      assert(0);
-+      emitField(72, 1, 1);
-+      emitPRED (68, insn->srcExists(3) ? insn->src(3) : insn->src(2));
-+   }
-+}
-+
-+void
-+CodeEmitterGV100::emitLEA()
-+{
-+   assert(insn->src(1).get()->asImm());
-+
-+   emitFormA(0x011, FA_RRR | FA_RIR | FA_RCR, N_(0), N_(2), EMPTY);
-+   emitPRED (81);
-+   emitIMMD (75, 5, insn->src(1));
-+   emitGPR  (64);
-+}
-+
-+void
-+CodeEmitterGV100::emitLOP3_LUT()
-+{
-+   emitFormA(0x012, FA_RRR | FA_RIR | FA_RCR, __(0), __(1), __(2));
-+   emitField(90, 1, 1);
-+   emitPRED (87);
-+   emitPRED (81);
-+   emitField(80, 1, 0); // .PAND
-+   emitField(72, 8, insn->subOp);
-+}
-+
-+void
-+CodeEmitterGV100::emitPOPC()
-+{
-+   emitFormA(0x109, FA_RRR | FA_RIR | FA_RCR, EMPTY, __(0), EMPTY);
-+   emitNOT  (63, insn->src(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitSGXT()
-+{
-+   emitFormA(0x01a, FA_RRR | FA_RIR | FA_RCR, __(0), __(1), EMPTY);
-+   emitField(75, 1, 0); // .W
-+   emitField(73, 1, 1); // /.U32
-+}
-+
-+void
-+CodeEmitterGV100::emitSHF()
-+{
-+   emitFormA(0x019, FA_RRR | FA_RRI | FA_RRC | FA_RIR | FA_RCR, __(0), __(1), __(2));
-+   emitField(80, 1, !!(insn->subOp & NV50_IR_SUBOP_SHF_HI));
-+   emitField(76, 1, !!(insn->subOp & NV50_IR_SUBOP_SHF_R));
-+   emitField(75, 1, !!(insn->subOp & NV50_IR_SUBOP_SHF_W));
-+
-+   switch (insn->sType) {
-+   case TYPE_S64: emitField(73, 2, 0); break;
-+   case TYPE_U64: emitField(73, 2, 1); break;
-+   case TYPE_S32: emitField(73, 2, 2); break;
-+   case TYPE_U32:
-+   default:
-+      emitField(73, 2, 3);
-+      break;
-+   }
-+}
-+
-+/*******************************************************************************
-+ * load/stores
-+ ******************************************************************************/
-+
-+void
-+CodeEmitterGV100::emitALD()
-+{
-+   emitInsn (0x321);
-+   emitField(74, 2, (insn->getDef(0)->reg.size / 4) - 1);
-+   emitGPR  (32, insn->src(0).getIndirect(1));
-+   emitO    (79);
-+   emitP    (76);
-+   emitADDR (24, 40, 10, 0, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitAST()
-+{
-+   emitInsn (0x322);
-+   emitField(74, 2, (typeSizeof(insn->dType) / 4) - 1);
-+   emitGPR  (64, insn->src(0).getIndirect(1));
-+   emitP    (76);
-+   emitADDR (24, 40, 10, 0, insn->src(0));
-+   emitGPR  (32, insn->src(1));
-+}
-+
-+void
-+CodeEmitterGV100::emitATOM()
-+{
-+   unsigned subOp, dType;
-+
-+   if (insn->subOp != NV50_IR_SUBOP_ATOM_CAS) {
-+      emitInsn(0x38a);
-+
-+      if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH)
-+         subOp = 8;
-+      else
-+         subOp = insn->subOp;
-+      emitField(87, 4, subOp);
-+
-+      switch (insn->dType) {
-+      case TYPE_U32 : dType = 0; break;
-+      case TYPE_S32 : dType = 1; break;
-+      case TYPE_U64 : dType = 2; break;
-+      case TYPE_F32 : dType = 3; break;
-+      case TYPE_B128: dType = 4; break;
-+      case TYPE_S64 : dType = 5; break;
-+      default:
-+         assert(!"unexpected dType");
-+         dType = 0;
-+         break;
-+      }
-+      emitField(73, 3, dType);
-+   } else {
-+      emitInsn(0x38b);
-+
-+      switch (insn->dType) {
-+      case TYPE_U32: dType = 0; break;
-+      case TYPE_U64: dType = 2; break;
-+      default:
-+         assert(!"unexpected dType");
-+         dType = 0;
-+         break;
-+      }
-+      emitField(73, 3, dType);
-+      emitGPR  (64, insn->src(2));
-+   }
-+
-+   emitPRED (81);
-+   emitField(79, 2, 1);
-+   emitField(72, 1, insn->src(0).getIndirect(0)->getSize() == 8);
-+   emitGPR  (32, insn->src(1));
-+   emitADDR (24, 40, 24, 0, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitATOMS()
-+{
-+   unsigned dType, subOp;
-+
-+   if (insn->subOp == NV50_IR_SUBOP_ATOM_CAS) {
-+      switch (insn->dType) {
-+      case TYPE_U32: dType = 0; break;
-+      case TYPE_S32: dType = 1; break;
-+      case TYPE_U64: dType = 2; break;
-+      default: assert(!"unexpected dType"); dType = 0; break;
-+      }
-+
-+      emitInsn (0x38d);
-+      emitField(87, 1, 0); // ATOMS.CAS/ATOMS.CAST
-+      emitField(73, 2, dType);
-+      emitGPR  (64, insn->src(2));
-+   } else {
-+      emitInsn(0x38c);
-+
-+      if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH)
-+         subOp = 8;
-+      else
-+         subOp = insn->subOp;
-+      emitField(87, 4, subOp);
-+
-+      switch (insn->dType) {
-+      case TYPE_U32: dType = 0; break;
-+      case TYPE_S32: dType = 1; break;
-+      case TYPE_U64: dType = 2; break;
-+      default: assert(!"unexpected dType"); dType = 0; break;
-+      }
-+
-+      emitField(73, 2, dType);
-+   }
-+
-+   emitGPR  (32, insn->src(1));
-+   emitADDR (24, 40, 24, 0, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+static void
-+interpApply(const FixupEntry *entry, uint32_t *code, const FixupData& data)
-+{
-+   int ipa = entry->ipa;
-+   int loc = entry->loc;
-+
-+   if (data.force_persample_interp &&
-+       (ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
-+       (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
-+      ipa |= NV50_IR_INTERP_CENTROID;
-+   }
-+
-+   int sample;
-+   switch (ipa & NV50_IR_INTERP_SAMPLE_MASK) {
-+   case NV50_IR_INTERP_DEFAULT : sample = 0; break;
-+   case NV50_IR_INTERP_CENTROID: sample = 1; break;
-+   case NV50_IR_INTERP_OFFSET  : sample = 2; break;
-+   default: assert(!"invalid sample mode");
-+   }
-+
-+   int interp;
-+   switch (ipa & NV50_IR_INTERP_MODE_MASK) {
-+   case NV50_IR_INTERP_LINEAR     :
-+   case NV50_IR_INTERP_PERSPECTIVE: interp = 0; break;
-+   case NV50_IR_INTERP_FLAT       : interp = 1; break;
-+   case NV50_IR_INTERP_SC         : interp = 2; break;
-+   default: assert(!"invalid ipa mode");
-+   }
-+
-+   code[loc + 2] &= ~(0xf << 12);
-+   code[loc + 2] |= sample << 12;
-+   code[loc + 2] |= interp << 14;
-+}
-+
-+void
-+CodeEmitterGV100::emitIPA()
-+{
-+   emitInsn (0x326);
-+   emitPRED (81, insn->defExists(1) ? insn->def(1) : NULL);
-+
-+   switch (insn->getInterpMode()) {
-+   case NV50_IR_INTERP_LINEAR     :
-+   case NV50_IR_INTERP_PERSPECTIVE: emitField(78, 2, 0); break;
-+   case NV50_IR_INTERP_FLAT       : emitField(78, 2, 1); break;
-+   case NV50_IR_INTERP_SC         : emitField(78, 2, 2); break;
-+   default:
-+      assert(!"invalid ipa mode");
-+      break;
-+   }
-+
-+   switch (insn->getSampleMode()) {
-+   case NV50_IR_INTERP_DEFAULT : emitField(76, 2, 0); break;
-+   case NV50_IR_INTERP_CENTROID: emitField(76, 2, 1); break;
-+   case NV50_IR_INTERP_OFFSET  : emitField(76, 2, 2); break;
-+   default:
-+      assert(!"invalid sample mode");
-+      break;
-+   }
-+
-+   if (insn->getSampleMode() != NV50_IR_INTERP_OFFSET) {
-+      emitGPR  (32);
-+      addInterp(insn->ipa, 0xff, interpApply);
-+   } else {
-+      emitGPR  (32, insn->src(1));
-+      addInterp(insn->ipa, insn->getSrc(1)->reg.data.id, interpApply);
-+   }
-+
-+   assert(!insn->src(0).isIndirect(0));
-+   emitADDR (-1, 64, 8, 2, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitISBERD()
-+{
-+   emitInsn(0x923);
-+   emitGPR (24, insn->src(0));
-+   emitGPR (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitLDSTc(int posm, int poso)
-+{
-+   int mode = 0;
-+   int order = 1;
-+
-+   switch (insn->cache) {
-+   case CACHE_CA: mode = 0; order = 1; break;
-+   case CACHE_CG: mode = 2; order = 2; break;
-+   case CACHE_CV: mode = 3; order = 2; break;
-+   default:
-+      assert(!"invalid caching mode");
-+      break;
-+   }
-+
-+   emitField(poso, 2, order);
-+   emitField(posm, 2, mode);
-+}
-+
-+void
-+CodeEmitterGV100::emitLDSTs(int pos, DataType type)
-+{
-+   int data = 0;
-+
-+   switch (typeSizeof(type)) {
-+   case  1: data = isSignedType(type) ? 1 : 0; break;
-+   case  2: data = isSignedType(type) ? 3 : 2; break;
-+   case  4: data = 4; break;
-+   case  8: data = 5; break;
-+   case 16: data = 6; break;
-+   default:
-+      assert(!"bad type");
-+      break;
-+   }
-+
-+   emitField(pos, 3, data);
-+}
-+
-+void
-+CodeEmitterGV100::emitLD()
-+{
-+   emitInsn (0x980);
-+   emitField(79, 2, 2); // .CONSTANT/./.STRONG/.MMIO
-+   emitField(77, 2, 2); // .CTA/.SM/.GPU/.SYS
-+   emitLDSTs(73, insn->dType);
-+   emitField(72, 1, insn->src(0).getIndirect(0)->getSize() == 8);
-+   emitADDR (24, 32, 32, 0, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitLDC()
-+{
-+   emitFormA(0x182, FA_RCR, EMPTY, __(0), EMPTY);
-+   emitField(78, 2, insn->subOp);
-+   emitLDSTs(73, insn->dType);
-+   emitGPR  (24, insn->src(0).getIndirect(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitLDL()
-+{
-+   emitInsn (0x983);
-+   emitField(84, 3, 1); // .EF/./.EL/.LU/.EU/.NA/.INVALID6/.INVALID7
-+   emitLDSTs(73, insn->dType);
-+   emitADDR (24, 40, 24, 0, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitLDS()
-+{
-+   emitInsn (0x984);
-+   emitLDSTs(73, insn->dType);
-+   emitADDR (24, 40, 24, 0, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitOUT()
-+{
-+   const int cut  = insn->op == OP_RESTART || insn->subOp;
-+   const int emit = insn->op == OP_EMIT;
-+
-+   if (insn->op != OP_FINAL)
-+      emitFormA(0x124, FA_RRR | FA_RIR, __(0), __(1), EMPTY);
-+   else
-+      emitFormA(0x124, FA_RRR | FA_RIR, __(0), EMPTY, EMPTY);
-+   emitField(78, 2, (cut << 1) | emit);
-+}
-+
-+void
-+CodeEmitterGV100::emitRED()
-+{
-+   unsigned dType;
-+
-+   switch (insn->dType) {
-+   case TYPE_U32: dType = 0; break;
-+   case TYPE_S32: dType = 1; break;
-+   case TYPE_U64: dType = 2; break;
-+   case TYPE_F32: dType = 3; break;
-+   case TYPE_B128: dType = 4; break;
-+   case TYPE_S64: dType = 5; break;
-+   default: assert(!"unexpected dType"); dType = 0; break;
-+   }
-+
-+   emitInsn (0x98e);
-+   emitField(87, 3, insn->subOp);
-+   emitField(84, 3, 1); // 0=.EF, 1=, 2=.EL, 3=.LU, 4=.EU, 5=.NA
-+   emitField(79, 2, 2); // .INVALID0/./.STRONG/.INVALID3
-+   emitField(77, 2, 2); // .CTA/.SM/.GPU/.SYS
-+   emitField(73, 3, dType);
-+   emitField(72, 1, insn->src(0).getIndirect(0)->getSize() == 8);
-+   emitGPR  (32, insn->src(1));
-+   emitADDR (24, 40, 24, 0, insn->src(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitST()
-+{
-+   emitInsn (0x385);
-+   emitField(79, 2, 2); // .INVALID0/./.STRONG/.MMIO
-+   emitField(77, 2, 2); // .CTA/.SM/.GPU/.SYS
-+   emitLDSTs(73, insn->dType);
-+   emitField(72, 1, insn->src(0).getIndirect(0)->getSize() == 8);
-+   emitGPR  (64, insn->src(1));
-+   emitADDR (24, 32, 32, 0, insn->src(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitSTL()
-+{
-+   emitInsn (0x387);
-+   emitField(84, 3, 1); // .EF/./.EL/.LU/.EU/.NA/.INVALID6/.INVALID7
-+   emitLDSTs(73, insn->dType);
-+   emitADDR (24, 40, 24, 0, insn->src(0));
-+   emitGPR  (32, insn->src(1));
-+}
-+
-+void
-+CodeEmitterGV100::emitSTS()
-+{
-+   emitInsn (0x388);
-+   emitLDSTs(73, insn->dType);
-+   emitADDR (24, 40, 24, 0, insn->src(0));
-+   emitGPR  (32, insn->src(1));
-+}
-+
-+/*******************************************************************************
-+ * texture
-+ ******************************************************************************/
-+
-+void
-+CodeEmitterGV100::emitTEXs(int pos)
-+{
-+   int src1 = insn->predSrc == 1 ? 2 : 1;
-+   if (insn->srcExists(src1))
-+      emitGPR(pos, insn->src(src1));
-+   else
-+      emitGPR(pos);
-+}
-+
-+void
-+CodeEmitterGV100::emitTEX()
-+{
-+   const TexInstruction *insn = this->insn->asTex();
-+   int lodm = 0;
-+
-+   if (!insn->tex.levelZero) {
-+      switch (insn->op) {
-+      case OP_TEX: lodm = 0; break;
-+      case OP_TXB: lodm = 2; break;
-+      case OP_TXL: lodm = 3; break;
-+      default:
-+         assert(!"invalid tex op");
-+         break;
-+      }
-+   } else {
-+      lodm = 1;
-+   }
-+
-+   if (insn->tex.rIndirectSrc < 0) {
-+      emitInsn (0xb60);
-+      emitField(54, 5, prog->driver->io.auxCBSlot);
-+      emitField(40, 14, insn->tex.r);
-+   } else {
-+      emitInsn (0x361);
-+      emitField(59, 1, 1); // .B
-+   }
-+   emitField(90, 1, insn->tex.liveOnly); // .NODEP
-+   emitField(87, 3, lodm);
-+   emitField(84, 3, 1); // 0=.EF, 1=, 2=.EL, 3=.LU, 4=.EU, 5=.NA
-+   emitField(78, 1, insn->tex.target.isShadow()); // .DC
-+   emitField(77, 1, insn->tex.derivAll); // .NDV
-+   emitField(76, 1, insn->tex.useOffsets == 1); // .AOFFI
-+   emitPRED (81);
-+   emitGPR  (64, insn->def(1));
-+   emitGPR  (16, insn->def(0));
-+   emitGPR  (24, insn->src(0));
-+   emitTEXs (32);
-+   emitField(63, 1, insn->tex.target.isArray());
-+   emitField(61, 2, insn->tex.target.isCube() ? 3 :
-+                    insn->tex.target.getDim() - 1);
-+   emitField(72, 4, insn->tex.mask);
-+}
-+
-+void
-+CodeEmitterGV100::emitTLD()
-+{
-+   const TexInstruction *insn = this->insn->asTex();
-+
-+   if (insn->tex.rIndirectSrc < 0) {
-+      emitInsn (0xb66);
-+      emitField(54, 5, prog->driver->io.auxCBSlot);
-+      emitField(40, 14, insn->tex.r);
-+   } else {
-+      emitInsn (0x367);
-+      emitField(59, 1, 1); // .B
-+   }
-+   emitField(90, 1, insn->tex.liveOnly);
-+   emitField(87, 3, insn->tex.levelZero ? 1 /* .LZ */ : 3 /* .LL */);
-+   emitPRED (81);
-+   emitField(78, 1, insn->tex.target.isMS());
-+   emitField(76, 1, insn->tex.useOffsets == 1);
-+   emitField(72, 4, insn->tex.mask);
-+   emitGPR  (64, insn->def(1));
-+   emitField(63, 1, insn->tex.target.isArray());
-+   emitField(61, 2, insn->tex.target.isCube() ? 3 :
-+                    insn->tex.target.getDim() - 1);
-+   emitTEXs (32);
-+   emitGPR  (24, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitTLD4()
-+{
-+   const TexInstruction *insn = this->insn->asTex();
-+
-+   int offsets = 0;
-+   switch (insn->tex.useOffsets) {
-+   case 4: offsets = 2; break;
-+   case 1: offsets = 1; break;
-+   case 0: offsets = 0; break;
-+   default: assert(!"invalid offsets count"); break;
-+   }
-+
-+   if (insn->tex.rIndirectSrc < 0) {
-+      emitInsn (0xb63);
-+      emitField(54, 5, prog->driver->io.auxCBSlot);
-+      emitField(40, 14, insn->tex.r);
-+   } else {
-+      emitInsn (0x364);
-+      emitField(59, 1, 1); // .B
-+   }
-+   emitField(90, 1, insn->tex.liveOnly);
-+   emitField(87, 2, insn->tex.gatherComp);
-+   emitField(84, 1, 1); // !.EF
-+   emitPRED (81);
-+   emitField(78, 1, insn->tex.target.isShadow());
-+   emitField(76, 2, offsets);
-+   emitField(72, 4, insn->tex.mask);
-+   emitGPR  (64, insn->def(1));
-+   emitField(63, 1, insn->tex.target.isArray());
-+   emitField(61, 2, insn->tex.target.isCube() ? 3 :
-+                    insn->tex.target.getDim() - 1);
-+   emitTEXs (32);
-+   emitGPR  (24, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitTMML()
-+{
-+   const TexInstruction *insn = this->insn->asTex();
-+
-+   if (insn->tex.rIndirectSrc < 0) {
-+      emitInsn (0xb69);
-+      emitField(54, 5, prog->driver->io.auxCBSlot);
-+      emitField(40, 14, insn->tex.r);
-+   } else {
-+      emitInsn (0x36a);
-+      emitField(59, 1, 1); // .B
-+   }
-+   emitField(90, 1, insn->tex.liveOnly);
-+   emitField(77, 1, insn->tex.derivAll);
-+   emitField(72, 4, insn->tex.mask);
-+   emitGPR  (64, insn->def(1));
-+   emitField(63, 1, insn->tex.target.isArray());
-+   emitField(61, 2, insn->tex.target.isCube() ? 3 :
-+                    insn->tex.target.getDim() - 1);
-+   emitTEXs (32);
-+   emitGPR  (24, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitTXD()
-+{
-+   const TexInstruction *insn = this->insn->asTex();
-+
-+   if (insn->tex.rIndirectSrc < 0) {
-+      emitInsn (0xb6c);
-+      emitField(54, 5, prog->driver->io.auxCBSlot);
-+      emitField(40, 14, insn->tex.r);
-+   } else {
-+      emitInsn (0x36d);
-+      emitField(59, 1, 1); // .B
-+   }
-+   emitField(90, 1, insn->tex.liveOnly);
-+   emitPRED (81);
-+   emitField(76, 1, insn->tex.useOffsets == 1);
-+   emitField(72, 4, insn->tex.mask);
-+   emitGPR  (64, insn->def(1));
-+   emitField(63, 1, insn->tex.target.isArray());
-+   emitField(61, 2, insn->tex.target.isCube() ? 3 :
-+                    insn->tex.target.getDim() - 1);
-+   emitTEXs (32);
-+   emitGPR  (24, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitTXQ()
-+{
-+   const TexInstruction *insn = this->insn->asTex();
-+   int type = 0;
-+
-+   switch (insn->tex.query) {
-+   case TXQ_DIMS           : type = 0x00; break;
-+   case TXQ_TYPE           : type = 0x01; break;
-+   case TXQ_SAMPLE_POSITION: type = 0x02; break;
-+   default:
-+      assert(!"invalid txq query");
-+      break;
-+   }
-+
-+   if (insn->tex.rIndirectSrc < 0) {
-+      emitInsn (0xb6f);
-+      emitField(54, 5, prog->driver->io.auxCBSlot);
-+      emitField(40, 14, insn->tex.r);
-+   } else {
-+      emitInsn (0x370);
-+      emitField(59, 1, 1); // .B
-+   }
-+   emitField(90, 1, insn->tex.liveOnly);
-+   emitField(72, 4, insn->tex.mask);
-+   emitGPR  (64, insn->def(1));
-+   emitField(62, 2, type);
-+   emitGPR  (24, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+/*******************************************************************************
-+ * surface
-+ ******************************************************************************/
-+
-+void
-+CodeEmitterGV100::emitSUHandle(const int s)
-+{
-+   const TexInstruction *insn = this->insn->asTex();
-+
-+   assert(insn->op >= OP_SULDB && insn->op <= OP_SUREDP);
-+
-+   if (insn->src(s).getFile() == FILE_GPR) {
-+      emitGPR(64, insn->src(s));
-+   } else {
-+      assert(0);
-+      //XXX: not done
-+      ImmediateValue *imm = insn->getSrc(s)->asImm();
-+      assert(imm);
-+      emitField(0x33, 1, 1);
-+      emitField(0x24, 13, imm->reg.data.u32);
-+   }
-+}
-+
-+void
-+CodeEmitterGV100::emitSUTarget()
-+{
-+   const TexInstruction *insn = this->insn->asTex();
-+   int target = 0;
-+
-+   assert(insn->op >= OP_SULDB && insn->op <= OP_SUREDP);
-+
-+   if (insn->tex.target == TEX_TARGET_BUFFER) {
-+      target = 1;
-+   } else if (insn->tex.target == TEX_TARGET_1D_ARRAY) {
-+      target = 2;
-+   } else if (insn->tex.target == TEX_TARGET_2D ||
-+              insn->tex.target == TEX_TARGET_RECT) {
-+      target = 3;
-+   } else if (insn->tex.target == TEX_TARGET_2D_ARRAY ||
-+              insn->tex.target == TEX_TARGET_CUBE ||
-+              insn->tex.target == TEX_TARGET_CUBE_ARRAY) {
-+      target = 4;
-+   } else if (insn->tex.target == TEX_TARGET_3D) {
-+      target = 5;
-+   } else {
-+      assert(insn->tex.target == TEX_TARGET_1D);
-+   }
-+   emitField(61, 3, target);
-+}
-+
-+void
-+CodeEmitterGV100::emitSUATOM()
-+{
-+   const TexInstruction *insn = this->insn->asTex();
-+   uint8_t type = 0, subOp;
-+
-+   if (insn->subOp == NV50_IR_SUBOP_ATOM_CAS)
-+      emitInsn(0x396);   // SUATOM.D.CAS
-+   else
-+      emitInsn(0x394);   // SUATOM.D
-+
-+   emitSUTarget();
-+
-+   // destination type
-+   switch (insn->dType) {
-+   case TYPE_S32: type = 1; break;
-+   case TYPE_U64: type = 2; break;
-+   case TYPE_F32: type = 3; break;
-+   case TYPE_S64: type = 5; break;
-+   default:
-+      assert(insn->dType == TYPE_U32);
-+      break;
-+   }
-+
-+   // atomic operation
-+   if (insn->subOp == NV50_IR_SUBOP_ATOM_CAS) {
-+      subOp = 0;
-+   } else if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
-+      subOp = 8;
-+   } else {
-+      subOp = insn->subOp;
-+   }
-+
-+   emitField(87, 4, subOp);
-+   emitPRED (81);
-+   emitField(79, 2, 1);
-+   emitField(73, 3, type);
-+   emitField(72, 1, 0); // .BA
-+   emitGPR  (32, insn->src(1));
-+   emitGPR  (24, insn->src(0));
-+   emitGPR  (16, insn->def(0));
-+
-+   emitSUHandle(2);
-+}
-+
-+void
-+CodeEmitterGV100::emitSULD()
-+{
-+   const TexInstruction *insn = this->insn->asTex();
-+   int type = 0;
-+
-+   if (insn->op == OP_SULDB) {
-+      emitInsn(0x99a);
-+      emitSUTarget();
-+
-+      switch (insn->dType) {
-+      case TYPE_U8:   type = 0; break;
-+      case TYPE_S8:   type = 1; break;
-+      case TYPE_U16:  type = 2; break;
-+      case TYPE_S16:  type = 3; break;
-+      case TYPE_U32:  type = 4; break;
-+      case TYPE_U64:  type = 5; break;
-+      case TYPE_B128: type = 6; break;
-+      default:
-+         assert(0);
-+         break;
-+      }
-+      emitField(73, 3, type);
-+   } else {
-+      emitInsn(0x998);
-+      emitSUTarget();
-+      emitField(72, 4, 0xf); // rgba
-+   }
-+
-+   emitPRED (81);
-+   emitLDSTc(77, 79);
-+
-+   emitGPR  (16, insn->def(0));
-+   emitGPR  (24, insn->src(0));
-+
-+   emitSUHandle(1);
-+}
-+
-+void
-+CodeEmitterGV100::emitSUST()
-+{
-+   const TexInstruction *insn = this->insn->asTex();
-+
-+   emitInsn(0x99c); // SUST.P
-+#if 0
-+   if (insn->op == OP_SUSTB)
-+      emitField(0x34, 1, 1);
-+#endif
-+   emitSUTarget();
-+
-+   emitLDSTc(77, 79);
-+   emitField(72, 4, 0xf); // rgba
-+   emitGPR(32, insn->src(1));
-+   emitGPR(24, insn->src(0));
-+   emitSUHandle(2);
-+}
-+
-+/*******************************************************************************
-+ * misc
-+ ******************************************************************************/
-+
-+void
-+CodeEmitterGV100::emitAL2P()
-+{
-+   emitInsn (0x920);
-+   emitO    (79);
-+   emitField(74, 2, (insn->getDef(0)->reg.size / 4) - 1);
-+   emitField(40, 11, insn->src(0).get()->reg.data.offset);
-+   emitGPR  (24, insn->src(0).getIndirect(0));
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitBAR()
-+{
-+   uint8_t subop, redop = 0x00;
-+
-+   // 80
-+   //    01: DEFER_BLOCKING
-+   // 78:77
-+   //    00: SYNC
-+   //    01: ARV
-+   //    02: RED
-+   //    03: SCAN
-+   // 75:74
-+   //    00: RED.POPC
-+   //    01: RED.AND
-+   //    02: RED.OR
-+
-+   switch (insn->subOp) {
-+   case NV50_IR_SUBOP_BAR_RED_POPC: subop = 0x02; redop = 0x00; break;
-+   case NV50_IR_SUBOP_BAR_RED_AND : subop = 0x02; redop = 0x01; break;
-+   case NV50_IR_SUBOP_BAR_RED_OR  : subop = 0x02; redop = 0x02; break;
-+   case NV50_IR_SUBOP_BAR_ARRIVE  : subop = 0x01; break;
-+   default:
-+      subop = 0x00;
-+      assert(insn->subOp == NV50_IR_SUBOP_BAR_SYNC);
-+      break;
-+   }
-+
-+   if (insn->src(0).getFile() == FILE_GPR) {
-+      emitInsn ((1 << 9) | 0x11d);
-+      emitGPR  (32, insn->src(0)); //XXX: nvdisasm shows src0==src1
-+   } else {
-+      ImmediateValue *imm = insn->getSrc(0)->asImm();
-+      assert(imm);
-+      if (insn->src(1).getFile() == FILE_GPR) {
-+         emitInsn ((4 << 9) | 0x11d);
-+         emitGPR  (32, insn->src(1));
-+      } else {
-+         emitInsn ((5 << 9) | 0x11d);
-+      }
-+      emitField(54, 4, imm->reg.data.u32);
-+   }
-+
-+   emitField(77, 2, subop);
-+   emitField(74, 2, redop);
-+
-+   if (insn->srcExists(2) && (insn->predSrc != 2)) {
-+      emitField(90, 1, insn->src(2).mod == Modifier(NV50_IR_MOD_NOT));
-+      emitPRED (87, insn->src(2));
-+   } else {
-+      emitField(87, 3, 7);
-+   }
-+}
-+
-+void
-+CodeEmitterGV100::emitCCTL()
-+{
-+   if (insn->src(0).getFile() == FILE_MEMORY_GLOBAL)
-+      emitInsn(0x98f);
-+   else
-+      emitInsn(0x990);
-+   emitField(87, 4, insn->subOp);
-+   emitField(72, 1, insn->src(0).getIndirect(0)->getSize() == 8);
-+   emitADDR (24, 32, 32, 0, insn->src(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitMEMBAR()
-+{
-+   emitInsn (0x992);
-+   switch (NV50_IR_SUBOP_MEMBAR_SCOPE(insn->subOp)) {
-+   case NV50_IR_SUBOP_MEMBAR_CTA: emitField(76, 3, 0); break;
-+   case NV50_IR_SUBOP_MEMBAR_GL : emitField(76, 3, 2); break;
-+   case NV50_IR_SUBOP_MEMBAR_SYS: emitField(76, 3, 3); break;
-+   default:
-+      assert(!"invalid scope");
-+      break;
-+   }
-+}
-+
-+void
-+CodeEmitterGV100::emitPIXLD()
-+{
-+   emitInsn (0x925);
-+   switch (insn->subOp) {
-+   case NV50_IR_SUBOP_PIXLD_COVMASK : emitField(78, 3, 1); break; // .COVMASK
-+   case NV50_IR_SUBOP_PIXLD_SAMPLEID: emitField(78, 3, 3); break; // .MY_INDEX
-+   default:
-+      assert(0);
-+      break;
-+   }
-+   emitPRED (71);
-+   emitGPR  (16, insn->def(0));
-+}
-+
-+void
-+CodeEmitterGV100::emitPLOP3_LUT()
-+{
-+   uint8_t op[2] = {};
-+
-+   switch (insn->op) {
-+   case OP_AND: op[0] = 0xf0 & 0xcc; break;
-+   case OP_OR : op[0] = 0xf0 | 0xcc; break;
-+   case OP_XOR: op[0] = 0xf0 ^ 0xcc; break;
-+   default:
-+      assert(!"invalid PLOP3");
-+      break;
-+   }
-+
-+   emitInsn(0x81c);
-+   emitNOT (90, insn->src(0));
-+   emitPRED(87, insn->src(0));
-+   emitPRED(84); // def(1)
-+   emitPRED(81, insn->def(0));
-+   emitNOT (80, insn->src(1));
-+   emitPRED(77, insn->src(1));
-+   emitField(72, 5, op[0] >> 3);
-+   emitNOT (71); // src(2)
-+   emitPRED(68); // src(2)
-+   emitField(64, 3, op[0] & 7);
-+   emitField(16, 8, op[1]);
-+}
-+
-+void
-+CodeEmitterGV100::emitVOTE()
-+{
-+   const ImmediateValue *imm;
-+   uint32_t u32;
-+
-+   int r = -1, p = -1;
-+   for (int i = 0; insn->defExists(i); i++) {
-+      if (insn->def(i).getFile() == FILE_GPR)
-+         r = i;
-+      else if (insn->def(i).getFile() == FILE_PREDICATE)
-+         p = i;
-+   }
-+
-+   emitInsn (0x806);
-+   emitField(72, 2, insn->subOp);
-+   if (r >= 0)
-+      emitGPR  (16, insn->def(r));
-+   else
-+      emitGPR  (16);
-+   if (p >= 0)
-+      emitPRED (81, insn->def(p));
-+   else
-+      emitPRED (81);
-+
-+   switch (insn->src(0).getFile()) {
-+   case FILE_PREDICATE:
-+      emitField(90, 1, insn->src(0).mod == Modifier(NV50_IR_MOD_NOT));
-+      emitPRED (87, insn->src(0));
-+      break;
-+   case FILE_IMMEDIATE:
-+      imm = insn->getSrc(0)->asImm();
-+      assert(imm);
-+      u32 = imm->reg.data.u32;
-+      assert(u32 == 0 || u32 == 1);
-+      emitField(90, 1, u32 == 0);
-+      emitPRED (87);
-+      break;
-+   default:
-+      assert(!"Unhandled src");
-+      break;
-+   }
-+}
-+
-+bool
-+CodeEmitterGV100::emitInstruction(Instruction *i)
-+{
-+   insn = i;
-+
-+   switch (insn->op) {
-+   case OP_ABS:
-+      assert(!isFloatType(insn->dType));
-+      emitIABS();
-+      break;
-+   case OP_ADD:
-+      if (isFloatType(insn->dType)) {
-+         if (insn->dType == TYPE_F32)
-+            emitFADD();
-+         else
-+            emitDADD();
-+      } else {
-+         emitIADD3();
-+      }
-+      break;
-+   case OP_AFETCH:
-+      emitAL2P();
-+      break;
-+   case OP_AND:
-+   case OP_OR:
-+   case OP_XOR:
-+      if (insn->def(0).getFile() == FILE_PREDICATE) {
-+         emitPLOP3_LUT();
-+      } else {
-+         assert(!"invalid logop");
-+         emitNOP();
-+      }
-+      break;
-+   case OP_ATOM:
-+      if (insn->src(0).getFile() == FILE_MEMORY_SHARED)
-+         emitATOMS();
-+      else
-+         if (!insn->defExists(0) && insn->subOp < NV50_IR_SUBOP_ATOM_CAS)
-+            emitRED();
-+         else
-+            emitATOM();
-+      break;
-+   case OP_BAR:
-+      emitBAR();
-+      break;
-+   case OP_BFIND:
-+      emitFLO();
-+      break;
-+   case OP_BMSK:
-+      emitBMSK();
-+      break;
-+   case OP_BREV:
-+      emitBREV();
-+      break;
-+   case OP_BRA:
-+   case OP_JOIN: //XXX
-+      emitBRA();
-+      break;
-+   case OP_CCTL:
-+      emitCCTL();
-+      break;
-+   case OP_CEIL:
-+   case OP_CVT:
-+   case OP_FLOOR:
-+   case OP_TRUNC:
-+      if (insn->op == OP_CVT && (insn->def(0).getFile() == FILE_PREDICATE ||
-+                                 insn->src(0).getFile() == FILE_PREDICATE)) {
-+         emitMOV();
-+      } else if (isFloatType(insn->dType)) {
-+         if (isFloatType(insn->sType)) {
-+            if (insn->sType == insn->dType)
-+               emitFRND();
-+            else
-+               emitF2F();
-+         } else {
-+            emitI2F();
-+         }
-+      } else {
-+         if (isFloatType(insn->sType)) {
-+            emitF2I();
-+         } else {
-+            assert(!"I2I");
-+            emitNOP();
-+         }
-+      }
-+      break;
-+   case OP_COS:
-+   case OP_EX2:
-+   case OP_LG2:
-+   case OP_RCP:
-+   case OP_RSQ:
-+   case OP_SIN:
-+   case OP_SQRT:
-+      emitMUFU();
-+      break;
-+   case OP_DISCARD:
-+      emitKILL();
-+      break;
-+   case OP_EMIT:
-+   case OP_FINAL:
-+   case OP_RESTART:
-+      emitOUT();
-+      break;
-+   case OP_EXIT:
-+      emitEXIT();
-+      break;
-+   case OP_EXPORT:
-+      emitAST();
-+      break;
-+   case OP_FMA:
-+   case OP_MAD:
-+      if (isFloatType(insn->dType)) {
-+         if (insn->dType == TYPE_F32)
-+            emitFFMA();
-+         else
-+            emitDFMA();
-+      } else {
-+         if (typeSizeof(insn->dType) != 8)
-+            emitIMAD();
-+         else
-+            emitIMAD_WIDE();
-+      }
-+      break;
-+   case OP_JOINAT: //XXX
-+      emitNOP();
-+      break;
-+   case OP_LINTERP:
-+      emitIPA();
-+      break;
-+   case OP_LOAD:
-+      switch (insn->src(0).getFile()) {
-+      case FILE_MEMORY_CONST : emitLDC(); break;
-+      case FILE_MEMORY_LOCAL : emitLDL(); break;
-+      case FILE_MEMORY_SHARED: emitLDS(); break;
-+      case FILE_MEMORY_GLOBAL: emitLD(); break;
-+      default:
-+         assert(!"invalid load");
-+         emitNOP();
-+         break;
-+      }
-+      break;
-+   case OP_LOP3_LUT:
-+      emitLOP3_LUT();
-+      break;
-+   case OP_MAX:
-+   case OP_MIN:
-+      if (isFloatType(insn->dType)) {
-+         if (insn->dType == TYPE_F32) {
-+            emitFMNMX();
-+         } else {
-+            assert(!"invalid FMNMX");
-+            emitNOP();
-+         }
-+      } else {
-+         assert(!"invalid MNMX");
-+         emitNOP();
-+      }
-+      break;
-+   case OP_MEMBAR:
-+      emitMEMBAR();
-+      break;
-+   case OP_MOV:
-+      emitMOV();
-+      break;
-+   case OP_MUL:
-+      if (isFloatType(insn->dType)) {
-+         if (insn->dType == TYPE_F32)
-+            emitFMUL();
-+         else
-+            emitDMUL();
-+      } else {
-+         assert(!"invalid IMUL");
-+         emitNOP();
-+      }
-+      break;
-+   case OP_PERMT:
-+      emitPRMT();
-+      break;
-+   case OP_PFETCH:
-+      emitISBERD();
-+      break;
-+   case OP_PIXLD:
-+      emitPIXLD();
-+      break;
-+   case OP_POPCNT:
-+      emitPOPC();
-+      break;
-+   case OP_QUADOP:
-+      emitFSWZADD();
-+      break;
-+   case OP_RDSV:
-+      if (targ->isCS2RSV(insn->getSrc(0)->reg.data.sv.sv))
-+         emitCS2R();
-+      else
-+         emitS2R();
-+      break;
-+   case OP_SELP:
-+      emitSEL();
-+      break;
-+   case OP_SET:
-+   case OP_SET_AND:
-+   case OP_SET_OR:
-+   case OP_SET_XOR:
-+      if (insn->def(0).getFile() != FILE_PREDICATE) {
-+         if (isFloatType(insn->dType)) {
-+            if (insn->dType == TYPE_F32) {
-+               emitFSET_BF();
-+            } else {
-+               assert(!"invalid FSET");
-+               emitNOP();
-+            }
-+         } else {
-+            assert(!"invalid SET");
-+            emitNOP();
-+         }
-+      } else {
-+         if (isFloatType(insn->sType))
-+            if (insn->sType == TYPE_F64)
-+               emitDSETP();
-+            else
-+               emitFSETP();
-+         else
-+            emitISETP();
-+      }
-+      break;
-+   case OP_SGXT:
-+      emitSGXT();
-+      break;
-+   case OP_SHF:
-+      emitSHF();
-+      break;
-+   case OP_SHFL:
-+      emitSHFL();
-+      break;
-+   case OP_SHLADD:
-+      emitLEA();
-+      break;
-+   case OP_STORE:
-+      switch (insn->src(0).getFile()) {
-+      case FILE_MEMORY_LOCAL : emitSTL(); break;
-+      case FILE_MEMORY_SHARED: emitSTS(); break;
-+      case FILE_MEMORY_GLOBAL: emitST(); break;
-+      default:
-+         assert(!"invalid store");
-+         emitNOP();
-+         break;
-+      }
-+      break;
-+   case OP_SULDB:
-+   case OP_SULDP:
-+      emitSULD();
-+      break;
-+   case OP_SUREDB:
-+   case OP_SUREDP:
-+      emitSUATOM();
-+      break;
-+   case OP_SUSTB:
-+   case OP_SUSTP:
-+      emitSUST();
-+      break;
-+   case OP_TEX:
-+   case OP_TXB:
-+   case OP_TXL:
-+      emitTEX();
-+      break;
-+   case OP_TXD:
-+      emitTXD();
-+      break;
-+   case OP_TXF:
-+      emitTLD();
-+      break;
-+   case OP_TXG:
-+      emitTLD4();
-+      break;
-+   case OP_TXLQ:
-+      emitTMML();
-+      break;
-+   case OP_TXQ:
-+      emitTXQ();
-+      break;
-+   case OP_VFETCH:
-+      emitALD();
-+      break;
-+   case OP_VOTE:
-+      emitVOTE();
-+      break;
-+   case OP_WARPSYNC:
-+      emitWARPSYNC();
-+      break;
-+   default:
-+      assert(!"invalid opcode");
-+      emitNOP();
-+      break;
-+   }
-+
-+   code[3] &= 0x000001ff;
-+   code[3] |= insn->sched << 9;
-+   code += 4;
-+   codeSize += 16;
-+   return true;
-+}
-+
-+void
-+CodeEmitterGV100::prepareEmission(BasicBlock *bb)
-+{
-+   Function *func = bb->getFunction();
-+   Instruction *i;
-+   int j;
-+
-+   for (j = func->bbCount - 1; j >= 0 && !func->bbArray[j]->binSize; --j);
-+
-+   for (; j >= 0; --j) {
-+      BasicBlock *in = func->bbArray[j];
-+      Instruction *exit = in->getExit();
-+
-+      if (exit && exit->op == OP_BRA && exit->asFlow()->target.bb == bb) {
-+         in->binSize -= 16;
-+         func->binSize -= 16;
-+
-+         for (++j; j < func->bbCount; ++j)
-+            func->bbArray[j]->binPos -= 16;
-+
-+         in->remove(exit);
-+      }
-+      bb->binPos = in->binPos + in->binSize;
-+      if (in->binSize) // no more no-op branches to bb
-+         break;
-+   }
-+   func->bbArray[func->bbCount++] = bb;
-+
-+   if (!bb->getExit())
-+      return;
-+
-+   for (i = bb->getEntry(); i; i = i->next) {
-+      i->encSize = getMinEncodingSize(i);
-+      bb->binSize += i->encSize;
-+   }
-+
-+   assert(!bb->getEntry() || (bb->getExit() && bb->getExit()->encSize == 16));
-+
-+   func->binSize += bb->binSize;
-+}
-+
-+void
-+CodeEmitterGV100::prepareEmission(Function *func)
-+{
-+   SchedDataCalculatorGM107 sched(targ);
-+   CodeEmitter::prepareEmission(func);
-+   sched.run(func, true, true);
-+}
-+
-+void
-+CodeEmitterGV100::prepareEmission(Program *prog)
-+{
-+   for (ArrayList::Iterator fi = prog->allFuncs.iterator();
-+        !fi.end(); fi.next()) {
-+      Function *func = reinterpret_cast<Function *>(fi.get());
-+      func->binPos = prog->binSize;
-+      prepareEmission(func);
-+      prog->binSize += func->binSize;
-+   }
-+
-+   this->prog = prog;
-+}
-+
-+CodeEmitterGV100::CodeEmitterGV100(TargetGV100 *target)
-+   : CodeEmitter(target), targ(target)
-+{
-+   code = NULL;
-+   codeSize = codeSizeLimit = 0;
-+   relocInfo = NULL;
-+}
-+};
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.h
-new file mode 100644
-index 00000000000..15ab717e460
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gv100.h
-@@ -0,0 +1,403 @@
-+/*
-+ * Copyright 2020 Red Hat Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+#ifndef __NV50_IR_EMIT_GV100_H__
-+#define __NV50_IR_EMIT_GV100_H__
-+#include "codegen/nv50_ir_target_gv100.h"
-+
-+namespace nv50_ir {
-+
-+class CodeEmitterGV100 : public CodeEmitter {
-+public:
-+   CodeEmitterGV100(TargetGV100 *target);
-+
-+   virtual bool emitInstruction(Instruction *);
-+   virtual uint32_t getMinEncodingSize(const Instruction *) const { return 16; }
-+
-+private:
-+   const Program *prog;
-+   const TargetGV100 *targ;
-+   const Instruction *insn;
-+
-+   virtual void prepareEmission(Program *);
-+   virtual void prepareEmission(Function *);
-+   virtual void prepareEmission(BasicBlock *);
-+
-+   inline void emitInsn(uint32_t op) {
-+      code[0] = op;
-+      code[1] = 0;
-+      code[2] = 0;
-+      code[3] = 0;
-+      if (insn->predSrc >= 0) {
-+         emitField(12, 3, insn->getSrc(insn->predSrc)->rep()->reg.data.id);
-+         emitField(15, 1, insn->cc == CC_NOT_P);
-+      } else {
-+         emitField(12, 3, 7);
-+      }
-+   };
-+
-+   inline void emitField(int b, int s, uint64_t v) {
-+      if (b >= 0) {
-+         uint64_t m = ~0ULL >> (64 - s);
-+         uint64_t d = v & m;
-+         assert(!(v & ~m) || (v & ~m) == ~m);
-+         if (b < 64 && b + s > 64) {
-+            *(uint64_t *)&code[0] |= d << b;
-+            *(uint64_t *)&code[2] |= d >> (64 - b);
-+         } else {
-+            *(uint64_t *)&code[(b/64*2)] |= d << (b & 0x3f);
-+         }
-+      }
-+   };
-+
-+   inline void emitABS(int pos, int src, bool supported)
-+   {
-+      if (insn->src(src).mod.abs()) {
-+         assert(supported);
-+         emitField(pos, 1, 1);
-+      }
-+   }
-+
-+   inline void emitABS(int pos, int src)
-+   {
-+      emitABS(pos, src, true);
-+   }
-+
-+   inline void emitNEG(int pos, int src, bool supported) {
-+      if (insn->src(src).mod.neg()) {
-+         assert(supported);
-+         emitField(pos, 1, 1);
-+      }
-+   }
-+
-+   inline void emitNEG(int pos, int src) {
-+      emitNEG(pos, src, true);
-+   }
-+
-+   inline void emitNOT(int pos) {
-+      emitField(pos, 1, 0);
-+   };
-+
-+   inline void emitNOT(int pos, const ValueRef &ref) {
-+      emitField(pos, 1, !!(ref.mod & Modifier(NV50_IR_MOD_NOT)));
-+   }
-+
-+   inline void emitSAT(int pos) {
-+      emitField(pos, 1, insn->saturate);
-+   }
-+
-+   inline void emitRND(int rmp, RoundMode rnd, int rip) {
-+      int rm = 0, ri = 0;
-+      switch (rnd) {
-+      case ROUND_NI: ri = 1;
-+      case ROUND_N : rm = 0; break;
-+      case ROUND_MI: ri = 1;
-+      case ROUND_M : rm = 1; break;
-+      case ROUND_PI: ri = 1;
-+      case ROUND_P : rm = 2; break;
-+      case ROUND_ZI: ri = 1;
-+      case ROUND_Z : rm = 3; break;
-+      default:
-+         assert(!"invalid round mode");
-+         break;
-+      }
-+      emitField(rip, 1, ri);
-+      emitField(rmp, 2, rm);
-+   }
-+
-+   inline void emitRND(int pos) {
-+      emitRND(pos, insn->rnd, -1);
-+   }
-+
-+   inline void emitFMZ(int pos, int len) {
-+      emitField(pos, len, insn->dnz << 1 | insn->ftz);
-+   }
-+
-+   inline void emitPDIV(int pos) {
-+      emitField(pos, 3, insn->postFactor + 4);
-+   }
-+
-+   inline void emitO(int pos) {
-+      emitField(pos, 1, insn->getSrc(0)->reg.file == FILE_SHADER_OUTPUT);
-+   }
-+
-+   inline void emitP(int pos) {
-+      emitField(pos, 1, insn->perPatch);
-+   }
-+
-+   inline void emitCond3(int pos, CondCode code) {
-+      int data = 0;
-+
-+      switch (code) {
-+      case CC_FL : data = 0x00; break;
-+      case CC_LTU:
-+      case CC_LT : data = 0x01; break;
-+      case CC_EQU:
-+      case CC_EQ : data = 0x02; break;
-+      case CC_LEU:
-+      case CC_LE : data = 0x03; break;
-+      case CC_GTU:
-+      case CC_GT : data = 0x04; break;
-+      case CC_NEU:
-+      case CC_NE : data = 0x05; break;
-+      case CC_GEU:
-+      case CC_GE : data = 0x06; break;
-+      case CC_TR : data = 0x07; break;
-+      default:
-+         assert(!"invalid cond3");
-+         break;
-+      }
-+
-+      emitField(pos, 3, data);
-+   }
-+
-+   inline void emitCond4(int pos, CondCode code) {
-+      int data = 0;
-+
-+      switch (code) {
-+      case CC_FL: data = 0x00; break;
-+      case CC_LT: data = 0x01; break;
-+      case CC_EQ: data = 0x02; break;
-+      case CC_LE: data = 0x03; break;
-+      case CC_GT: data = 0x04; break;
-+      case CC_NE: data = 0x05; break;
-+      case CC_GE: data = 0x06; break;
-+   //   case CC_NUM: data = 0x07; break;
-+   //   case CC_NAN: data = 0x08; break;
-+      case CC_LTU: data = 0x09; break;
-+      case CC_EQU: data = 0x0a; break;
-+      case CC_LEU: data = 0x0b; break;
-+      case CC_GTU: data = 0x0c; break;
-+      case CC_NEU: data = 0x0d; break;
-+      case CC_GEU: data = 0x0e; break;
-+      case CC_TR:  data = 0x0f; break;
-+      default:
-+         assert(!"invalid cond4");
-+         break;
-+      }
-+
-+      emitField(pos, 4, data);
-+   }
-+
-+   inline void emitSYS(int pos, const Value *val) {
-+      int id = val ? val->reg.data.id : -1;
-+
-+      switch (id) {
-+      case SV_LANEID         : id = 0x00; break;
-+      case SV_VERTEX_COUNT   : id = 0x10; break;
-+      case SV_INVOCATION_ID  : id = 0x11; break;
-+      case SV_THREAD_KILL    : id = 0x13; break;
-+      case SV_INVOCATION_INFO: id = 0x1d; break;
-+      case SV_COMBINED_TID   : id = 0x20; break;
-+      case SV_TID            : id = 0x21 + val->reg.data.sv.index; break;
-+      case SV_CTAID          : id = 0x25 + val->reg.data.sv.index; break;
-+      case SV_LANEMASK_EQ    : id = 0x38; break;
-+      case SV_LANEMASK_LT    : id = 0x39; break;
-+      case SV_LANEMASK_LE    : id = 0x3a; break;
-+      case SV_LANEMASK_GT    : id = 0x3b; break;
-+      case SV_LANEMASK_GE    : id = 0x3c; break;
-+      case SV_CLOCK          : id = 0x50 + val->reg.data.sv.index; break;
-+      default:
-+         assert(!"invalid system value");
-+         id = 0;
-+         break;
-+      }
-+
-+      emitField(pos, 8, id);
-+   }
-+
-+   inline void emitSYS(int pos, const ValueRef &ref) {
-+      emitSYS(pos, ref.get() ? ref.rep() : (const Value *)NULL);
-+   }
-+
-+   inline void emitGPR(int pos, const Value *val, int off) {
-+      emitField(pos, 8, val && !val->inFile(FILE_FLAGS) ?
-+                val->reg.data.id + off: 255);
-+   }
-+
-+   inline void emitGPR(int pos, const Value *v) {
-+      emitGPR(pos, v, 0);
-+   }
-+
-+   inline void emitGPR(int pos) {
-+      emitGPR(pos, (const Value *)NULL);
-+   }
-+
-+   inline void emitGPR(int pos, const ValueRef &ref) {
-+      emitGPR(pos, ref.get() ? ref.rep() : (const Value *)NULL);
-+   }
-+
-+   inline void emitGPR(int pos, const ValueRef *ref) {
-+      emitGPR(pos, ref ? ref->rep() : (const Value *)NULL);
-+   }
-+
-+   inline void emitGPR(int pos, const ValueDef &def) {
-+      emitGPR(pos, def.get() ? def.rep() : (const Value *)NULL);
-+   }
-+
-+   inline void emitGPR(int pos, const ValueDef &def, int off) {
-+      emitGPR(pos, def.get() ? def.rep() : (const Value *)NULL, off);
-+   }
-+
-+   inline void emitPRED(int pos, const Value *val) {
-+      emitField(pos, 3, val ? val->reg.data.id : 7);
-+   };
-+
-+   inline void emitPRED(int pos) {
-+      emitPRED(pos, (const Value *)NULL);
-+   }
-+
-+   inline void emitPRED(int pos, const ValueRef &ref) {
-+      emitPRED(pos, ref.get() ? ref.rep() : (const Value *)NULL);
-+   }
-+
-+   inline void emitPRED(int pos, const ValueDef &def) {
-+      emitPRED(pos, def.get() ? def.rep() : (const Value *)NULL);
-+   }
-+
-+   inline void emitCBUF(int buf, int gpr, int off, int len, int align,
-+                        const ValueRef &ref) {
-+      const Value *v = ref.get();
-+      const Symbol *s = v->asSym();
-+
-+      assert(!(s->reg.data.offset & ((1 << align) - 1)));
-+
-+      emitField(buf,  5, v->reg.fileIndex);
-+      if (gpr >= 0)
-+         emitGPR(gpr, ref.getIndirect(0));
-+      emitField(off, 16, s->reg.data.offset);
-+   }
-+
-+   inline void emitIMMD(int pos, int len, const ValueRef &ref) {
-+      const ImmediateValue *imm = ref.get()->asImm();
-+      uint32_t val = imm->reg.data.u32;
-+
-+      if (insn->sType == TYPE_F64) {
-+         assert(!(imm->reg.data.u64 & 0x00000000ffffffffULL));
-+         val = imm->reg.data.u64 >> 32;
-+      }
-+
-+      emitField(pos, len, val);
-+   }
-+
-+   inline void emitADDR(int gpr, int off, int len, int shr,
-+                        const ValueRef &ref) {
-+      const Value *v = ref.get();
-+      assert(!(v->reg.data.offset & ((1 << shr) - 1)));
-+      if (gpr >= 0)
-+         emitGPR(gpr, ref.getIndirect(0));
-+      emitField(off, len, v->reg.data.offset >> shr);
-+   }
-+
-+   inline void emitFormA(uint16_t op, uint8_t forms, int src0, int src1, int src2);
-+   inline void emitFormA_RRR(uint16_t op, int src1, int src2);
-+   inline void emitFormA_RRI(uint16_t op, int src1, int src2);
-+   inline void emitFormA_RRC(uint16_t op, int src1, int src2);
-+   inline void emitFormA_I32(int src);
-+
-+   void emitBRA();
-+   void emitEXIT();
-+   void emitKILL();
-+   void emitNOP();
-+   void emitWARPSYNC();
-+
-+   void emitCS2R();
-+   void emitF2F();
-+   void emitF2I();
-+   void emitFRND();
-+   void emitI2F();
-+   void emitMOV();
-+   void emitPRMT();
-+   void emitS2R();
-+   void emitSEL();
-+   void emitSHFL();
-+
-+   void emitFADD();
-+   void emitFFMA();
-+   void emitFMNMX();
-+   void emitFMUL();
-+   void emitFSET_BF();
-+   void emitFSETP();
-+   void emitFSWZADD();
-+   void emitMUFU();
-+
-+   void emitDADD();
-+   void emitDFMA();
-+   void emitDMUL();
-+   void emitDSETP();
-+
-+   void emitBMSK();
-+   void emitBREV();
-+   void emitFLO();
-+   void emitIABS();
-+   void emitIADD3();
-+   void emitIMAD();
-+   void emitIMAD_WIDE();
-+   void emitISETP();
-+   void emitLEA();
-+   void emitLOP3_LUT();
-+   void emitPOPC();
-+   void emitSGXT();
-+   void emitSHF();
-+
-+   void emitALD();
-+   void emitAST();
-+   void emitATOM();
-+   void emitATOMS();
-+   void emitIPA();
-+   void emitISBERD();
-+   void emitLDSTc(int, int);
-+   void emitLDSTs(int, DataType);
-+   void emitLD();
-+   void emitLDC();
-+   void emitLDL();
-+   void emitLDS();
-+   void emitOUT();
-+   void emitRED();
-+   void emitST();
-+   void emitSTL();
-+   void emitSTS();
-+
-+   void emitTEXs(int);
-+   void emitTEX();
-+   void emitTLD();
-+   void emitTLD4();
-+   void emitTMML();
-+   void emitTXD();
-+   void emitTXQ();
-+
-+   void emitSUHandle(const int);
-+   void emitSUTarget();
-+   void emitSUATOM();
-+   void emitSULD();
-+   void emitSUST();
-+
-+   void emitAL2P();
-+   void emitBAR();
-+   void emitCCTL();
-+   void emitMEMBAR();
-+   void emitPIXLD();
-+   void emitPLOP3_LUT();
-+   void emitVOTE();
-+};
-+
-+};
-+#endif
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
-index bd78b76f384..eee9aa67256 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
-@@ -170,6 +170,7 @@ private:
-    NirArrayLMemOffsets regToLmemOffset;
-    NirBlockMap blocks;
-    unsigned int curLoopDepth;
-+   unsigned int curIfDepth;
- 
-    BasicBlock *exit;
-    Value *zero;
-@@ -188,6 +189,7 @@ Converter::Converter(Program *prog, nir_shader *nir, nv50_ir_prog_info *info)
-    : ConverterCommon(prog, info),
-      nir(nir),
-      curLoopDepth(0),
-+     curIfDepth(0),
-      clipVertexOutput(-1)
- {
-    zero = mkImm((uint32_t)0);
-@@ -571,6 +573,10 @@ Converter::getSubOp(nir_op op)
-    case nir_op_imul_high:
-    case nir_op_umul_high:
-       return NV50_IR_SUBOP_MUL_HIGH;
-+   case nir_op_ishl:
-+   case nir_op_ishr:
-+   case nir_op_ushr:
-+      return NV50_IR_SUBOP_SHIFT_WRAP;
-    default:
-       return 0;
-    }
-@@ -909,7 +915,7 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
-    uint16_t slots;
-    switch (stage) {
-    case Program::TYPE_GEOMETRY:
--      slots = type->uniform_locations();
-+      slots = type->count_attribute_slots(false);
-       if (input)
-          slots /= info.gs.vertices_in;
-       break;
-@@ -917,9 +923,9 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
-    case Program::TYPE_TESSELLATION_EVAL:
-       // remove first dimension
-       if (var->data.patch || (!input && stage == Program::TYPE_TESSELLATION_EVAL))
--         slots = type->uniform_locations();
-+         slots = type->count_attribute_slots(false);
-       else
--         slots = type->fields.array->uniform_locations();
-+         slots = type->fields.array->count_attribute_slots(false);
-       break;
-    default:
-       slots = type->count_attribute_slots(false);
-@@ -929,6 +935,24 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info,
-    return slots;
- }
- 
-+static uint8_t
-+getMaskForType(const glsl_type *type, uint8_t slot) {
-+   uint16_t comp = type->without_array()->components();
-+   comp = comp ? comp : 4;
-+
-+   if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
-+      comp *= 2;
-+      if (comp > 4) {
-+         if (slot % 2)
-+            comp -= 4;
-+         else
-+            comp = 4;
-+      }
-+   }
-+
-+   return (1 << comp) - 1;
-+}
-+
- bool Converter::assignSlots() {
-    unsigned name;
-    unsigned index;
-@@ -981,16 +1005,8 @@ bool Converter::assignSlots() {
-       const glsl_type *type = var->type;
-       int slot = var->data.location;
-       uint16_t slots = calcSlots(type, prog->getType(), nir->info, true, var);
--      uint32_t comp = type->is_array() ? type->without_array()->component_slots()
--                                       : type->component_slots();
--      uint32_t frac = var->data.location_frac;
-       uint32_t vary = var->data.driver_location;
- 
--      if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
--         if (comp > 2)
--            slots *= 2;
--      }
--
-       assert(vary + slots <= PIPE_MAX_SHADER_INPUTS);
- 
-       switch(prog->getType()) {
-@@ -1014,6 +1030,8 @@ bool Converter::assignSlots() {
-             info->numPatchConstants = MAX2(info->numPatchConstants, index + slots);
-          break;
-       case Program::TYPE_VERTEX:
-+         if (slot >= VERT_ATTRIB_GENERIC0)
-+            slot = VERT_ATTRIB_GENERIC0 + vary;
-          vert_attrib_to_tgsi_semantic((gl_vert_attrib)slot, &name, &index);
-          switch (name) {
-          case TGSI_SEMANTIC_EDGEFLAG:
-@@ -1029,17 +1047,12 @@ bool Converter::assignSlots() {
-       }
- 
-       for (uint16_t i = 0u; i < slots; ++i, ++vary) {
--         info->in[vary].id = vary;
--         info->in[vary].patch = var->data.patch;
--         info->in[vary].sn = name;
--         info->in[vary].si = index + i;
--         if (glsl_base_type_is_64bit(type->without_array()->base_type))
--            if (i & 0x1)
--               info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4);
--            else
--               info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf);
--         else
--            info->in[vary].mask |= ((1 << comp) - 1) << frac;
-+         nv50_ir_varying *v = &info->in[vary];
-+
-+         v->patch = var->data.patch;
-+         v->sn = name;
-+         v->si = index + i;
-+         v->mask |= getMaskForType(type, i) << var->data.location_frac;
-       }
-       info->numInputs = std::max<uint8_t>(info->numInputs, vary);
-    }
-@@ -1048,16 +1061,8 @@ bool Converter::assignSlots() {
-       const glsl_type *type = var->type;
-       int slot = var->data.location;
-       uint16_t slots = calcSlots(type, prog->getType(), nir->info, false, var);
--      uint32_t comp = type->is_array() ? type->without_array()->component_slots()
--                                       : type->component_slots();
--      uint32_t frac = var->data.location_frac;
-       uint32_t vary = var->data.driver_location;
- 
--      if (glsl_base_type_is_64bit(type->without_array()->base_type)) {
--         if (comp > 2)
--            slots *= 2;
--      }
--
-       assert(vary < PIPE_MAX_SHADER_OUTPUTS);
- 
-       switch(prog->getType()) {
-@@ -1067,7 +1072,11 @@ bool Converter::assignSlots() {
-          case TGSI_SEMANTIC_COLOR:
-             if (!var->data.fb_fetch_output)
-                info->prop.fp.numColourResults++;
--            info->prop.fp.separateFragData = true;
-+
-+            if (var->data.location == FRAG_RESULT_COLOR &&
-+                nir->info.outputs_written & BITFIELD64_BIT(var->data.location))
-+               info->prop.fp.separateFragData = true;
-+
-             // sometimes we get FRAG_RESULT_DATAX with data.index 0
-             // sometimes we get FRAG_RESULT_DATA0 with data.index X
-             index = index == 0 ? var->data.index : index;
-@@ -1118,20 +1127,14 @@ bool Converter::assignSlots() {
-       }
- 
-       for (uint16_t i = 0u; i < slots; ++i, ++vary) {
--         info->out[vary].id = vary;
--         info->out[vary].patch = var->data.patch;
--         info->out[vary].sn = name;
--         info->out[vary].si = index + i;
--         if (glsl_base_type_is_64bit(type->without_array()->base_type))
--            if (i & 0x1)
--               info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4);
--            else
--               info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf);
--         else
--            info->out[vary].mask |= ((1 << comp) - 1) << frac;
-+         nv50_ir_varying *v = &info->out[vary];
-+         v->patch = var->data.patch;
-+         v->sn = name;
-+         v->si = index + i;
-+         v->mask |= getMaskForType(type, i) << var->data.location_frac;
- 
-          if (nir->info.outputs_read & 1ull << slot)
--            info->out[vary].oread = 1;
-+            v->oread = 1;
-       }
-       info->numOutputs = std::max<uint8_t>(info->numOutputs, vary);
-    }
-@@ -1275,6 +1278,7 @@ Converter::parseNIR()
-    info->bin.tlsSpace = 0;
-    info->io.clipDistances = nir->info.clip_distance_array_size;
-    info->io.cullDistances = nir->info.cull_distance_array_size;
-+   info->io.layer_viewport_relative = nir->info.layer_viewport_relative;
- 
-    switch(prog->getType()) {
-    case Program::TYPE_COMPUTE:
-@@ -1291,7 +1295,7 @@ Converter::parseNIR()
-       info->prop.fp.postDepthCoverage = nir->info.fs.post_depth_coverage;
-       info->prop.fp.readsSampleLocations =
-          (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);
--      info->prop.fp.usesDiscard = nir->info.fs.uses_discard;
-+      info->prop.fp.usesDiscard = nir->info.fs.uses_discard || nir->info.fs.uses_demote;
-       info->prop.fp.usesSampleMaskIn =
-          !!(nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN);
-       break;
-@@ -1426,64 +1430,69 @@ Converter::visit(nir_block *block)
- bool
- Converter::visit(nir_if *nif)
- {
-+   curIfDepth++;
-+
-    DataType sType = getSType(nif->condition, false, false);
-    Value *src = getSrc(&nif->condition, 0);
- 
-    nir_block *lastThen = nir_if_last_then_block(nif);
-    nir_block *lastElse = nir_if_last_else_block(nif);
- 
--   assert(!lastThen->successors[1]);
--   assert(!lastElse->successors[1]);
--
-+   BasicBlock *headBB = bb;
-    BasicBlock *ifBB = convert(nir_if_first_then_block(nif));
-    BasicBlock *elseBB = convert(nir_if_first_else_block(nif));
- 
-    bb->cfg.attach(&ifBB->cfg, Graph::Edge::TREE);
-    bb->cfg.attach(&elseBB->cfg, Graph::Edge::TREE);
- 
--   // we only insert joinats, if both nodes end up at the end of the if again.
--   // the reason for this to not happens are breaks/continues/ret/... which
--   // have their own handling
--   if (lastThen->successors[0] == lastElse->successors[0])
--      bb->joinAt = mkFlow(OP_JOINAT, convert(lastThen->successors[0]),
--                          CC_ALWAYS, NULL);
--
-+   bool insertJoins = lastThen->successors[0] == lastElse->successors[0];
-    mkFlow(OP_BRA, elseBB, CC_EQ, src)->setType(sType);
- 
-    foreach_list_typed(nir_cf_node, node, node, &nif->then_list) {
-       if (!visit(node))
-          return false;
-    }
-+
-    setPosition(convert(lastThen), true);
--   if (!bb->getExit() ||
--       !bb->getExit()->asFlow() ||
--        bb->getExit()->asFlow()->op == OP_JOIN) {
-+   if (!bb->isTerminated()) {
-       BasicBlock *tailBB = convert(lastThen->successors[0]);
-       mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL);
-       bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD);
-+   } else {
-+      insertJoins = insertJoins && bb->getExit()->op == OP_BRA;
-    }
- 
-    foreach_list_typed(nir_cf_node, node, node, &nif->else_list) {
-       if (!visit(node))
-          return false;
-    }
-+
-    setPosition(convert(lastElse), true);
--   if (!bb->getExit() ||
--       !bb->getExit()->asFlow() ||
--        bb->getExit()->asFlow()->op == OP_JOIN) {
-+   if (!bb->isTerminated()) {
-       BasicBlock *tailBB = convert(lastElse->successors[0]);
-       mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL);
-       bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD);
-+   } else {
-+      insertJoins = insertJoins && bb->getExit()->op == OP_BRA;
-    }
- 
--   if (lastThen->successors[0] == lastElse->successors[0]) {
--      setPosition(convert(lastThen->successors[0]), true);
-+   /* only insert joins for the most outer if */
-+   if (--curIfDepth)
-+      insertJoins = false;
-+
-+   /* we made sure that all threads would converge at the same block */
-+   if (insertJoins) {
-+      BasicBlock *conv = convert(lastThen->successors[0]);
-+      setPosition(headBB->getExit(), false);
-+      headBB->joinAt = mkFlow(OP_JOINAT, conv, CC_ALWAYS, NULL);
-+      setPosition(conv, false);
-       mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
-    }
- 
-    return true;
- }
- 
-+// TODO: add convergency
- bool
- Converter::visit(nir_loop *loop)
- {
-@@ -1491,8 +1500,8 @@ Converter::visit(nir_loop *loop)
-    func->loopNestingBound = std::max(func->loopNestingBound, curLoopDepth);
- 
-    BasicBlock *loopBB = convert(nir_loop_first_block(loop));
--   BasicBlock *tailBB =
--      convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
-+   BasicBlock *tailBB = convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
-+
-    bb->cfg.attach(&loopBB->cfg, Graph::Edge::TREE);
- 
-    mkFlow(OP_PREBREAK, tailBB, CC_ALWAYS, NULL);
-@@ -1503,19 +1512,15 @@ Converter::visit(nir_loop *loop)
-       if (!visit(node))
-          return false;
-    }
--   Instruction *insn = bb->getExit();
--   if (bb->cfg.incidentCount() != 0) {
--      if (!insn || !insn->asFlow()) {
--         mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
--         bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
--      } else if (insn && insn->op == OP_BRA && !insn->getPredicate() &&
--                 tailBB->cfg.incidentCount() == 0) {
--         // RA doesn't like having blocks around with no incident edge,
--         // so we create a fake one to make it happy
--         bb->cfg.attach(&tailBB->cfg, Graph::Edge::TREE);
--      }
-+
-+   if (!bb->isTerminated()) {
-+      mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
-+      bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
-    }
- 
-+   if (tailBB->cfg.incidentCount() == 0)
-+      loopBB->cfg.attach(&tailBB->cfg, Graph::Edge::TREE);
-+
-    curLoopDepth -= 1;
- 
-    return true;
-@@ -1560,6 +1565,7 @@ Converter::convert(nir_intrinsic_op intr)
-       return SV_DRAWID;
-    case nir_intrinsic_load_front_face:
-       return SV_FACE;
-+   case nir_intrinsic_is_helper_invocation:
-    case nir_intrinsic_load_helper_invocation:
-       return SV_THREAD_KILL;
-    case nir_intrinsic_load_instance_id:
-@@ -1617,6 +1623,7 @@ Converter::visit(nir_intrinsic_instr *insn)
- {
-    nir_intrinsic_op op = insn->intrinsic;
-    const nir_intrinsic_info &opInfo = nir_intrinsic_infos[op];
-+   unsigned dest_components = nir_intrinsic_dest_components(insn);
- 
-    switch (op) {
-    case nir_intrinsic_load_uniform: {
-@@ -1624,7 +1631,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-       const DataType dType = getDType(insn);
-       Value *indirect;
-       uint32_t coffset = getIndirect(insn, 0, 0, indirect);
--      for (uint8_t i = 0; i < insn->num_components; ++i) {
-+      for (uint8_t i = 0; i < dest_components; ++i) {
-          loadFrom(FILE_MEMORY_CONST, 0, dType, newDefs[i], 16 * coffset, i, indirect);
-       }
-       break;
-@@ -1635,7 +1642,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-       DataType dType = getSType(insn->src[0], false, false);
-       uint32_t idx = getIndirect(insn, op == nir_intrinsic_store_output ? 1 : 2, 0, indirect);
- 
--      for (uint8_t i = 0u; i < insn->num_components; ++i) {
-+      for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
-          if (!((1u << i) & nir_intrinsic_write_mask(insn)))
-             continue;
- 
-@@ -1652,6 +1659,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-             break;
-          }
-          case Program::TYPE_GEOMETRY:
-+         case Program::TYPE_TESSELLATION_EVAL:
-          case Program::TYPE_VERTEX: {
-             if (info->io.genUserClip > 0 && idx == (uint32_t)clipVertexOutput) {
-                mkMov(clipVtx[i], src);
-@@ -1688,7 +1696,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-          srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LAYER, 0)));
-          srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_SAMPLE_INDEX, 0)));
- 
--         for (uint8_t i = 0u; i < insn->num_components; ++i) {
-+         for (uint8_t i = 0u; i < dest_components; ++i) {
-             defs.push_back(newDefs[i]);
-             mask |= 1 << i;
-          }
-@@ -1715,15 +1723,25 @@ Converter::visit(nir_intrinsic_instr *insn)
- 
-       // see load_barycentric_* handling
-       if (prog->getType() == Program::TYPE_FRAGMENT) {
--         mode = translateInterpMode(&vary, nvirOp);
-          if (op == nir_intrinsic_load_interpolated_input) {
-             ImmediateValue immMode;
-             if (getSrc(&insn->src[0], 1)->getUniqueInsn()->src(0).getImmediate(immMode))
--               mode |= immMode.reg.data.u32;
-+               mode = immMode.reg.data.u32;
-+         }
-+         if (mode == NV50_IR_INTERP_DEFAULT)
-+            mode |= translateInterpMode(&vary, nvirOp);
-+         else {
-+            if (vary.linear) {
-+               nvirOp = OP_LINTERP;
-+               mode |= NV50_IR_INTERP_LINEAR;
-+            } else {
-+               nvirOp = OP_PINTERP;
-+               mode |= NV50_IR_INTERP_PERSPECTIVE;
-+            }
-          }
-       }
- 
--      for (uint8_t i = 0u; i < insn->num_components; ++i) {
-+      for (uint8_t i = 0u; i < dest_components; ++i) {
-          uint32_t address = getSlotAddress(insn, idx, i);
-          Symbol *sym = mkSymbol(input ? FILE_SHADER_INPUT : FILE_SHADER_OUTPUT, 0, dType, address);
-          if (prog->getType() == Program::TYPE_FRAGMENT) {
-@@ -1814,9 +1832,11 @@ Converter::visit(nir_intrinsic_instr *insn)
-       loadImm(newDefs[1], mode);
-       break;
-    }
-+   case nir_intrinsic_demote:
-    case nir_intrinsic_discard:
-       mkOp(OP_DISCARD, TYPE_NONE, NULL);
-       break;
-+   case nir_intrinsic_demote_if:
-    case nir_intrinsic_discard_if: {
-       Value *pred = getSSA(1, FILE_PREDICATE);
-       if (insn->num_components > 1) {
-@@ -1832,6 +1852,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-    case nir_intrinsic_load_base_instance:
-    case nir_intrinsic_load_draw_id:
-    case nir_intrinsic_load_front_face:
-+   case nir_intrinsic_is_helper_invocation:
-    case nir_intrinsic_load_helper_invocation:
-    case nir_intrinsic_load_instance_id:
-    case nir_intrinsic_load_invocation_id:
-@@ -1858,7 +1879,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-       SVSemantic sv = convert(op);
-       LValues &newDefs = convert(&insn->dest);
- 
--      for (uint8_t i = 0u; i < insn->num_components; ++i) {
-+      for (uint8_t i = 0u; i < nir_intrinsic_dest_components(insn); ++i) {
-          Value *def;
-          if (typeSizeof(dType) == 8)
-             def = getSSA();
-@@ -1910,12 +1931,12 @@ Converter::visit(nir_intrinsic_instr *insn)
- 
-       if (op == nir_intrinsic_read_first_invocation) {
-          mkOp1(OP_VOTE, TYPE_U32, tmp, mkImm(1))->subOp = NV50_IR_SUBOP_VOTE_ANY;
--         mkOp2(OP_EXTBF, TYPE_U32, tmp, tmp, mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV;
-+         mkOp1(OP_BREV, TYPE_U32, tmp, tmp);
-          mkOp1(OP_BFIND, TYPE_U32, tmp, tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT;
-       } else
-          tmp = getSrc(&insn->src[1], 0);
- 
--      for (uint8_t i = 0; i < insn->num_components; ++i) {
-+      for (uint8_t i = 0; i < dest_components; ++i) {
-          mkOp3(OP_SHFL, dType, newDefs[i], getSrc(&insn->src[0], i), tmp, mkImm(0x1f))
-             ->subOp = NV50_IR_SUBOP_SHFL_IDX;
-       }
-@@ -1931,7 +1952,7 @@ Converter::visit(nir_intrinsic_instr *insn)
- 
-       Value *vtxBase = mkOp2v(OP_PFETCH, TYPE_U32, getSSA(4, FILE_ADDRESS),
-                               mkImm(baseVertex), indirectVertex);
--      for (uint8_t i = 0u; i < insn->num_components; ++i) {
-+      for (uint8_t i = 0u; i < dest_components; ++i) {
-          uint32_t address = getSlotAddress(insn, idx, i);
-          loadFrom(FILE_SHADER_INPUT, 0, dType, newDefs[i], address, 0,
-                   indirectOffset, vtxBase, info->in[idx].patch);
-@@ -1954,19 +1975,24 @@ Converter::visit(nir_intrinsic_instr *insn)
- 
-       vtxBase = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, FILE_ADDRESS), outBase, vtxBase);
- 
--      for (uint8_t i = 0u; i < insn->num_components; ++i) {
-+      for (uint8_t i = 0u; i < dest_components; ++i) {
-          uint32_t address = getSlotAddress(insn, idx, i);
-          loadFrom(FILE_SHADER_OUTPUT, 0, dType, newDefs[i], address, 0,
-                   indirectOffset, vtxBase, info->in[idx].patch);
-       }
-       break;
-    }
--   case nir_intrinsic_emit_vertex:
-+   case nir_intrinsic_emit_vertex: {
-       if (info->io.genUserClip > 0)
-          handleUserClipPlanes();
--      // fallthrough
-+      uint32_t idx = nir_intrinsic_stream_id(insn);
-+      mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1;
-+      break;
-+   }
-    case nir_intrinsic_end_primitive: {
-       uint32_t idx = nir_intrinsic_stream_id(insn);
-+      if (idx)
-+         break;
-       mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1;
-       break;
-    }
-@@ -1978,7 +2004,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-       uint32_t index = getIndirect(&insn->src[0], 0, indirectIndex) + 1;
-       uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
- 
--      for (uint8_t i = 0u; i < insn->num_components; ++i) {
-+      for (uint8_t i = 0u; i < dest_components; ++i) {
-          loadFrom(FILE_MEMORY_CONST, index, dType, newDefs[i], offset, i,
-                   indirectOffset, indirectIndex);
-       }
-@@ -2001,7 +2027,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-       uint32_t buffer = getIndirect(&insn->src[1], 0, indirectBuffer);
-       uint32_t offset = getIndirect(&insn->src[2], 0, indirectOffset);
- 
--      for (uint8_t i = 0u; i < insn->num_components; ++i) {
-+      for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
-          if (!((1u << i) & nir_intrinsic_write_mask(insn)))
-             continue;
-          Symbol *sym = mkSymbol(FILE_MEMORY_BUFFER, buffer, sType,
-@@ -2020,7 +2046,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-       uint32_t buffer = getIndirect(&insn->src[0], 0, indirectBuffer);
-       uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
- 
--      for (uint8_t i = 0u; i < insn->num_components; ++i)
-+      for (uint8_t i = 0u; i < dest_components; ++i)
-          loadFrom(FILE_MEMORY_BUFFER, buffer, dType, newDefs[i], offset, i,
-                   indirectOffset, indirectBuffer);
- 
-@@ -2314,7 +2340,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-       Value *indirectOffset;
-       uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
- 
--      for (uint8_t i = 0u; i < insn->num_components; ++i) {
-+      for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
-          if (!((1u << i) & nir_intrinsic_write_mask(insn)))
-             continue;
-          Symbol *sym = mkSymbol(FILE_MEMORY_SHARED, 0, sType, offset + i * typeSizeof(sType));
-@@ -2328,7 +2354,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-       Value *indirectOffset;
-       uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset);
- 
--      for (uint8_t i = 0u; i < insn->num_components; ++i)
-+      for (uint8_t i = 0u; i < dest_components; ++i)
-          loadFrom(FILE_MEMORY_SHARED, 0, dType, newDefs[i], offset, i, indirectOffset);
- 
-       break;
-@@ -2367,7 +2393,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-       Value *indirectOffset;
-       uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset);
- 
--      for (auto i = 0u; i < insn->num_components; ++i)
-+      for (auto i = 0u; i < dest_components; ++i)
-          loadFrom(FILE_MEMORY_GLOBAL, 0, dType, newDefs[i], offset, i, indirectOffset);
- 
-       info->io.globalAccess |= 0x1;
-@@ -2376,7 +2402,7 @@ Converter::visit(nir_intrinsic_instr *insn)
-    case nir_intrinsic_store_global: {
-       DataType sType = getSType(insn->src[0], false, false);
- 
--      for (auto i = 0u; i < insn->num_components; ++i) {
-+      for (auto i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
-          if (!((1u << i) & nir_intrinsic_write_mask(insn)))
-             continue;
-          if (typeSizeof(sType) == 8) {
-@@ -2418,7 +2444,6 @@ Converter::visit(nir_jump_instr *insn)
-    case nir_jump_continue: {
-       bool isBreak = insn->type == nir_jump_break;
-       nir_block *block = insn->instr.block;
--      assert(!block->successors[1]);
-       BasicBlock *target = convert(block->successors[0]);
-       mkFlow(isBreak ? OP_BREAK : OP_CONT, target, CC_ALWAYS, NULL);
-       bb->cfg.attach(&target->cfg, isBreak ? Graph::Edge::CROSS : Graph::Edge::BACK);
-@@ -2774,7 +2799,7 @@ Converter::visit(nir_alu_instr *insn)
-    case nir_op_bfm: {
-       DEFAULT_CHECKS;
-       LValues &newDefs = convert(&insn->dest);
--      mkOp3(OP_INSBF, dType, newDefs[0], getSrc(&insn->src[0]), loadImm(NULL, 0x808), getSrc(&insn->src[1]));
-+      mkOp2(OP_BMSK, dType, newDefs[0], getSrc(&insn->src[1]), getSrc(&insn->src[0]))->subOp = NV50_IR_SUBOP_BMSK_W;
-       break;
-    }
-    case nir_op_bitfield_insert: {
-@@ -2794,17 +2819,69 @@ Converter::visit(nir_alu_instr *insn)
-    case nir_op_bitfield_reverse: {
-       DEFAULT_CHECKS;
-       LValues &newDefs = convert(&insn->dest);
--      mkOp2(OP_EXTBF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV;
-+      mkOp1(OP_BREV, TYPE_U32, newDefs[0], getSrc(&insn->src[0]));
-       break;
-    }
-    case nir_op_find_lsb: {
-       DEFAULT_CHECKS;
-       LValues &newDefs = convert(&insn->dest);
-       Value *tmp = getSSA();
--      mkOp2(OP_EXTBF, TYPE_U32, tmp, getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV;
-+      mkOp1(OP_BREV, TYPE_U32, tmp, getSrc(&insn->src[0]));
-       mkOp1(OP_BFIND, TYPE_U32, newDefs[0], tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT;
-       break;
-    }
-+   case nir_op_extract_u8: {
-+      DEFAULT_CHECKS;
-+      LValues &newDefs = convert(&insn->dest);
-+      Value *prmt = getSSA();
-+      mkOp2(OP_OR, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x4440));
-+      mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
-+      break;
-+   }
-+   case nir_op_extract_i8: {
-+      DEFAULT_CHECKS;
-+      LValues &newDefs = convert(&insn->dest);
-+      Value *prmt = getSSA();
-+      mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x1111), loadImm(NULL, 0x8880));
-+      mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
-+      break;
-+   }
-+   case nir_op_extract_u16: {
-+      DEFAULT_CHECKS;
-+      LValues &newDefs = convert(&insn->dest);
-+      Value *prmt = getSSA();
-+      mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x22), loadImm(NULL, 0x4410));
-+      mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
-+      break;
-+   }
-+   case nir_op_extract_i16: {
-+      DEFAULT_CHECKS;
-+      LValues &newDefs = convert(&insn->dest);
-+      Value *prmt = getSSA();
-+      mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x2222), loadImm(NULL, 0x9910));
-+      mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
-+      break;
-+   }
-+   case nir_op_urol: {
-+      DEFAULT_CHECKS;
-+      LValues &newDefs = convert(&insn->dest);
-+      mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]),
-+            getSrc(&insn->src[1]), getSrc(&insn->src[0]))
-+         ->subOp = NV50_IR_SUBOP_SHF_L |
-+                   NV50_IR_SUBOP_SHF_W |
-+                   NV50_IR_SUBOP_SHF_HI;
-+      break;
-+   }
-+   case nir_op_uror: {
-+      DEFAULT_CHECKS;
-+      LValues &newDefs = convert(&insn->dest);
-+      mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]),
-+            getSrc(&insn->src[1]), getSrc(&insn->src[0]))
-+         ->subOp = NV50_IR_SUBOP_SHF_R |
-+                   NV50_IR_SUBOP_SHF_W |
-+                   NV50_IR_SUBOP_SHF_LO;
-+      break;
-+   }
-    // boolean conversions
-    case nir_op_b2f32: {
-       DEFAULT_CHECKS;
-@@ -2990,14 +3067,11 @@ Converter::handleDeref(nir_deref_instr *deref, Value * &indirect, const nir_vari
- CacheMode
- Converter::convert(enum gl_access_qualifier access)
- {
--   switch (access) {
--   case ACCESS_VOLATILE:
-+   if (access & ACCESS_VOLATILE)
-       return CACHE_CV;
--   case ACCESS_COHERENT:
-+   if (access & ACCESS_COHERENT)
-       return CACHE_CG;
--   default:
--      return CACHE_CA;
--   }
-+   return CACHE_CA;
- }
- 
- CacheMode
-@@ -3224,6 +3298,11 @@ Converter::run()
-    NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
-    NIR_PASS_V(nir, nir_lower_phis_to_scalar);
- 
-+   /*TODO: improve this lowering/optimisation loop so that we can use
-+    *      nir_opt_idiv_const effectively before this.
-+    */
-+   NIR_PASS(progress, nir, nir_lower_idiv, nir_lower_idiv_precise);
-+
-    do {
-       progress = false;
-       NIR_PASS(progress, nir, nir_copy_prop);
-@@ -3285,3 +3364,125 @@ Program::makeFromNIR(struct nv50_ir_prog_info *info)
- }
- 
- } // namespace nv50_ir
-+
-+static nir_shader_compiler_options
-+nvir_nir_shader_compiler_options(int chipset)
-+{
-+   nir_shader_compiler_options op = {};
-+   op.lower_fdiv = (chipset >= NVISA_GV100_CHIPSET);
-+   op.lower_ffma = false;
-+   op.fuse_ffma = false; /* nir doesn't track mad vs fma */
-+   op.lower_flrp16 = (chipset >= NVISA_GV100_CHIPSET);
-+   op.lower_flrp32 = true;
-+   op.lower_flrp64 = true;
-+   op.lower_fpow = false; // TODO: nir's lowering is broken, or we could use it
-+   op.lower_fsat = false;
-+   op.lower_fsqrt = false; // TODO: only before gm200
-+   op.lower_sincos = false;
-+   op.lower_fmod = true;
-+   op.lower_bitfield_extract = false;
-+   op.lower_bitfield_extract_to_shifts = (chipset >= NVISA_GV100_CHIPSET);
-+   op.lower_bitfield_insert = false;
-+   op.lower_bitfield_insert_to_shifts = (chipset >= NVISA_GV100_CHIPSET);
-+   op.lower_bitfield_insert_to_bitfield_select = false;
-+   op.lower_bitfield_reverse = false;
-+   op.lower_bit_count = false;
-+   op.lower_ifind_msb = false;
-+   op.lower_find_lsb = false;
-+   op.lower_uadd_carry = true; // TODO
-+   op.lower_usub_borrow = true; // TODO
-+   op.lower_mul_high = false;
-+   op.lower_negate = false;
-+   op.lower_sub = true;
-+   op.lower_scmp = true; // TODO: not implemented yet
-+   op.lower_vector_cmp = false;
-+   op.lower_idiv = true;
-+   op.lower_bitops = false;
-+   op.lower_isign = (chipset >= NVISA_GV100_CHIPSET);
-+   op.lower_fsign = (chipset >= NVISA_GV100_CHIPSET);
-+   op.lower_fdph = false;
-+   op.lower_fdot = false;
-+   op.fdot_replicates = false; // TODO
-+   op.lower_ffloor = false; // TODO
-+   op.lower_ffract = true;
-+   op.lower_fceil = false; // TODO
-+   op.lower_ftrunc = false;
-+   op.lower_ldexp = true;
-+   op.lower_pack_half_2x16 = true;
-+   op.lower_pack_unorm_2x16 = true;
-+   op.lower_pack_snorm_2x16 = true;
-+   op.lower_pack_unorm_4x8 = true;
-+   op.lower_pack_snorm_4x8 = true;
-+   op.lower_unpack_half_2x16 = true;
-+   op.lower_unpack_unorm_2x16 = true;
-+   op.lower_unpack_snorm_2x16 = true;
-+   op.lower_unpack_unorm_4x8 = true;
-+   op.lower_unpack_snorm_4x8 = true;
-+   op.lower_pack_split = false;
-+   op.lower_extract_byte = (chipset < NVISA_GM107_CHIPSET);
-+   op.lower_extract_word = (chipset < NVISA_GM107_CHIPSET);
-+   op.lower_all_io_to_temps = false;
-+   op.lower_all_io_to_elements = false;
-+   op.vertex_id_zero_based = false;
-+   op.lower_base_vertex = false;
-+   op.lower_helper_invocation = false;
-+   op.optimize_sample_mask_in = false;
-+   op.lower_cs_local_index_from_id = true;
-+   op.lower_cs_local_id_from_index = false;
-+   op.lower_device_index_to_zero = false; // TODO
-+   op.lower_wpos_pntc = false; // TODO
-+   op.lower_hadd = true; // TODO
-+   op.lower_add_sat = true; // TODO
-+   op.vectorize_io = false;
-+   op.lower_to_scalar = false;
-+   op.unify_interfaces = false;
-+   op.use_interpolated_input_intrinsics = true;
-+   op.lower_mul_2x32_64 = true; // TODO
-+   op.lower_rotate = (chipset < NVISA_GV100_CHIPSET);
-+   op.has_imul24 = false;
-+   op.intel_vec4 = false;
-+   op.max_unroll_iterations = 32;
-+   op.lower_int64_options = (nir_lower_int64_options) (
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul64 : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_isign64 : 0) |
-+      nir_lower_divmod64 |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_high64 : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_mov64 : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_icmp64 : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_iabs64 : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ineg64 : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_logic64 : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_minmax64 : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_shift64 : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_2x32_64 : 0) |
-+      ((chipset >= NVISA_GM107_CHIPSET) ? nir_lower_extract64 : 0) |
-+      nir_lower_ufind_msb64
-+   );
-+   op.lower_doubles_options = (nir_lower_doubles_options) (
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drcp : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsqrt : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drsq : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dfract : 0) |
-+      nir_lower_dmod |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsub : 0) |
-+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ddiv : 0)
-+   );
-+   return op;
-+}
-+
-+static const nir_shader_compiler_options gf100_nir_shader_compiler_options =
-+nvir_nir_shader_compiler_options(NVISA_GF100_CHIPSET);
-+static const nir_shader_compiler_options gm107_nir_shader_compiler_options =
-+nvir_nir_shader_compiler_options(NVISA_GM107_CHIPSET);
-+static const nir_shader_compiler_options gv100_nir_shader_compiler_options =
-+nvir_nir_shader_compiler_options(NVISA_GV100_CHIPSET);
-+
-+const nir_shader_compiler_options *
-+nv50_ir_nir_shader_compiler_options(int chipset)
-+{
-+   if (chipset >= NVISA_GV100_CHIPSET)
-+      return &gv100_nir_shader_compiler_options;
-+   if (chipset >= NVISA_GM107_CHIPSET)
-+      return &gm107_nir_shader_compiler_options;
-+   return &gf100_nir_shader_compiler_options;
-+}
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
-index 60f3d582a0b..3fd76f64de0 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
-@@ -3401,8 +3401,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
-       // ReadInvocationARB(src, findLSB(ballot(true)))
-       val0 = getScratch();
-       mkOp1(OP_VOTE, TYPE_U32, val0, mkImm(1))->subOp = NV50_IR_SUBOP_VOTE_ANY;
--      mkOp2(OP_EXTBF, TYPE_U32, val0, val0, mkImm(0x2000))
--         ->subOp = NV50_IR_SUBOP_EXTBF_REV;
-+      mkOp1(OP_BREV, TYPE_U32, val0, val0);
-       mkOp1(OP_BFIND, TYPE_U32, val0, val0)->subOp = NV50_IR_SUBOP_BFIND_SAMT;
-       src1 = val0;
-       /* fallthrough */
-@@ -3820,8 +3819,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
-       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
-          src0 = fetchSrc(0, c);
-          val0 = getScratch();
--         geni = mkOp2(OP_EXTBF, TYPE_U32, val0, src0, mkImm(0x2000));
--         geni->subOp = NV50_IR_SUBOP_EXTBF_REV;
-+         mkOp1(OP_BREV, TYPE_U32, val0, src0);
-          geni = mkOp1(OP_BFIND, TYPE_U32, dst0[c], val0);
-          geni->subOp = NV50_IR_SUBOP_BFIND_SAMT;
-       }
-@@ -3836,8 +3834,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
-    case TGSI_OPCODE_BREV:
-       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
-          src0 = fetchSrc(0, c);
--         geni = mkOp2(OP_EXTBF, TYPE_U32, dst0[c], src0, mkImm(0x2000));
--         geni->subOp = NV50_IR_SUBOP_EXTBF_REV;
-+         mkOp1(OP_BREV, TYPE_U32, dst0[c], src0);
-       }
-       break;
-    case TGSI_OPCODE_POPC:
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
-index 49a5f3b01f2..9fad1dcfe89 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
-@@ -239,9 +239,8 @@ GM107LoweringPass::handlePFETCH(Instruction *i)
-    Value *tmp1 = bld.getScratch();
-    Value *tmp2 = bld.getScratch();
-    bld.mkOp1(OP_RDSV, TYPE_U32, tmp0, bld.mkSysVal(SV_INVOCATION_INFO, 0));
--   bld.mkOp2(OP_SHR , TYPE_U32, tmp1, tmp0, bld.mkImm(16));
--   bld.mkOp2(OP_AND , TYPE_U32, tmp0, tmp0, bld.mkImm(0xff));
--   bld.mkOp2(OP_AND , TYPE_U32, tmp1, tmp1, bld.mkImm(0xff));
-+   bld.mkOp3(OP_PERMT, TYPE_U32, tmp1, tmp0, bld.mkImm(0x4442), bld.mkImm(0));
-+   bld.mkOp3(OP_PERMT, TYPE_U32, tmp0, tmp0, bld.mkImm(0x4440), bld.mkImm(0));
-    if (i->getSrc(1))
-       bld.mkOp2(OP_ADD , TYPE_U32, tmp2, i->getSrc(0), i->getSrc(1));
-    else
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.h
-index 71e5ea6417a..dfa1d035dac 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.h
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.h
-@@ -21,6 +21,7 @@ class GM107LegalizeSSA : public NVC0LegalizeSSA
- private:
-    virtual bool visit(Instruction *);
- 
-+protected:
-    void handlePFETCH(Instruction *);
-    void handleLOAD(Instruction *);
- };
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.cpp
-new file mode 100644
-index 00000000000..644d4928327
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.cpp
-@@ -0,0 +1,481 @@
-+/*
-+ * Copyright 2020 Red Hat Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+#include "codegen/nv50_ir.h"
-+#include "codegen/nv50_ir_build_util.h"
-+
-+#include "codegen/nv50_ir_target_nvc0.h"
-+#include "codegen/nv50_ir_lowering_gv100.h"
-+
-+#include <limits>
-+
-+namespace nv50_ir {
-+
-+bool
-+GV100LegalizeSSA::handleCMP(Instruction *i)
-+{
-+   Value *pred = bld.getSSA(1, FILE_PREDICATE);
-+
-+   bld.mkCmp(OP_SET, reverseCondCode(i->asCmp()->setCond), TYPE_U8, pred,
-+             i->sType, bld.mkImm(0), i->getSrc(2))->ftz = i->ftz;
-+   bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), i->getSrc(0), i->getSrc(1), pred);
-+   return true;
-+}
-+
-+// NIR deals with most of these for us, but codegen generates more in pointer
-+// calculations from other lowering passes.
-+bool
-+GV100LegalizeSSA::handleIADD64(Instruction *i)
-+{
-+   Value *carry = bld.getSSA(1, FILE_PREDICATE);
-+   Value *def[2] = { bld.getSSA(), bld.getSSA() };
-+   Value *src[2][2];
-+
-+   for (int s = 0; s < 2; s++) {
-+      if (i->getSrc(s)->reg.size == 8) {
-+         bld.mkSplit(src[s], 4, i->getSrc(s));
-+      } else {
-+         src[s][0] = i->getSrc(s);
-+         src[s][1] = bld.mkImm(0);
-+      }
-+   }
-+
-+   bld.mkOp2(OP_ADD, TYPE_U32, def[0], src[0][0], src[1][0])->
-+      setFlagsDef(1, carry);
-+   bld.mkOp2(OP_ADD, TYPE_U32, def[1], src[0][1], src[1][1])->
-+      setFlagsSrc(2, carry);
-+   bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);
-+   return true;
-+}
-+
-+bool
-+GV100LegalizeSSA::handleIMAD_HIGH(Instruction *i)
-+{
-+   Value *def = bld.getSSA(8), *defs[2];
-+   Value *src2;
-+
-+   if (i->srcExists(2) &&
-+       (!i->getSrc(2)->asImm() || i->getSrc(2)->asImm()->reg.data.u32)) {
-+      Value *src2s[2] = { bld.getSSA(), bld.getSSA() };
-+      bld.mkMov(src2s[0], bld.mkImm(0));
-+      bld.mkMov(src2s[1], i->getSrc(2));
-+      src2 = bld.mkOp2(OP_MERGE, TYPE_U64, bld.getSSA(8), src2s[0], src2s[1])->getDef(0);
-+   } else {
-+      src2 = bld.mkImm(0);
-+   }
-+
-+   bld.mkOp3(OP_MAD, isSignedType(i->sType) ? TYPE_S64 : TYPE_U64, def,
-+             i->getSrc(0), i->getSrc(1), src2);
-+
-+   bld.mkSplit(defs, 4, def);
-+   i->def(0).replace(defs[1], false);
-+   return true;
-+}
-+
-+// XXX: We should be able to do this in GV100LoweringPass, but codegen messes
-+//      up somehow and swaps the condcode without swapping the sources.
-+//      - tests/spec/glsl-1.50/execution/geometry/primitive-id-in.shader_test
-+bool
-+GV100LegalizeSSA::handleIMNMX(Instruction *i)
-+{
-+   Value *pred = bld.getSSA(1, FILE_PREDICATE);
-+
-+   bld.mkCmp(OP_SET, (i->op == OP_MIN) ? CC_LT : CC_GT, i->dType, pred,
-+             i->sType, i->getSrc(0), i->getSrc(1));
-+   bld.mkOp3(OP_SELP, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1), pred);
-+   return true;
-+}
-+
-+bool
-+GV100LegalizeSSA::handleIMUL(Instruction *i)
-+{
-+   if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
-+      return handleIMAD_HIGH(i);
-+
-+   bld.mkOp3(OP_MAD, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1),
-+             bld.mkImm(0));
-+   return true;
-+}
-+
-+bool
-+GV100LegalizeSSA::handleLOP2(Instruction *i)
-+{
-+   uint8_t src0 = NV50_IR_SUBOP_LOP3_LUT_SRC0;
-+   uint8_t src1 = NV50_IR_SUBOP_LOP3_LUT_SRC1;
-+   uint8_t subOp;
-+
-+   if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT))
-+      src0 = ~src0;
-+   if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT))
-+      src1 = ~src1;
-+
-+   switch (i->op) {
-+   case OP_AND: subOp = src0 & src1; break;
-+   case OP_OR : subOp = src0 | src1; break;
-+   case OP_XOR: subOp = src0 ^ src1; break;
-+   default:
-+      assert(!"invalid LOP2 opcode");
-+      break;
-+   }
-+
-+   bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), i->getSrc(0), i->getSrc(1),
-+             bld.mkImm(0))->subOp = subOp;
-+   return true;
-+}
-+
-+bool
-+GV100LegalizeSSA::handleNOT(Instruction *i)
-+{
-+   bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), bld.mkImm(0), i->getSrc(0),
-+             bld.mkImm(0))->subOp = (uint8_t)~NV50_IR_SUBOP_LOP3_LUT_SRC1;
-+   return true;
-+}
-+
-+bool
-+GV100LegalizeSSA::handlePREEX2(Instruction *i)
-+{
-+   i->def(0).replace(i->src(0), false);
-+   return true;
-+}
-+
-+bool
-+GV100LegalizeSSA::handleQUADON(Instruction *i)
-+{
-+   handleSHFL(i); // Inserts OP_WARPSYNC
-+   return true;
-+}
-+
-+bool
-+GV100LegalizeSSA::handleQUADPOP(Instruction *i)
-+{
-+   return true;
-+}
-+
-+bool
-+GV100LegalizeSSA::handleSET(Instruction *i)
-+{
-+   Value *src2 = i->srcExists(2) ? i->getSrc(2) : NULL;
-+   Value *pred = bld.getSSA(1, FILE_PREDICATE), *met;
-+   Instruction *xsetp;
-+
-+   if (isFloatType(i->dType)) {
-+      if (i->sType == TYPE_F32)
-+         return false; // HW has FSET.BF
-+      met = bld.mkImm(0x3f800000);
-+   } else {
-+      met = bld.mkImm(0xffffffff);
-+   }
-+
-+   xsetp = bld.mkCmp(i->op, i->asCmp()->setCond, TYPE_U8, pred, i->sType,
-+                     i->getSrc(0), i->getSrc(1));
-+   xsetp->src(0).mod = i->src(0).mod;
-+   xsetp->src(1).mod = i->src(1).mod;
-+   xsetp->setSrc(2, src2);
-+   xsetp->ftz = i->ftz;
-+
-+   i = bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), bld.mkImm(0), met, pred);
-+   i->src(2).mod = Modifier(NV50_IR_MOD_NOT);
-+   return true;
-+}
-+
-+bool
-+GV100LegalizeSSA::handleSHFL(Instruction *i)
-+{
-+   Instruction *sync = new_Instruction(func, OP_WARPSYNC, TYPE_NONE);
-+   sync->fixed = 1;
-+   sync->setSrc(0, bld.mkImm(0xffffffff));
-+   i->bb->insertBefore(i, sync);
-+   return false;
-+}
-+
-+bool
-+GV100LegalizeSSA::handleShift(Instruction *i)
-+{
-+   Value *zero = bld.mkImm(0);
-+   Value *src1 = i->getSrc(1);
-+   Value *src0, *src2;
-+   uint8_t subOp = i->op == OP_SHL ? NV50_IR_SUBOP_SHF_L : NV50_IR_SUBOP_SHF_R;
-+
-+   if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR) {
-+      src0 = i->getSrc(0);
-+      src2 = zero;
-+   } else {
-+      src0 = zero;
-+      src2 = i->getSrc(0);
-+      subOp |= NV50_IR_SUBOP_SHF_HI;
-+   }
-+   if (i->subOp & NV50_IR_SUBOP_SHIFT_WRAP)
-+      subOp |= NV50_IR_SUBOP_SHF_W;
-+
-+   bld.mkOp3(OP_SHF, i->dType, i->getDef(0), src0, src1, src2)->subOp = subOp;
-+   return true;
-+}
-+
-+bool
-+GV100LegalizeSSA::handleSUB(Instruction *i)
-+{
-+   Instruction *xadd =
-+      bld.mkOp2(OP_ADD, i->dType, i->getDef(0), i->getSrc(0), i->getSrc(1));
-+   xadd->src(0).mod = i->src(0).mod;
-+   xadd->src(1).mod = i->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
-+   xadd->ftz = i->ftz;
-+   return true;
-+}
-+
-+bool
-+GV100LegalizeSSA::visit(Instruction *i)
-+{
-+   bool lowered = false;
-+
-+   bld.setPosition(i, false);
-+   if (i->sType == TYPE_F32 && i->dType != TYPE_F16 &&
-+       prog->getType() != Program::TYPE_COMPUTE)
-+      handleFTZ(i);
-+
-+   switch (i->op) {
-+   case OP_AND:
-+   case OP_OR:
-+   case OP_XOR:
-+      if (i->def(0).getFile() != FILE_PREDICATE)
-+         lowered = handleLOP2(i);
-+      break;
-+   case OP_NOT:
-+      lowered = handleNOT(i);
-+      break;
-+   case OP_SHL:
-+   case OP_SHR:
-+      lowered = handleShift(i);
-+      break;
-+   case OP_SET:
-+   case OP_SET_AND:
-+   case OP_SET_OR:
-+   case OP_SET_XOR:
-+      if (i->def(0).getFile() != FILE_PREDICATE)
-+         lowered = handleSET(i);
-+      break;
-+   case OP_SLCT:
-+      lowered = handleCMP(i);
-+      break;
-+   case OP_PREEX2:
-+      lowered = handlePREEX2(i);
-+      break;
-+   case OP_MUL:
-+      if (!isFloatType(i->dType))
-+         lowered = handleIMUL(i);
-+      break;
-+   case OP_MAD:
-+      if (!isFloatType(i->dType) && i->subOp == NV50_IR_SUBOP_MUL_HIGH)
-+         lowered = handleIMAD_HIGH(i);
-+      break;
-+   case OP_SHFL:
-+      lowered = handleSHFL(i);
-+      break;
-+   case OP_QUADON:
-+      lowered = handleQUADON(i);
-+      break;
-+   case OP_QUADPOP:
-+      lowered = handleQUADPOP(i);
-+      break;
-+   case OP_SUB:
-+      lowered = handleSUB(i);
-+      break;
-+   case OP_MAX:
-+   case OP_MIN:
-+      if (!isFloatType(i->dType))
-+         lowered = handleIMNMX(i);
-+      break;
-+   case OP_ADD:
-+      if (!isFloatType(i->dType) && typeSizeof(i->dType) == 8)
-+         lowered = handleIADD64(i);
-+      break;
-+   case OP_PFETCH:
-+      handlePFETCH(i);
-+      break;
-+   case OP_LOAD:
-+      handleLOAD(i);
-+      break;
-+   default:
-+      break;
-+   }
-+
-+   if (lowered)
-+      delete_Instruction(prog, i);
-+
-+   return true;
-+}
-+
-+bool
-+GV100LoweringPass::handleDMNMX(Instruction *i)
-+{
-+   Value *pred = bld.getSSA(1, FILE_PREDICATE);
-+   Value *src0[2], *src1[2], *dest[2];
-+
-+   bld.mkCmp(OP_SET, (i->op == OP_MIN) ? CC_LT : CC_GT, TYPE_U32, pred,
-+             i->sType, i->getSrc(0), i->getSrc(1));
-+   bld.mkSplit(src0, 4, i->getSrc(0));
-+   bld.mkSplit(src1, 4, i->getSrc(1));
-+   bld.mkSplit(dest, 4, i->getDef(0));
-+   bld.mkOp3(OP_SELP, TYPE_U32, dest[0], src0[0], src1[0], pred);
-+   bld.mkOp3(OP_SELP, TYPE_U32, dest[1], src0[1], src1[1], pred);
-+   bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), dest[0], dest[1]);
-+   return true;
-+}
-+
-+bool
-+GV100LoweringPass::handleEXTBF(Instruction *i)
-+{
-+   Value *bit = bld.getScratch();
-+   Value *cnt = bld.getScratch();
-+   Value *mask = bld.getScratch();
-+   Value *zero = bld.mkImm(0);
-+
-+   bld.mkOp3(OP_PERMT, TYPE_U32, bit, i->getSrc(1), bld.mkImm(0x4440), zero);
-+   bld.mkOp3(OP_PERMT, TYPE_U32, cnt, i->getSrc(1), bld.mkImm(0x4441), zero);
-+   bld.mkOp2(OP_BMSK, TYPE_U32, mask, bit, cnt);
-+   bld.mkOp2(OP_AND, TYPE_U32, mask, i->getSrc(0), mask);
-+   bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), mask, bit);
-+   if (isSignedType(i->dType))
-+      bld.mkOp2(OP_SGXT, TYPE_S32, i->getDef(0), i->getDef(0), cnt);
-+
-+   return true;
-+}
-+
-+bool
-+GV100LoweringPass::handleFLOW(Instruction *i)
-+{
-+   i->op = OP_BRA;
-+   return false;
-+}
-+
-+bool
-+GV100LoweringPass::handleI2I(Instruction *i)
-+{
-+   bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), i->sType, i->getSrc(0))->
-+      subOp = i->subOp;
-+   bld.mkCvt(OP_CVT, i->dType, i->getDef(0), TYPE_F32, i->getDef(0));
-+   return true;
-+}
-+
-+bool
-+GV100LoweringPass::handleINSBF(Instruction *i)
-+{
-+   Value *bit = bld.getScratch();
-+   Value *cnt = bld.getScratch();
-+   Value *mask = bld.getScratch();
-+   Value *src0 = bld.getScratch();
-+   Value *zero = bld.mkImm(0);
-+
-+   bld.mkOp3(OP_PERMT, TYPE_U32, bit, i->getSrc(1), bld.mkImm(0x4440), zero);
-+   bld.mkOp3(OP_PERMT, TYPE_U32, cnt, i->getSrc(1), bld.mkImm(0x4441), zero);
-+   bld.mkOp2(OP_BMSK, TYPE_U32, mask, zero, cnt);
-+
-+   bld.mkOp2(OP_AND, TYPE_U32, src0, i->getSrc(0), mask);
-+   bld.mkOp2(OP_SHL, TYPE_U32, src0, src0, bit);
-+
-+   bld.mkOp2(OP_SHL, TYPE_U32, mask, mask, bit);
-+   bld.mkOp3(OP_LOP3_LUT, TYPE_U32, i->getDef(0), src0, i->getSrc(2), mask)->
-+      subOp = NV50_IR_SUBOP_LOP3_LUT(a | (b & ~c));
-+
-+   return true;
-+}
-+
-+bool
-+GV100LoweringPass::handlePINTERP(Instruction *i)
-+{
-+   Value *src2 = i->srcExists(2) ? i->getSrc(2) : NULL;
-+   Instruction *ipa, *mul;
-+
-+   ipa = bld.mkOp2(OP_LINTERP, TYPE_F32, i->getDef(0), i->getSrc(0), src2);
-+   ipa->ipa = i->ipa;
-+   mul = bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), i->getSrc(1));
-+
-+   if (i->getInterpMode() == NV50_IR_INTERP_SC) {
-+      ipa->setDef(1, bld.getSSA(1, FILE_PREDICATE));
-+      mul->setPredicate(CC_NOT_P, ipa->getDef(1));
-+   }
-+
-+   return true;
-+}
-+
-+bool
-+GV100LoweringPass::handlePREFLOW(Instruction *i)
-+{
-+   return true;
-+}
-+
-+bool
-+GV100LoweringPass::handlePRESIN(Instruction *i)
-+{
-+   const float f = 1.0 / (2.0 * 3.14159265);
-+   bld.mkOp2(OP_MUL, i->dType, i->getDef(0), i->getSrc(0), bld.mkImm(f));
-+   return true;
-+}
-+
-+bool
-+GV100LoweringPass::visit(Instruction *i)
-+{
-+   bool lowered = false;
-+
-+   bld.setPosition(i, false);
-+
-+   switch (i->op) {
-+   case OP_BREAK:
-+   case OP_CONT:
-+      lowered = handleFLOW(i);
-+      break;
-+   case OP_PREBREAK:
-+   case OP_PRECONT:
-+      lowered = handlePREFLOW(i);
-+      break;
-+   case OP_CVT:
-+      if (i->src(0).getFile() != FILE_PREDICATE &&
-+          i->def(0).getFile() != FILE_PREDICATE &&
-+          !isFloatType(i->dType) && !isFloatType(i->sType))
-+         lowered = handleI2I(i);
-+      break;
-+   case OP_EXTBF:
-+      lowered = handleEXTBF(i);
-+      break;
-+   case OP_INSBF:
-+      lowered = handleINSBF(i);
-+      break;
-+   case OP_MAX:
-+   case OP_MIN:
-+      if (i->dType == TYPE_F64)
-+         lowered = handleDMNMX(i);
-+      break;
-+   case OP_PINTERP:
-+      lowered = handlePINTERP(i);
-+      break;
-+   case OP_PRESIN:
-+      lowered = handlePRESIN(i);
-+      break;
-+   default:
-+      break;
-+   }
-+
-+   if (lowered)
-+      delete_Instruction(prog, i);
-+
-+   return true;
-+}
-+
-+} // namespace nv50_ir
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.h
-new file mode 100644
-index 00000000000..d918c6e83eb
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gv100.h
-@@ -0,0 +1,78 @@
-+/*
-+ * Copyright 2020 Red Hat Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+#ifndef __NV50_IR_LOWERING_GV100_H__
-+#define __NV50_IR_LOWERING_GV100_H__
-+#include "codegen/nv50_ir_lowering_gm107.h"
-+
-+namespace nv50_ir {
-+
-+class GV100LoweringPass : public Pass
-+{
-+public:
-+   GV100LoweringPass(Program *p) {
-+      bld.setProgram(p);
-+   }
-+
-+private:
-+   BuildUtil bld;
-+
-+   virtual bool visit(Instruction *);
-+
-+   bool handleDMNMX(Instruction *);
-+   bool handleEXTBF(Instruction *);
-+   bool handleFLOW(Instruction *);
-+   bool handleI2I(Instruction *);
-+   bool handleINSBF(Instruction *);
-+   bool handlePINTERP(Instruction *);
-+   bool handlePREFLOW(Instruction *);
-+   bool handlePRESIN(Instruction *);
-+};
-+
-+class GV100LegalizeSSA : public GM107LegalizeSSA
-+{
-+public:
-+   GV100LegalizeSSA(Program *p) {
-+      bld.setProgram(p);
-+   }
-+
-+private:
-+   virtual bool visit(Function *) { return true; }
-+   virtual bool visit(BasicBlock *) { return true; }
-+   virtual bool visit(Instruction *);
-+
-+   bool handleCMP(Instruction *);
-+   bool handleIADD64(Instruction *);
-+   bool handleIMAD_HIGH(Instruction *);
-+   bool handleIMNMX(Instruction *);
-+   bool handleIMUL(Instruction *);
-+   bool handleLOP2(Instruction *);
-+   bool handleNOT(Instruction *);
-+   bool handlePREEX2(Instruction *);
-+   bool handleQUADON(Instruction *);
-+   bool handleQUADPOP(Instruction *);
-+   bool handleSET(Instruction *);
-+   bool handleSHFL(Instruction *);
-+   bool handleShift(Instruction *);
-+   bool handleSUB(Instruction *);
-+};
-+}
-+#endif
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
-index a60881000fe..067f9abaca8 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
-@@ -310,6 +310,14 @@ NVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
-    cmp->sType = hTy;
- }
- 
-+void
-+NVC0LegalizeSSA::handleBREV(Instruction *i)
-+{
-+   i->op = OP_EXTBF;
-+   i->subOp = NV50_IR_SUBOP_EXTBF_REV;
-+   i->setSrc(1, bld.mkImm(0x2000));
-+}
-+
- bool
- NVC0LegalizeSSA::visit(Function *fn)
- {
-@@ -354,6 +362,9 @@ NVC0LegalizeSSA::visit(BasicBlock *bb)
-          if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
-             handleSET(i->asCmp());
-          break;
-+      case OP_BREV:
-+         handleBREV(i);
-+         break;
-       default:
-          break;
-       }
-@@ -856,11 +867,11 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
-                next = hi;
-          }
- 
--         if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS)
--            replaceCvt(i);
--
-          if (i->op != OP_MOV && i->op != OP_PFETCH)
-             replaceZero(i);
-+
-+         if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS)
-+            replaceCvt(i);
-       }
-    }
-    if (!bb->getEntry())
-@@ -887,6 +898,8 @@ NVC0LoweringPass::visit(Function *fn)
-       gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
-       if (fn->cfgExit) {
-          bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
-+         if (prog->getTarget()->getChipset() >= NVISA_GV100_CHIPSET)
-+            bld.mkOp1(OP_FINAL, TYPE_NONE, NULL, gpEmitAddress)->fixed = 1;
-          bld.mkMovToReg(0, gpEmitAddress);
-       }
-    }
-@@ -1714,7 +1727,8 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
-          cctl->setPredicate(cas->cc, cas->getPredicate());
-    }
- 
--   if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
-+   if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS &&
-+       targ->getChipset() < NVISA_GV100_CHIPSET) {
-       // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
-       // should be set to the high part of the double reg or bad things will
-       // happen elsewhere in the universe.
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
-index b4c405a9ea5..8c99427d3c0 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
-@@ -64,12 +64,14 @@ private:
-    void handleDIV(Instruction *); // integer division, modulus
-    void handleRCPRSQLib(Instruction *, Value *[]);
-    void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
--   void handleFTZ(Instruction *);
-    void handleSET(CmpInstruction *);
-    void handleTEXLOD(TexInstruction *);
-    void handleShift(Instruction *);
-+   void handleBREV(Instruction *);
- 
- protected:
-+   void handleFTZ(Instruction *);
-+
-    BuildUtil bld;
- };
- 
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
-index 2f46b0e886a..3a4ec3ca561 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
-@@ -558,6 +558,19 @@ ConstantFolding::expr(Instruction *i,
-    memset(&res.data, 0, sizeof(res.data));
- 
-    switch (i->op) {
-+   case OP_SGXT: {
-+      int bits = b->data.u32;
-+      if (bits) {
-+         uint32_t data = a->data.u32 & (0xffffffff >> (32 - bits));
-+         if (bits < 32 && (data & (1 << (bits - 1))))
-+            data = data - (1 << bits);
-+         res.data.u32 = data;
-+      }
-+      break;
-+   }
-+   case OP_BMSK:
-+      res.data.u32 = ((1 << b->data.u32) - 1) << a->data.u32;
-+      break;
-    case OP_MAD:
-    case OP_FMA:
-    case OP_MUL:
-@@ -780,6 +793,23 @@ ConstantFolding::expr(Instruction *i,
-    memset(&res.data, 0, sizeof(res.data));
- 
-    switch (i->op) {
-+   case OP_LOP3_LUT:
-+      for (int n = 0; n < 32; n++) {
-+         uint8_t lut = ((a->data.u32 >> n) & 1) << 2 |
-+                       ((b->data.u32 >> n) & 1) << 1 |
-+                       ((c->data.u32 >> n) & 1);
-+         res.data.u32 |= !!(i->subOp & (1 << lut)) << n;
-+      }
-+      break;
-+   case OP_PERMT:
-+      if (!i->subOp) {
-+         uint64_t input = (uint64_t)c->data.u32 << 32 | a->data.u32;
-+         uint16_t permt = b->data.u32;
-+         for (int n = 0 ; n < 4; n++, permt >>= 4)
-+            res.data.u32 |= ((input >> ((permt & 0xf) * 8)) & 0xff) << n * 8;
-+      } else
-+         return;
-+      break;
-    case OP_INSBF: {
-       int offset = b->data.u32 & 0xff;
-       int width = (b->data.u32 >> 8) & 0xff;
-@@ -1526,6 +1556,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
-       i->subOp = 0;
-       break;
-    }
-+   case OP_BREV: {
-+      uint32_t res = util_bitreverse(imm0.reg.data.u32);
-+      i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
-+      i->op = OP_MOV;
-+      break;
-+   }
-    case OP_POPCNT: {
-       // Only deal with 1-arg POPCNT here
-       if (i->srcExists(1))
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
-index 5dcbf3c3e0c..ce0d2507dc1 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
-@@ -93,8 +93,10 @@ const char *operationStr[OP_LAST + 1] =
-    "and",
-    "or",
-    "xor",
-+   "lop3 lut",
-    "shl",
-    "shr",
-+   "shf",
-    "max",
-    "min",
-    "sat",
-@@ -142,6 +144,7 @@ const char *operationStr[OP_LAST + 1] =
-    "pinterp",
-    "emit",
-    "restart",
-+   "final",
-    "tex",
-    "texbias",
-    "texlod",
-@@ -177,7 +180,10 @@ const char *operationStr[OP_LAST + 1] =
-    "insbf",
-    "extbf",
-    "bfind",
-+   "brev",
-+   "bmsk",
-    "permt",
-+   "sgxt",
-    "atom",
-    "bar",
-    "vadd",
-@@ -193,6 +199,7 @@ const char *operationStr[OP_LAST + 1] =
-    "shfl",
-    "vote",
-    "bufq",
-+   "warpsync",
-    "(invalid)"
- };
- 
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
-index 6df2664da22..4e5b21d9176 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
-@@ -988,6 +988,8 @@ GCRA::coalesce(ArrayList& insns)
-    case 0x110:
-    case 0x120:
-    case 0x130:
-+   case 0x140:
-+   case 0x160:
-       ret = doCoalesce(insns, JOIN_MASK_UNION);
-       break;
-    default:
-@@ -2297,13 +2299,25 @@ RegAlloc::InsertConstraintsPass::texConstraintGM107(TexInstruction *tex)
-    if (isTextureOp(tex->op))
-       textureMask(tex);
- 
--   if (isScalarTexGM107(tex)) {
--      handleScalarTexGM107(tex);
--      return;
--   }
-+   if (targ->getChipset() < NVISA_GV100_CHIPSET) {
-+      if (isScalarTexGM107(tex)) {
-+         handleScalarTexGM107(tex);
-+         return;
-+      }
- 
--   assert(!tex->tex.scalar);
--   condenseDefs(tex);
-+      assert(!tex->tex.scalar);
-+      condenseDefs(tex);
-+   } else {
-+      if (isTextureOp(tex->op)) {
-+         int defCount = tex->defCount(0xff);
-+         if (defCount > 3)
-+            condenseDefs(tex, 2, 3);
-+         if (defCount > 1)
-+            condenseDefs(tex, 0, 1);
-+      } else {
-+         condenseDefs(tex);
-+      }
-+   }
- 
-    if (isSurfaceOp(tex->op)) {
-       int s = tex->tex.target.getDim() +
-@@ -2485,6 +2499,8 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
-          case 0x110:
-          case 0x120:
-          case 0x130:
-+         case 0x140:
-+         case 0x160:
-             texConstraintGM107(tex);
-             break;
-          default:
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_sched_gm107.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_sched_gm107.h
-new file mode 100644
-index 00000000000..54443ae2770
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_sched_gm107.h
-@@ -0,0 +1,156 @@
-+#ifndef __NV50_IR_SCHED_GM107_H__
-+#define __NV50_IR_SCHED_GM107_H__
-+namespace nv50_ir {
-+
-+class SchedDataCalculatorGM107 : public Pass
-+{
-+public:
-+   SchedDataCalculatorGM107(const TargetGM107 *targ) : targ(targ) {}
-+
-+private:
-+   struct RegScores
-+   {
-+      struct ScoreData {
-+         int r[256];
-+         int p[8];
-+         int c;
-+      } rd, wr;
-+      int base;
-+
-+      void rebase(const int base)
-+      {
-+         const int delta = this->base - base;
-+         if (!delta)
-+            return;
-+         this->base = 0;
-+
-+         for (int i = 0; i < 256; ++i) {
-+            rd.r[i] += delta;
-+            wr.r[i] += delta;
-+         }
-+         for (int i = 0; i < 8; ++i) {
-+            rd.p[i] += delta;
-+            wr.p[i] += delta;
-+         }
-+         rd.c += delta;
-+         wr.c += delta;
-+      }
-+      void wipe()
-+      {
-+         memset(&rd, 0, sizeof(rd));
-+         memset(&wr, 0, sizeof(wr));
-+      }
-+      int getLatest(const ScoreData& d) const
-+      {
-+         int max = 0;
-+         for (int i = 0; i < 256; ++i)
-+            if (d.r[i] > max)
-+               max = d.r[i];
-+         for (int i = 0; i < 8; ++i)
-+            if (d.p[i] > max)
-+               max = d.p[i];
-+         if (d.c > max)
-+            max = d.c;
-+         return max;
-+      }
-+      inline int getLatestRd() const
-+      {
-+         return getLatest(rd);
-+      }
-+      inline int getLatestWr() const
-+      {
-+         return getLatest(wr);
-+      }
-+      inline int getLatest() const
-+      {
-+         return MAX2(getLatestRd(), getLatestWr());
-+      }
-+      void setMax(const RegScores *that)
-+      {
-+         for (int i = 0; i < 256; ++i) {
-+            rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
-+            wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
-+         }
-+         for (int i = 0; i < 8; ++i) {
-+            rd.p[i] = MAX2(rd.p[i], that->rd.p[i]);
-+            wr.p[i] = MAX2(wr.p[i], that->wr.p[i]);
-+         }
-+         rd.c = MAX2(rd.c, that->rd.c);
-+         wr.c = MAX2(wr.c, that->wr.c);
-+      }
-+      void print(int cycle)
-+      {
-+         for (int i = 0; i < 256; ++i) {
-+            if (rd.r[i] > cycle)
-+               INFO("rd $r%i @ %i\n", i, rd.r[i]);
-+            if (wr.r[i] > cycle)
-+               INFO("wr $r%i @ %i\n", i, wr.r[i]);
-+         }
-+         for (int i = 0; i < 8; ++i) {
-+            if (rd.p[i] > cycle)
-+               INFO("rd $p%i @ %i\n", i, rd.p[i]);
-+            if (wr.p[i] > cycle)
-+               INFO("wr $p%i @ %i\n", i, wr.p[i]);
-+         }
-+         if (rd.c > cycle)
-+            INFO("rd $c @ %i\n", rd.c);
-+         if (wr.c > cycle)
-+            INFO("wr $c @ %i\n", wr.c);
-+      }
-+   };
-+
-+   RegScores *score; // for current BB
-+   std::vector<RegScores> scoreBoards;
-+
-+   const TargetGM107 *targ;
-+   bool visit(Function *);
-+   bool visit(BasicBlock *);
-+
-+   void commitInsn(const Instruction *, int);
-+   int calcDelay(const Instruction *, int) const;
-+   void setDelay(Instruction *, int, const Instruction *);
-+   void recordWr(const Value *, int, int);
-+   void checkRd(const Value *, int, int&) const;
-+
-+   inline void emitYield(Instruction *);
-+   inline void emitStall(Instruction *, uint8_t);
-+   inline void emitReuse(Instruction *, uint8_t);
-+   inline void emitWrDepBar(Instruction *, uint8_t);
-+   inline void emitRdDepBar(Instruction *, uint8_t);
-+   inline void emitWtDepBar(Instruction *, uint8_t);
-+
-+   inline int getStall(const Instruction *) const;
-+   inline int getWrDepBar(const Instruction *) const;
-+   inline int getRdDepBar(const Instruction *) const;
-+   inline int getWtDepBar(const Instruction *) const;
-+
-+   void setReuseFlag(Instruction *);
-+
-+   inline void printSchedInfo(int, const Instruction *) const;
-+
-+   struct LiveBarUse {
-+      LiveBarUse(Instruction *insn, Instruction *usei)
-+         : insn(insn), usei(usei) { }
-+      Instruction *insn;
-+      Instruction *usei;
-+   };
-+
-+   struct LiveBarDef {
-+      LiveBarDef(Instruction *insn, Instruction *defi)
-+         : insn(insn), defi(defi) { }
-+      Instruction *insn;
-+      Instruction *defi;
-+   };
-+
-+   bool insertBarriers(BasicBlock *);
-+
-+   bool doesInsnWriteTo(const Instruction *insn, const Value *val) const;
-+   Instruction *findFirstUse(const Instruction *) const;
-+   Instruction *findFirstDef(const Instruction *) const;
-+
-+   bool needRdDepBar(const Instruction *) const;
-+   bool needWrDepBar(const Instruction *) const;
-+};
-+
-+}; // namespace nv50_ir
-+#endif
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
-index 5c6d0570ae2..765375a47df 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
-@@ -33,7 +33,7 @@ const uint8_t Target::operationSrcNr[] =
-    2, 2, 2, 2, 2, 3, 3, 3, // ADD, SUB, MUL, DIV, MOD, MAD, FMA, SAD
-    3, 3,                   // SHLADD, XMAD
-    1, 1, 1,                // ABS, NEG, NOT
--   2, 2, 2, 2, 2,          // AND, OR, XOR, SHL, SHR
-+   2, 2, 2, 3, 2, 2, 3,    // AND, OR, XOR, LOP3_LUT, SHL, SHR, SHF
-    2, 2, 1,                // MAX, MIN, SAT
-    1, 1, 1, 1,             // CEIL, FLOOR, TRUNC, CVT
-    3, 3, 3, 2, 3, 3,       // SET_AND,OR,XOR, SET, SELP, SLCT
-@@ -43,7 +43,7 @@ const uint8_t Target::operationSrcNr[] =
-    0, 0, 0,                // PRERET,CONT,BREAK
-    0, 0, 0, 0, 0, 0,       // BRKPT, JOINAT, JOIN, DISCARD, EXIT, MEMBAR
-    1, 1, 1, 2, 1, 2,       // VFETCH, PFETCH, AFETCH, EXPORT, LINTERP, PINTERP
--   1, 1,                   // EMIT, RESTART
-+   1, 1, 1,                // EMIT, RESTART, FINAL
-    1, 1, 1,                // TEX, TXB, TXL,
-    1, 1, 1, 1, 1, 1, 2,    // TXF, TXQ, TXD, TXG, TXLQ, TEXCSAA, TEXPREP
-    1, 1, 2, 2, 2, 2, 2,    // SULDB, SULDP, SUSTB, SUSTP, SUREDB, SUREDP, SULEA
-@@ -51,13 +51,15 @@ const uint8_t Target::operationSrcNr[] =
-    0,                      // TEXBAR
-    1, 1,                   // DFDX, DFDY
-    1, 2, 1, 2, 0, 0,       // RDSV, WRSV, PIXLD, QUADOP, QUADON, QUADPOP
--   2, 3, 2, 1, 3,          // POPCNT, INSBF, EXTBF, BFIND, PERMT
-+   2, 3, 2, 1, 1, 2, 3,    // POPCNT, INSBF, EXTBF, BFIND, BREV, BMSK, PERMT
-+   2,                      // SGXT
-    2, 2,                   // ATOM, BAR
-    2, 2, 2, 2, 3, 2,       // VADD, VAVG, VMIN, VMAX, VSAD, VSET,
-    2, 2, 2, 1,             // VSHR, VSHL, VSEL, CCTL
-    3,                      // SHFL
-    1,                      // VOTE
-    1,                      // BUFQ
-+   1,                      // WARPSYNC
-    0
- };
- 
-@@ -75,10 +77,10 @@ const OpClass Target::operationClass[] =
-    OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH,
-    OPCLASS_ARITH, OPCLASS_ARITH,
-    OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH,
--   // ABS, NEG; NOT, AND, OR, XOR; SHL, SHR
-+   // ABS, NEG; NOT, AND, OR, XOR, LOP3_LUT; SHL, SHR, SHF
-    OPCLASS_CONVERT, OPCLASS_CONVERT,
--   OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC,
--   OPCLASS_SHIFT, OPCLASS_SHIFT,
-+   OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC,
-+   OPCLASS_SHIFT, OPCLASS_SHIFT, OPCLASS_SHIFT,
-    // MAX, MIN
-    OPCLASS_COMPARE, OPCLASS_COMPARE,
-    // SAT, CEIL, FLOOR, TRUNC; CVT
-@@ -103,8 +105,8 @@ const OpClass Target::operationClass[] =
-    OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_STORE,
-    // LINTERP, PINTERP
-    OPCLASS_SFU, OPCLASS_SFU,
--   // EMIT, RESTART
--   OPCLASS_CONTROL, OPCLASS_CONTROL,
-+   // EMIT, RESTART, FINAL
-+   OPCLASS_CONTROL, OPCLASS_CONTROL, OPCLASS_CONTROL,
-    // TEX, TXB, TXL, TXF; TXQ, TXD, TXG, TXLQ; TEXCSAA, TEXPREP
-    OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE,
-    OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE,
-@@ -119,9 +121,9 @@ const OpClass Target::operationClass[] =
-    // DFDX, DFDY, RDSV, WRSV; PIXLD, QUADOP, QUADON, QUADPOP
-    OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER,
-    OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_CONTROL, OPCLASS_CONTROL,
--   // POPCNT, INSBF, EXTBF, BFIND; PERMT
-+   // POPCNT, INSBF, EXTBF, BFIND, BREV, BMSK; PERMT, SGXT
-+   OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD,
-    OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD,
--   OPCLASS_BITFIELD,
-    // ATOM, BAR
-    OPCLASS_ATOMIC, OPCLASS_CONTROL,
-    // VADD, VAVG, VMIN, VMAX
-@@ -136,10 +138,13 @@ const OpClass Target::operationClass[] =
-    OPCLASS_OTHER,
-    // BUFQ
-    OPCLASS_OTHER,
-+   // WARPSYNC
-+   OPCLASS_OTHER,
-    OPCLASS_PSEUDO // LAST
- };
- 
- 
-+extern Target *getTargetGV100(unsigned int chipset);
- extern Target *getTargetGM107(unsigned int chipset);
- extern Target *getTargetNVC0(unsigned int chipset);
- extern Target *getTargetNV50(unsigned int chipset);
-@@ -149,6 +154,9 @@ Target *Target::create(unsigned int chipset)
-    STATIC_ASSERT(ARRAY_SIZE(operationSrcNr) == OP_LAST + 1);
-    STATIC_ASSERT(ARRAY_SIZE(operationClass) == OP_LAST + 1);
-    switch (chipset & ~0xf) {
-+   case 0x160:
-+   case 0x140:
-+      return getTargetGV100(chipset);
-    case 0x110:
-    case 0x120:
-    case 0x130:
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
-index afeca14d7d1..0f7db116577 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
-@@ -200,7 +200,7 @@ public:
-       uint8_t dstMods;
-       uint16_t srcFiles[3];
-       uint16_t dstFiles;
--      unsigned int minEncSize  : 4;
-+      unsigned int minEncSize  : 5;
-       unsigned int vector      : 1;
-       unsigned int predicate   : 1;
-       unsigned int commutative : 1;
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.cpp
-new file mode 100644
-index 00000000000..fd969e1ece5
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.cpp
-@@ -0,0 +1,594 @@
-+/*
-+ * Copyright 2020 Red Hat Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+#include "codegen/nv50_ir_target_gv100.h"
-+#include "codegen/nv50_ir_lowering_gv100.h"
-+#include "codegen/nv50_ir_emit_gv100.h"
-+
-+namespace nv50_ir {
-+
-+void
-+TargetGV100::initOpInfo()
-+{
-+   unsigned int i, j;
-+
-+   static const operation commutative[] =
-+   {
-+      OP_ADD, OP_MUL, OP_MAD, OP_FMA, OP_MAX, OP_MIN,
-+      OP_SET_AND, OP_SET_OR, OP_SET_XOR, OP_SET, OP_SELP, OP_SLCT
-+   };
-+
-+   static const operation noDest[] =
-+   {
-+      OP_EXIT
-+   };
-+
-+   static const operation noPred[] =
-+   {
-+   };
-+
-+   for (i = 0; i < DATA_FILE_COUNT; ++i)
-+      nativeFileMap[i] = (DataFile)i;
-+   nativeFileMap[FILE_ADDRESS] = FILE_GPR;
-+   nativeFileMap[FILE_FLAGS] = FILE_PREDICATE;
-+
-+   for (i = 0; i < OP_LAST; ++i) {
-+      opInfo[i].variants = NULL;
-+      opInfo[i].op = (operation)i;
-+      opInfo[i].srcTypes = 1 << (int)TYPE_F32;
-+      opInfo[i].dstTypes = 1 << (int)TYPE_F32;
-+      opInfo[i].immdBits = 0;
-+      opInfo[i].srcNr = operationSrcNr[i];
-+
-+      for (j = 0; j < opInfo[i].srcNr; ++j) {
-+         opInfo[i].srcMods[j] = 0;
-+         opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR;
-+      }
-+      opInfo[i].dstMods = 0;
-+      opInfo[i].dstFiles = 1 << (int)FILE_GPR;
-+
-+      opInfo[i].hasDest = 1;
-+      opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
-+      opInfo[i].commutative = false; /* set below */
-+      opInfo[i].pseudo = (i < OP_MOV);
-+      opInfo[i].predicate = !opInfo[i].pseudo;
-+      opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
-+      opInfo[i].minEncSize = 16;
-+   }
-+   for (i = 0; i < ARRAY_SIZE(commutative); ++i)
-+      opInfo[commutative[i]].commutative = true;
-+   for (i = 0; i < ARRAY_SIZE(noDest); ++i)
-+      opInfo[noDest[i]].hasDest = 0;
-+   for (i = 0; i < ARRAY_SIZE(noPred); ++i)
-+      opInfo[noPred[i]].predicate = 0;
-+}
-+
-+struct opInfo {
-+   struct {
-+      uint8_t files;
-+      uint8_t mods;
-+   } src[3];
-+};
-+
-+#define SRC_NONE 0
-+#define SRC_R    (1 << FILE_GPR)
-+#define SRC_I    (1 << FILE_MEMORY_CONST)
-+#define SRC_C    (1 << FILE_IMMEDIATE)
-+#define SRC_RC   (SRC_R |         SRC_C)
-+#define SRC_RI   (SRC_R | SRC_I        )
-+#define SRC_RIC  (SRC_R | SRC_I | SRC_C)
-+
-+#define MOD_NONE 0
-+#define MOD_NEG  NV50_IR_MOD_NEG
-+#define MOD_ABS  NV50_IR_MOD_ABS
-+#define MOD_NOT  NV50_IR_MOD_NOT
-+#define MOD_NA   (MOD_NEG | MOD_ABS)
-+
-+#define OPINFO(O,SA,MA,SB,MB,SC,MC)                                            \
-+static struct opInfo                                                           \
-+opInfo_##O = {                                                                 \
-+   .src = { { SRC_##SA, MOD_##MA },                                            \
-+            { SRC_##SB, MOD_##MB },                                            \
-+            { SRC_##SC, MOD_##MC }},                                           \
-+};
-+
-+
-+/* Handled by GV100LegalizeSSA. */
-+OPINFO(FABS     , RIC , NA  , NONE, NONE, NONE, NONE);
-+OPINFO(FCMP     , R   , NONE, RIC , NONE, RIC , NONE); //XXX: use FSEL for mods
-+OPINFO(FNEG     , RIC , NA  , NONE, NONE, NONE, NONE);
-+OPINFO(FSET     , R   , NA  , RIC , NA  , NONE, NONE);
-+OPINFO(ICMP     , R   , NONE, RIC , NONE, RIC , NONE);
-+OPINFO(IMUL     , R   , NONE, RIC , NONE, NONE, NONE);
-+OPINFO(INEG     , RIC , NEG , NONE, NONE, NONE, NONE);
-+OPINFO(ISET     , R   , NONE, RIC , NONE, NONE, NONE);
-+OPINFO(LOP2     , R   , NOT , RIC , NOT , NONE, NONE);
-+OPINFO(NOT      , RIC , NONE, NONE, NONE, NONE, NONE);
-+OPINFO(SAT      , RIC , NA  , NONE, NONE, NONE, NONE);
-+OPINFO(SHL      , RIC , NONE, RIC , NONE, NONE, NONE);
-+OPINFO(SHR      , RIC , NONE, RIC , NONE, NONE, NONE);
-+OPINFO(SUB      , R   , NONE, RIC , NEG , NONE, NONE);
-+OPINFO(IMNMX    , R   , NONE, RIC , NONE, NONE, NONE);
-+
-+/* Handled by CodeEmitterGV100. */
-+OPINFO(AL2P     , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(ALD      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(AST      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(ATOM     , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(ATOMS    , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(BAR      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(BRA      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(BMSK     , R   , NONE, RIC , NONE, NONE, NONE);
-+OPINFO(BREV     , RIC , NONE, NONE, NONE, NONE, NONE);
-+OPINFO(CCTL     , NONE, NONE, NONE, NONE, NONE, NONE);
-+//OPINFO(CS2R     , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(DADD     , R   , NA  , RIC , NA  , NONE, NONE);
-+OPINFO(DFMA     , R   , NA  , RIC , NA  , RIC , NA  );
-+OPINFO(DMUL     , R   , NA  , RIC , NA  , NONE, NONE);
-+OPINFO(DSETP    , R   , NA  , RIC , NA  , NONE, NONE);
-+OPINFO(EXIT     , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(F2F      , RIC , NA  , NONE, NONE, NONE, NONE);
-+OPINFO(F2I      , RIC , NA  , NONE, NONE, NONE, NONE);
-+OPINFO(FADD     , R   , NA  , RIC , NA  , NONE, NONE);
-+OPINFO(FFMA     , R   , NA  , RIC , NA  , RIC , NA  );
-+OPINFO(FLO      , RIC , NOT , NONE, NONE, NONE, NONE);
-+OPINFO(FMNMX    , R   , NA  , RIC , NA  , NONE, NONE);
-+OPINFO(FMUL     , R   , NA  , RIC , NA  , NONE, NONE);
-+OPINFO(FRND     , RIC , NA  , NONE, NONE, NONE, NONE);
-+OPINFO(FSET_BF  , R   , NA  , RIC , NA  , NONE, NONE);
-+OPINFO(FSETP    , R   , NA  , RIC , NA  , NONE, NONE);
-+OPINFO(FSWZADD  , R   , NONE, R   , NONE, NONE, NONE);
-+OPINFO(I2F      , RIC , NONE, NONE, NONE, NONE, NONE);
-+OPINFO(IABS     , RIC , NONE, NONE, NONE, NONE, NONE);
-+OPINFO(IADD3    , R   , NEG , RIC , NEG , R   , NEG );
-+OPINFO(IMAD     , R   , NONE, RIC , NONE, RIC , NEG );
-+OPINFO(IMAD_WIDE, R   , NONE, RIC , NONE, RC  , NEG );
-+OPINFO(IPA      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(ISBERD   , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(ISETP    , R   , NONE, RIC , NONE, NONE, NONE);
-+OPINFO(KILL     , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(LD       , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(LDC      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(LDL      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(LDS      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(LEA      , R   , NEG , I   , NONE, RIC , NEG );
-+OPINFO(LOP3_LUT , R   , NONE, RIC , NONE, R   , NONE);
-+OPINFO(MEMBAR   , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(MOV      , RIC , NONE, NONE, NONE, NONE, NONE);
-+OPINFO(MUFU     , RIC , NA  , NONE, NONE, NONE, NONE);
-+OPINFO(NOP      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(OUT      , R   , NONE, RI  , NONE, NONE, NONE);
-+OPINFO(PIXLD    , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(PLOP3_LUT, NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(POPC     , RIC , NOT , NONE, NONE, NONE, NONE);
-+OPINFO(PRMT     , R   , NONE, RIC , NONE, RIC , NONE);
-+OPINFO(RED      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(SGXT     , R   , NONE, RIC , NONE, NONE, NONE);
-+OPINFO(S2R      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(SEL      , R   , NONE, RIC , NONE, NONE, NONE);
-+OPINFO(SHF      , R   , NONE, RIC , NONE, RIC , NONE);
-+OPINFO(SHFL     , R   , NONE, R   , NONE, R   , NONE);
-+OPINFO(ST       , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(STL      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(STS      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(SUATOM   , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(SULD     , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(SUST     , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(TEX      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(TLD      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(TLD4     , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(TMML     , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(TXD      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(TXQ      , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(VOTE     , NONE, NONE, NONE, NONE, NONE, NONE);
-+OPINFO(WARPSYNC , R   , NONE, NONE, NONE, NONE, NONE);
-+
-+static const struct opInfo *
-+getOpInfo(const Instruction *i)
-+{
-+   switch (i->op) {
-+   case OP_ABS:
-+      if (isFloatType(i->dType))
-+         return &opInfo_FABS;
-+      return &opInfo_IABS;
-+   case OP_ADD:
-+      if (isFloatType(i->dType)) {
-+         if (i->dType == TYPE_F32)
-+            return &opInfo_FADD;
-+         else
-+            return &opInfo_DADD;
-+      } else {
-+         return &opInfo_IADD3;
-+      }
-+      break;
-+   case OP_AFETCH: return &opInfo_AL2P;
-+   case OP_AND:
-+   case OP_OR:
-+   case OP_XOR:
-+      if (i->def(0).getFile() == FILE_PREDICATE)
-+         return &opInfo_PLOP3_LUT;
-+      return &opInfo_LOP2;
-+   case OP_ATOM:
-+      if (i->src(0).getFile() == FILE_MEMORY_SHARED)
-+         return &opInfo_ATOMS;
-+      else
-+         if (!i->defExists(0) && i->subOp < NV50_IR_SUBOP_ATOM_CAS)
-+            return &opInfo_RED;
-+         else
-+            return &opInfo_ATOM;
-+      break;
-+   case OP_BAR: return &opInfo_BAR;
-+   case OP_BFIND: return &opInfo_FLO;
-+   case OP_BMSK: return &opInfo_BMSK;
-+   case OP_BREV: return &opInfo_BREV;
-+   case OP_BRA:
-+   case OP_JOIN: return &opInfo_BRA; //XXX
-+   case OP_CCTL: return &opInfo_CCTL;
-+   case OP_CEIL:
-+   case OP_CVT:
-+   case OP_FLOOR:
-+   case OP_TRUNC:
-+      if (i->op == OP_CVT && (i->def(0).getFile() == FILE_PREDICATE ||
-+                                 i->src(0).getFile() == FILE_PREDICATE)) {
-+         return &opInfo_MOV;
-+      } else if (isFloatType(i->dType)) {
-+         if (isFloatType(i->sType)) {
-+            if (i->sType == i->dType)
-+               return &opInfo_FRND;
-+            else
-+               return &opInfo_F2F;
-+         } else {
-+            return &opInfo_I2F;
-+         }
-+      } else {
-+         if (isFloatType(i->sType))
-+            return &opInfo_F2I;
-+      }
-+      break;
-+   case OP_COS:
-+   case OP_EX2:
-+   case OP_LG2:
-+   case OP_RCP:
-+   case OP_RSQ:
-+   case OP_SIN:
-+   case OP_SQRT: return &opInfo_MUFU;
-+   case OP_DISCARD: return &opInfo_KILL;
-+   case OP_EMIT:
-+   case OP_FINAL:
-+   case OP_RESTART: return &opInfo_OUT;
-+   case OP_EXIT: return &opInfo_EXIT;
-+   case OP_EXPORT: return &opInfo_AST;
-+   case OP_FMA:
-+   case OP_MAD:
-+      if (isFloatType(i->dType)) {
-+         if (i->dType == TYPE_F32)
-+            return &opInfo_FFMA;
-+         else
-+            return &opInfo_DFMA;
-+      } else {
-+         if (typeSizeof(i->dType) != 8)
-+            return &opInfo_IMAD;
-+         else
-+            return &opInfo_IMAD_WIDE;
-+      }
-+      break;
-+   case OP_JOINAT: return &opInfo_NOP; //XXX
-+   case OP_LINTERP: return &opInfo_IPA;
-+   case OP_LOAD:
-+      switch (i->src(0).getFile()) {
-+      case FILE_MEMORY_CONST : return &opInfo_LDC;
-+      case FILE_MEMORY_LOCAL : return &opInfo_LDL;
-+      case FILE_MEMORY_SHARED: return &opInfo_LDS;
-+      case FILE_MEMORY_GLOBAL: return &opInfo_LD;
-+      default:
-+         break;
-+      }
-+      break;
-+   case OP_LOP3_LUT: return &opInfo_LOP3_LUT;
-+   case OP_MAX:
-+   case OP_MIN:
-+      if (isFloatType(i->dType)) {
-+         if (i->dType == TYPE_F32)
-+            return &opInfo_FMNMX;
-+      } else {
-+         return &opInfo_IMNMX;
-+      }
-+      break;
-+   case OP_MEMBAR: return &opInfo_MEMBAR;
-+   case OP_MOV: return &opInfo_MOV;
-+   case OP_MUL:
-+      if (isFloatType(i->dType)) {
-+         if (i->dType == TYPE_F32)
-+            return &opInfo_FMUL;
-+         else
-+            return &opInfo_DMUL;
-+      }
-+      return &opInfo_IMUL;
-+   case OP_NEG:
-+      if (isFloatType(i->dType))
-+         return &opInfo_FNEG;
-+      return &opInfo_INEG;
-+   case OP_NOT: return &opInfo_NOT;
-+   case OP_PERMT: return &opInfo_PRMT;
-+   case OP_PFETCH: return &opInfo_ISBERD;
-+   case OP_PIXLD: return &opInfo_PIXLD;
-+   case OP_POPCNT: return &opInfo_POPC;
-+   case OP_QUADOP: return &opInfo_FSWZADD;
-+   case OP_RDSV:
-+#if 0
-+      if (targ->isCS2RSV(i->getSrc(0)->reg.data.sv.sv))
-+         return &opInfo_CS2R;
-+#endif
-+      return &opInfo_S2R;
-+   case OP_SAT: return &opInfo_SAT;
-+   case OP_SELP: return &opInfo_SEL;
-+   case OP_SET:
-+   case OP_SET_AND:
-+   case OP_SET_OR:
-+   case OP_SET_XOR:
-+      if (i->def(0).getFile() != FILE_PREDICATE) {
-+         if (isFloatType(i->dType)) {
-+            if (i->dType == TYPE_F32)
-+               return &opInfo_FSET_BF;
-+         } else {
-+            if (isFloatType(i->sType))
-+                  return &opInfo_FSET;
-+            return &opInfo_ISET;
-+         }
-+      } else {
-+         if (isFloatType(i->sType))
-+            if (i->sType == TYPE_F64)
-+               return &opInfo_DSETP;
-+            else
-+               return &opInfo_FSETP;
-+         else
-+            return &opInfo_ISETP;
-+      }
-+      break;
-+   case OP_SGXT: return &opInfo_SGXT;
-+   case OP_SHF: return &opInfo_SHF;
-+   case OP_SHFL: return &opInfo_SHFL;
-+   case OP_SHL: return &opInfo_SHL;
-+   case OP_SHLADD: return &opInfo_LEA;
-+   case OP_SHR: return &opInfo_SHR;
-+   case OP_SLCT:
-+      if (isFloatType(i->sType))
-+         return &opInfo_FCMP;
-+      return &opInfo_ICMP;
-+   case OP_STORE:
-+      switch (i->src(0).getFile()) {
-+      case FILE_MEMORY_LOCAL : return &opInfo_STL;
-+      case FILE_MEMORY_SHARED: return &opInfo_STS;
-+      case FILE_MEMORY_GLOBAL: return &opInfo_ST;
-+      default:
-+         break;
-+      }
-+      break;
-+   case OP_SUB: return &opInfo_SUB;
-+   case OP_SULDB:
-+   case OP_SULDP: return &opInfo_SULD;
-+   case OP_SUREDB:
-+   case OP_SUREDP: return &opInfo_SUATOM;
-+   case OP_SUSTB:
-+   case OP_SUSTP: return &opInfo_SUST;
-+   case OP_TEX:
-+   case OP_TXB:
-+   case OP_TXL: return &opInfo_TEX;
-+   case OP_TXD: return &opInfo_TXD;
-+   case OP_TXF: return &opInfo_TLD;
-+   case OP_TXG: return &opInfo_TLD4;
-+   case OP_TXLQ: return &opInfo_TMML;
-+   case OP_TXQ: return &opInfo_TXQ;
-+   case OP_VFETCH: return &opInfo_ALD;
-+   case OP_VOTE: return &opInfo_VOTE;
-+   case OP_WARPSYNC: return &opInfo_WARPSYNC;
-+   default:
-+      break;
-+   }
-+   return NULL;
-+}
-+
-+bool
-+TargetGV100::isSatSupported(const Instruction *i) const
-+{
-+   switch (i->dType) {
-+   case TYPE_F32:
-+      switch (i->op) {
-+      case OP_ADD:
-+      case OP_FMA:
-+      case OP_MAD:
-+      case OP_MUL: return true;
-+      default:
-+         break;
-+      }
-+      break;
-+   default:
-+      break;
-+   }
-+   return false;
-+}
-+
-+bool
-+TargetGV100::isModSupported(const Instruction *i, int s, Modifier mod) const
-+{
-+   const struct opInfo *info = nv50_ir::getOpInfo(i);
-+   uint8_t mods = 0;
-+   if (info && s < (int)ARRAY_SIZE(info->src))
-+      mods = info->src[s].mods;
-+   return (mod & Modifier(mods)) == mod;
-+}
-+
-+bool
-+TargetGV100::isOpSupported(operation op, DataType ty) const
-+{
-+   if (op == OP_MAD || op == OP_FMA)
-+      return true;
-+   if (ty == TYPE_F32) {
-+      if (op == OP_MAX)
-+         return true;
-+   }
-+   if (op == OP_RSQ)
-+      return true;
-+   if (op == OP_SET ||
-+       op == OP_SET_AND ||
-+       op == OP_SET_OR ||
-+       op == OP_SET_XOR)
-+      return true;
-+   if (op == OP_SHLADD)
-+      return true;
-+   return false;
-+}
-+
-+bool
-+TargetGV100::isBarrierRequired(const Instruction *i) const
-+{
-+   switch (i->op) {
-+   case OP_BREV:
-+      return true;
-+   default:
-+      break;
-+   }
-+
-+   return TargetGM107::isBarrierRequired(i);
-+}
-+
-+bool
-+TargetGV100::insnCanLoad(const Instruction *i, int s,
-+                         const Instruction *ld) const
-+{
-+   const struct opInfo *info = nv50_ir::getOpInfo(i);
-+   uint16_t files = 0;
-+
-+   if (ld->src(0).getFile() == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0)
-+      return (!i->isPseudo() &&
-+              !i->asTex() &&
-+              i->op != OP_EXPORT && i->op != OP_STORE);
-+
-+   if (ld->src(0).isIndirect(0))
-+      return false;
-+
-+   if (info && s < (int)ARRAY_SIZE(info->src)) {
-+      files = info->src[s].files;
-+      if ((s == 1 && i->srcExists(2) && i->src(2).getFile() != FILE_GPR) ||
-+          (s == 2 && i->srcExists(1) && i->src(1).getFile() != FILE_GPR)) {
-+         files &= ~(1 << FILE_MEMORY_CONST);
-+         files &= ~(1 << FILE_IMMEDIATE);
-+      } else
-+      if ((i->op == OP_SHL || i->op == OP_SHR) &&
-+          ((s == 0 && i->srcExists(1) && i->src(1).getFile() != FILE_GPR) ||
-+           (s == 1 && i->srcExists(0) && i->src(0).getFile() != FILE_GPR))) {
-+         files &= ~(1 << FILE_MEMORY_CONST);
-+         files &= ~(1 << FILE_IMMEDIATE);
-+      }
-+   }
-+
-+   if (ld->src(0).getFile() == FILE_IMMEDIATE) {
-+      if (i->sType == TYPE_F64) {
-+         if (ld->getSrc(0)->asImm()->reg.data.u64 & 0x00000000ffffffff)
-+            return false;
-+      }
-+   }
-+
-+   return (files & (1 << ld->src(0).getFile()));
-+}
-+
-+void
-+TargetGV100::getBuiltinCode(const uint32_t **code, uint32_t *size) const
-+{
-+   //XXX: find out why gv100 (tu1xx is fine) hangs without this
-+   static uint32_t builtin[] = {
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+      0x0000794d, 0x00000000, 0x03800000, 0x03ffde00,
-+   };
-+   *code = builtin;
-+   *size = sizeof(builtin);
-+}
-+
-+uint32_t
-+TargetGV100::getBuiltinOffset(int builtin) const
-+{
-+   return 0;
-+}
-+
-+bool
-+TargetGV100::runLegalizePass(Program *prog, CGStage stage) const
-+{
-+   if (stage == CG_STAGE_PRE_SSA) {
-+      GM107LoweringPass pass1(prog);
-+      GV100LoweringPass pass2(prog);
-+      pass1.run(prog, false, true);
-+      pass2.run(prog, false, true);
-+      return true;
-+   } else
-+   if (stage == CG_STAGE_SSA) {
-+      GV100LegalizeSSA pass(prog);
-+      return pass.run(prog, false, true);
-+   } else
-+   if (stage == CG_STAGE_POST_RA) {
-+      NVC0LegalizePostRA pass(prog);
-+      return pass.run(prog, false, true);
-+   }
-+   return false;
-+}
-+
-+CodeEmitter *
-+TargetGV100::getCodeEmitter(Program::Type type)
-+{
-+   return new CodeEmitterGV100(this);
-+}
-+
-+TargetGV100::TargetGV100(unsigned int chipset)
-+   : TargetGM107(chipset)
-+{
-+   initOpInfo();
-+};
-+
-+Target *getTargetGV100(unsigned int chipset)
-+{
-+   return new TargetGV100(chipset);
-+}
-+
-+};
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.h
-new file mode 100644
-index 00000000000..897e6a22d30
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gv100.h
-@@ -0,0 +1,52 @@
-+/*
-+ * Copyright 2020 Red Hat Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+#ifndef __NV50_IR_TARGET_GV100_H__
-+#define __NV50_IR_TARGET_GV100_H__
-+#include "codegen/nv50_ir_target_gm107.h"
-+
-+namespace nv50_ir {
-+
-+class TargetGV100 : public TargetGM107 {
-+public:
-+   TargetGV100(unsigned int chipset);
-+
-+   virtual CodeEmitter *getCodeEmitter(Program::Type);
-+
-+   virtual bool runLegalizePass(Program *, CGStage stage) const;
-+
-+   virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const;
-+   virtual uint32_t getBuiltinOffset(int builtin) const;
-+
-+   virtual bool insnCanLoad(const Instruction *, int, const Instruction *) const;
-+   virtual bool isOpSupported(operation, DataType) const;
-+   virtual bool isModSupported(const Instruction *, int s, Modifier) const;
-+   virtual bool isSatSupported(const Instruction *) const;
-+
-+   virtual bool isBarrierRequired(const Instruction *) const;
-+
-+private:
-+   void initOpInfo();
-+   void initProps(const struct opProperties *, int);
-+};
-+
-+};
-+#endif
-diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
-index 60134b445db..ed5b343ccba 100644
---- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
-+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
-@@ -30,7 +30,7 @@ Target *getTargetNVC0(unsigned int chipset)
- }
- 
- TargetNVC0::TargetNVC0(unsigned int card) :
--   Target(card < 0x110, false, card >= 0xe4)
-+   Target(card < 0x110, false, card >= 0xe4 && card < 0x140)
- {
-    chipset = card;
-    initOpInfo();
-diff --git a/src/gallium/drivers/nouveau/meson.build b/src/gallium/drivers/nouveau/meson.build
-index 7a1d18a6394..68cfebdf20c 100644
---- a/src/gallium/drivers/nouveau/meson.build
-+++ b/src/gallium/drivers/nouveau/meson.build
-@@ -150,17 +150,31 @@ files_libnouveau = files(
-   'codegen/nv50_ir_util.cpp',
-   'codegen/nv50_ir_util.h',
-   'codegen/unordered_set.h',
-+  'codegen/nv50_ir_emit_gv100.cpp',
-+  'codegen/nv50_ir_emit_gv100.h',
-   'codegen/nv50_ir_emit_gk110.cpp',
-   'codegen/nv50_ir_emit_gm107.cpp',
-   'codegen/nv50_ir_emit_nvc0.cpp',
-+  'codegen/nv50_ir_lowering_gv100.cpp',
-+  'codegen/nv50_ir_lowering_gv100.h',
-   'codegen/nv50_ir_lowering_gm107.cpp',
-   'codegen/nv50_ir_lowering_gm107.h',
-   'codegen/nv50_ir_lowering_nvc0.cpp',
-   'codegen/nv50_ir_lowering_nvc0.h',
-+  'codegen/nv50_ir_target_gv100.cpp',
-+  'codegen/nv50_ir_target_gv100.h',
-   'codegen/nv50_ir_target_gm107.cpp',
-   'codegen/nv50_ir_target_gm107.h',
-   'codegen/nv50_ir_target_nvc0.cpp',
-   'codegen/nv50_ir_target_nvc0.h',
-+  'nvc0/cla0c0qmd.h',
-+  'nvc0/clc0c0qmd.h',
-+  'nvc0/clc3c0qmd.h',
-+  'nvc0/drf.h',
-+  'nvc0/qmd.h',
-+  'nvc0/qmda0c0.c',
-+  'nvc0/qmdc0c0.c',
-+  'nvc0/qmdc3c0.c',
-   'nvc0/gm107_texture.xml.h',
-   'nvc0/nvc0_3d.xml.h',
-   'nvc0/nvc0_compute.c',
-diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
-index de9cce3812a..8606ba43c1a 100644
---- a/src/gallium/drivers/nouveau/nouveau_screen.c
-+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
-@@ -188,7 +188,11 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
-    if (nv_dbg)
-       nouveau_mesa_debug = atoi(nv_dbg);
- 
--   screen->prefer_nir = debug_get_bool_option("NV50_PROG_USE_NIR", false);
-+   if (dev->chipset < 0x140)
-+      screen->prefer_nir = debug_get_bool_option("NV50_PROG_USE_NIR", false);
-+   else
-+      screen->prefer_nir = true;
-+
-    screen->force_enable_cl = debug_get_bool_option("NOUVEAU_ENABLE_CL", false);
-    if (screen->force_enable_cl)
-       glsl_type_singleton_init_or_ref();
-diff --git a/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h b/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h
-index 899d73d7398..31e7cf82233 100644
---- a/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h
-+++ b/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h
-@@ -218,9 +218,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- #define NV50_2D_PATTERN_SELECT_BITMAP_1X64			0x00000002
- #define NV50_2D_PATTERN_SELECT_COLOR				0x00000003
- 
--#define NVC0_2D_UNK02B8(i0)				       (0x000002b8 + 0x4*(i0))
--#define NVC0_2D_UNK02B8__ESIZE					0x00000004
--#define NVC0_2D_UNK02B8__LEN					0x00000009
-+#define NVC0_2D_SET_DST_COLOR_RENDER_TO_ZETA_SURFACE  0x000002b8
- 
- #define NVC0_2D_UNK2DC						0x000002dc
- 
-diff --git a/src/gallium/drivers/nouveau/nv_object.xml.h b/src/gallium/drivers/nouveau/nv_object.xml.h
-index 664bfae9f64..fac195d4846 100644
---- a/src/gallium/drivers/nouveau/nv_object.xml.h
-+++ b/src/gallium/drivers/nouveau/nv_object.xml.h
-@@ -195,6 +195,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- #define GM200_3D_CLASS						0x0000b197
- #define GP100_3D_CLASS						0x0000c097
- #define GP102_3D_CLASS						0x0000c197
-+#define GV100_3D_CLASS						0x0000c397
-+#define TU102_3D_CLASS						0x0000c597
- #define NV50_2D_CLASS						0x0000502d
- #define NVC0_2D_CLASS						0x0000902d
- #define NV50_COMPUTE_CLASS					0x000050c0
-@@ -207,6 +209,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- #define GM200_COMPUTE_CLASS					0x0000b1c0
- #define GP100_COMPUTE_CLASS					0x0000c0c0
- #define GP104_COMPUTE_CLASS					0x0000c1c0
-+#define GV100_COMPUTE_CLASS					0x0000c3c0
-+#define TU102_COMPUTE_CLASS					0x0000c5c0
- #define NV84_CRYPT_CLASS					0x000074c1
- #define BLOB_NVC0_PCOPY1_CLASS					0x000090b8
- #define BLOB_NVC0_PCOPY0_CLASS					0x000090b5
-diff --git a/src/gallium/drivers/nouveau/nvc0/cla0c0qmd.h b/src/gallium/drivers/nouveau/nvc0/cla0c0qmd.h
-new file mode 100644
-index 00000000000..c0829f1cdc2
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/nvc0/cla0c0qmd.h
-@@ -0,0 +1,660 @@
-+/*******************************************************************************
-+    Copyright (c) 2016 NVIDIA Corporation
-+
-+    Permission is hereby granted, free of charge, to any person obtaining a copy
-+    of this software and associated documentation files (the "Software"), to
-+    deal in the Software without restriction, including without limitation the
-+    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-+    sell copies of the Software, and to permit persons to whom the Software is
-+    furnished to do so, subject to the following conditions:
-+
-+        The above copyright notice and this permission notice shall be
-+        included in all copies or substantial portions of the Software.
-+
-+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-+    DEALINGS IN THE SOFTWARE.
-+
-+*******************************************************************************/
-+
-+/* AUTO GENERATED FILE -- DO NOT EDIT */
-+
-+#ifndef __CLA0C0QMD_H__
-+#define __CLA0C0QMD_H__
-+
-+/*
-+** Queue Meta Data, Version 00_06
-+ */
-+
-+// The below C preprocessor definitions describe "multi-word" structures, where
-+// fields may have bit numbers beyond 32.  For example, MW(127:96) means
-+// the field is in bits 0-31 of word number 3 of the structure.  The "MW(X:Y)"
-+// syntax is to distinguish from similar "X:Y" single-word definitions: the
-+// macros historically used for single-word definitions would fail with
-+// multi-word definitions.
-+//
-+// See nvmisc.h:DRF_VAL_MW() in the source code of the kernel
-+// interface layer of nvidia.ko for an example of how to manipulate
-+// these MW(X:Y) definitions.
-+
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_A                         MW(30:0)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_B                         MW(31:31)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_C                         MW(62:32)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_D                         MW(63:63)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_E                         MW(94:64)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_F                         MW(95:95)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_G                         MW(126:96)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_H                         MW(127:127)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_A_A                          MW(159:128)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_I                         MW(191:160)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_J                         MW(196:192)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_A                            MW(199:197)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_K                         MW(200:200)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_K_FALSE                   0x00000000
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_K_TRUE                    0x00000001
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_L                         MW(201:201)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_L_FALSE                   0x00000000
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_L_TRUE                    0x00000001
-+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE0                 MW(202:202)
-+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE0_FALSE           0x00000000
-+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE0_TRUE            0x00000001
-+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE1                 MW(203:203)
-+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE1_FALSE           0x00000000
-+#define NVA0C0_QMDV00_06_SEMAPHORE_RELEASE_ENABLE1_TRUE            0x00000001
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_B                            MW(207:204)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_M                         MW(222:208)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_N                         MW(223:223)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_N_FALSE                   0x00000000
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_N_TRUE                    0x00000001
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_O                         MW(248:224)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_C                            MW(249:249)
-+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_HEADER_CACHE           MW(250:250)
-+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE     0x00000000
-+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE      0x00000001
-+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_SAMPLER_CACHE          MW(251:251)
-+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE    0x00000000
-+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE     0x00000001
-+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_DATA_CACHE             MW(252:252)
-+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_DATA_CACHE_FALSE       0x00000000
-+#define NVA0C0_QMDV00_06_INVALIDATE_TEXTURE_DATA_CACHE_TRUE        0x00000001
-+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_DATA_CACHE              MW(253:253)
-+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_DATA_CACHE_FALSE        0x00000000
-+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_DATA_CACHE_TRUE         0x00000001
-+#define NVA0C0_QMDV00_06_INVALIDATE_INSTRUCTION_CACHE              MW(254:254)
-+#define NVA0C0_QMDV00_06_INVALIDATE_INSTRUCTION_CACHE_FALSE        0x00000000
-+#define NVA0C0_QMDV00_06_INVALIDATE_INSTRUCTION_CACHE_TRUE         0x00000001
-+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_CONSTANT_CACHE          MW(255:255)
-+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE    0x00000000
-+#define NVA0C0_QMDV00_06_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE     0x00000001
-+#define NVA0C0_QMDV00_06_PROGRAM_OFFSET                            MW(287:256)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_P                         MW(319:288)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_Q                         MW(327:320)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_D                            MW(335:328)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_R                         MW(351:336)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_S                         MW(357:352)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_E                            MW(365:358)
-+#define NVA0C0_QMDV00_06_RELEASE_MEMBAR_TYPE                       MW(366:366)
-+#define NVA0C0_QMDV00_06_RELEASE_MEMBAR_TYPE_FE_NONE               0x00000000
-+#define NVA0C0_QMDV00_06_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR          0x00000001
-+#define NVA0C0_QMDV00_06_CWD_MEMBAR_TYPE                           MW(369:368)
-+#define NVA0C0_QMDV00_06_CWD_MEMBAR_TYPE_L1_NONE                   0x00000000
-+#define NVA0C0_QMDV00_06_CWD_MEMBAR_TYPE_L1_SYSMEMBAR              0x00000001
-+#define NVA0C0_QMDV00_06_CWD_MEMBAR_TYPE_L1_MEMBAR                 0x00000003
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_T                         MW(370:370)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_T_FALSE                   0x00000000
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_T_TRUE                    0x00000001
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_U                         MW(371:371)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_U_FALSE                   0x00000000
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_U_TRUE                    0x00000001
-+#define NVA0C0_QMDV00_06_THROTTLED                                 MW(372:372)
-+#define NVA0C0_QMDV00_06_THROTTLED_FALSE                           0x00000000
-+#define NVA0C0_QMDV00_06_THROTTLED_TRUE                            0x00000001
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_E2_A                         MW(376:376)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_E2_B                         MW(377:377)
-+#define NVA0C0_QMDV00_06_API_VISIBLE_CALL_LIMIT                    MW(378:378)
-+#define NVA0C0_QMDV00_06_API_VISIBLE_CALL_LIMIT__32                0x00000000
-+#define NVA0C0_QMDV00_06_API_VISIBLE_CALL_LIMIT_NO_CHECK           0x00000001
-+#define NVA0C0_QMDV00_06_SHARED_MEMORY_BANK_MAPPING                MW(379:379)
-+#define NVA0C0_QMDV00_06_SHARED_MEMORY_BANK_MAPPING_FOUR_BYTES_PER_BANK 0x00000000
-+#define NVA0C0_QMDV00_06_SHARED_MEMORY_BANK_MAPPING_EIGHT_BYTES_PER_BANK 0x00000001
-+#define NVA0C0_QMDV00_06_SAMPLER_INDEX                             MW(382:382)
-+#define NVA0C0_QMDV00_06_SAMPLER_INDEX_INDEPENDENTLY               0x00000000
-+#define NVA0C0_QMDV00_06_SAMPLER_INDEX_VIA_HEADER_INDEX            0x00000001
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_E3_A                         MW(383:383)
-+#define NVA0C0_QMDV00_06_CTA_RASTER_WIDTH                          MW(415:384)
-+#define NVA0C0_QMDV00_06_CTA_RASTER_HEIGHT                         MW(431:416)
-+#define NVA0C0_QMDV00_06_CTA_RASTER_DEPTH                          MW(447:432)
-+#define NVA0C0_QMDV00_06_CTA_RASTER_WIDTH_RESUME                   MW(479:448)
-+#define NVA0C0_QMDV00_06_CTA_RASTER_HEIGHT_RESUME                  MW(495:480)
-+#define NVA0C0_QMDV00_06_CTA_RASTER_DEPTH_RESUME                   MW(511:496)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_V                         MW(535:512)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_F                            MW(542:536)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_W                         MW(543:543)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_W_FALSE                   0x00000000
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_W_TRUE                    0x00000001
-+#define NVA0C0_QMDV00_06_SHARED_MEMORY_SIZE                        MW(561:544)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_G                            MW(575:562)
-+#define NVA0C0_QMDV00_06_QMD_VERSION                               MW(579:576)
-+#define NVA0C0_QMDV00_06_QMD_MAJOR_VERSION                         MW(583:580)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_H                            MW(591:584)
-+#define NVA0C0_QMDV00_06_CTA_THREAD_DIMENSION0                     MW(607:592)
-+#define NVA0C0_QMDV00_06_CTA_THREAD_DIMENSION1                     MW(623:608)
-+#define NVA0C0_QMDV00_06_CTA_THREAD_DIMENSION2                     MW(639:624)
-+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_VALID(i)                  MW((640+(i)*1):(640+(i)*1))
-+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_VALID_FALSE               0x00000000
-+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_VALID_TRUE                0x00000001
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_I                            MW(668:648)
-+#define NVA0C0_QMDV00_06_L1_CONFIGURATION                          MW(671:669)
-+#define NVA0C0_QMDV00_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB 0x00000001
-+#define NVA0C0_QMDV00_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB 0x00000002
-+#define NVA0C0_QMDV00_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB 0x00000003
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_X                         MW(703:672)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_V1_Y                         MW(735:704)
-+#define NVA0C0_QMDV00_06_RELEASE0_ADDRESS_LOWER                    MW(767:736)
-+#define NVA0C0_QMDV00_06_RELEASE0_ADDRESS_UPPER                    MW(775:768)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_J                            MW(783:776)
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP                     MW(790:788)
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_INC             0x00000003
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_AND             0x00000005
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_OR              0x00000006
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_K                            MW(791:791)
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_FORMAT                 MW(793:792)
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_ENABLE                 MW(794:794)
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVA0C0_QMDV00_06_RELEASE0_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVA0C0_QMDV00_06_RELEASE0_STRUCTURE_SIZE                   MW(799:799)
-+#define NVA0C0_QMDV00_06_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVA0C0_QMDV00_06_RELEASE0_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVA0C0_QMDV00_06_RELEASE0_PAYLOAD                          MW(831:800)
-+#define NVA0C0_QMDV00_06_RELEASE1_ADDRESS_LOWER                    MW(863:832)
-+#define NVA0C0_QMDV00_06_RELEASE1_ADDRESS_UPPER                    MW(871:864)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_L                            MW(879:872)
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP                     MW(886:884)
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_INC             0x00000003
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_AND             0x00000005
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_OR              0x00000006
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_M                            MW(887:887)
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_FORMAT                 MW(889:888)
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_ENABLE                 MW(890:890)
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVA0C0_QMDV00_06_RELEASE1_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVA0C0_QMDV00_06_RELEASE1_STRUCTURE_SIZE                   MW(895:895)
-+#define NVA0C0_QMDV00_06_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVA0C0_QMDV00_06_RELEASE1_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVA0C0_QMDV00_06_RELEASE1_PAYLOAD                          MW(927:896)
-+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_ADDR_LOWER(i)             MW((959+(i)*64):(928+(i)*64))
-+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_ADDR_UPPER(i)             MW((967+(i)*64):(960+(i)*64))
-+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_RESERVED_ADDR(i)          MW((973+(i)*64):(968+(i)*64))
-+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_INVALIDATE(i)             MW((974+(i)*64):(974+(i)*64))
-+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_INVALIDATE_FALSE          0x00000000
-+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_INVALIDATE_TRUE           0x00000001
-+#define NVA0C0_QMDV00_06_CONSTANT_BUFFER_SIZE(i)                   MW((991+(i)*64):(975+(i)*64))
-+#define NVA0C0_QMDV00_06_SHADER_LOCAL_MEMORY_LOW_SIZE              MW(1463:1440)
-+#define NVA0C0_QMDV00_06_QMD_RESERVED_N                            MW(1466:1464)
-+#define NVA0C0_QMDV00_06_BARRIER_COUNT                             MW(1471:1467)
-+#define NVA0C0_QMDV00_06_SHADER_LOCAL_MEMORY_HIGH_SIZE             MW(1495:1472)
-+#define NVA0C0_QMDV00_06_REGISTER_COUNT                            MW(1503:1496)
-+#define NVA0C0_QMDV00_06_SHADER_LOCAL_MEMORY_CRS_SIZE              MW(1527:1504)
-+#define NVA0C0_QMDV00_06_SASS_VERSION                              MW(1535:1528)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_A                               MW(1567:1536)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_B                               MW(1599:1568)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_C                               MW(1631:1600)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_D                               MW(1663:1632)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_E                               MW(1695:1664)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_F                               MW(1727:1696)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_G                               MW(1759:1728)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_H                               MW(1791:1760)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_I                               MW(1823:1792)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_J                               MW(1855:1824)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_K                               MW(1887:1856)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_L                               MW(1919:1888)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_M                               MW(1951:1920)
-+#define NVA0C0_QMDV00_06_QMD_SPARE_N                               MW(1983:1952)
-+#define NVA0C0_QMDV00_06_DEBUG_ID_UPPER                            MW(2015:1984)
-+#define NVA0C0_QMDV00_06_DEBUG_ID_LOWER                            MW(2047:2016)
-+
-+
-+/*
-+** Queue Meta Data, Version 01_06
-+ */
-+
-+#define NVA0C0_QMDV01_06_OUTER_PUT                                 MW(30:0)
-+#define NVA0C0_QMDV01_06_OUTER_OVERFLOW                            MW(31:31)
-+#define NVA0C0_QMDV01_06_OUTER_GET                                 MW(62:32)
-+#define NVA0C0_QMDV01_06_OUTER_STICKY_OVERFLOW                     MW(63:63)
-+#define NVA0C0_QMDV01_06_INNER_GET                                 MW(94:64)
-+#define NVA0C0_QMDV01_06_INNER_OVERFLOW                            MW(95:95)
-+#define NVA0C0_QMDV01_06_INNER_PUT                                 MW(126:96)
-+#define NVA0C0_QMDV01_06_INNER_STICKY_OVERFLOW                     MW(127:127)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_A_A                          MW(159:128)
-+#define NVA0C0_QMDV01_06_SCHEDULER_NEXT_QMD_POINTER                MW(191:160)
-+#define NVA0C0_QMDV01_06_QMD_GROUP_ID                              MW(197:192)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_A                            MW(199:198)
-+#define NVA0C0_QMDV01_06_SCHEDULE_ON_PUT_UPDATE_ENABLE             MW(200:200)
-+#define NVA0C0_QMDV01_06_SCHEDULE_ON_PUT_UPDATE_ENABLE_FALSE       0x00000000
-+#define NVA0C0_QMDV01_06_SCHEDULE_ON_PUT_UPDATE_ENABLE_TRUE        0x00000001
-+#define NVA0C0_QMDV01_06_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST      MW(201:201)
-+#define NVA0C0_QMDV01_06_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
-+#define NVA0C0_QMDV01_06_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
-+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE0                 MW(202:202)
-+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE0_FALSE           0x00000000
-+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE0_TRUE            0x00000001
-+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE1                 MW(203:203)
-+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE1_FALSE           0x00000000
-+#define NVA0C0_QMDV01_06_SEMAPHORE_RELEASE_ENABLE1_TRUE            0x00000001
-+#define NVA0C0_QMDV01_06_REQUIRE_SCHEDULING_PCAS                   MW(204:204)
-+#define NVA0C0_QMDV01_06_REQUIRE_SCHEDULING_PCAS_FALSE             0x00000000
-+#define NVA0C0_QMDV01_06_REQUIRE_SCHEDULING_PCAS_TRUE              0x00000001
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_B                            MW(207:205)
-+#define NVA0C0_QMDV01_06_SKED_PRIVATE_LIST_ADDR                    MW(222:208)
-+#define NVA0C0_QMDV01_06_SKED_PRIVATE_LIST_VALID                   MW(223:223)
-+#define NVA0C0_QMDV01_06_SKED_PRIVATE_LIST_VALID_FALSE             0x00000000
-+#define NVA0C0_QMDV01_06_SKED_PRIVATE_LIST_VALID_TRUE              0x00000001
-+#define NVA0C0_QMDV01_06_CIRCULAR_QUEUE_SIZE                       MW(248:224)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_C                            MW(249:249)
-+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_HEADER_CACHE           MW(250:250)
-+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE     0x00000000
-+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE      0x00000001
-+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_SAMPLER_CACHE          MW(251:251)
-+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE    0x00000000
-+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE     0x00000001
-+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_DATA_CACHE             MW(252:252)
-+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_DATA_CACHE_FALSE       0x00000000
-+#define NVA0C0_QMDV01_06_INVALIDATE_TEXTURE_DATA_CACHE_TRUE        0x00000001
-+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_DATA_CACHE              MW(253:253)
-+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_DATA_CACHE_FALSE        0x00000000
-+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_DATA_CACHE_TRUE         0x00000001
-+#define NVA0C0_QMDV01_06_INVALIDATE_INSTRUCTION_CACHE              MW(254:254)
-+#define NVA0C0_QMDV01_06_INVALIDATE_INSTRUCTION_CACHE_FALSE        0x00000000
-+#define NVA0C0_QMDV01_06_INVALIDATE_INSTRUCTION_CACHE_TRUE         0x00000001
-+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_CONSTANT_CACHE          MW(255:255)
-+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE    0x00000000
-+#define NVA0C0_QMDV01_06_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE     0x00000001
-+#define NVA0C0_QMDV01_06_PROGRAM_OFFSET                            MW(287:256)
-+#define NVA0C0_QMDV01_06_CIRCULAR_QUEUE_ADDR_LOWER                 MW(319:288)
-+#define NVA0C0_QMDV01_06_CIRCULAR_QUEUE_ADDR_UPPER                 MW(327:320)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_D                            MW(335:328)
-+#define NVA0C0_QMDV01_06_CIRCULAR_QUEUE_ENTRY_SIZE                 MW(351:336)
-+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_ID                    MW(357:352)
-+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE       MW(365:358)
-+#define NVA0C0_QMDV01_06_RELEASE_MEMBAR_TYPE                       MW(366:366)
-+#define NVA0C0_QMDV01_06_RELEASE_MEMBAR_TYPE_FE_NONE               0x00000000
-+#define NVA0C0_QMDV01_06_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR          0x00000001
-+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_INCR_ENABLE           MW(367:367)
-+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE     0x00000000
-+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE      0x00000001
-+#define NVA0C0_QMDV01_06_CWD_MEMBAR_TYPE                           MW(369:368)
-+#define NVA0C0_QMDV01_06_CWD_MEMBAR_TYPE_L1_NONE                   0x00000000
-+#define NVA0C0_QMDV01_06_CWD_MEMBAR_TYPE_L1_SYSMEMBAR              0x00000001
-+#define NVA0C0_QMDV01_06_CWD_MEMBAR_TYPE_L1_MEMBAR                 0x00000003
-+#define NVA0C0_QMDV01_06_SEQUENTIALLY_RUN_CTAS                     MW(370:370)
-+#define NVA0C0_QMDV01_06_SEQUENTIALLY_RUN_CTAS_FALSE               0x00000000
-+#define NVA0C0_QMDV01_06_SEQUENTIALLY_RUN_CTAS_TRUE                0x00000001
-+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_DECR_ENABLE           MW(371:371)
-+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE     0x00000000
-+#define NVA0C0_QMDV01_06_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE      0x00000001
-+#define NVA0C0_QMDV01_06_THROTTLED                                 MW(372:372)
-+#define NVA0C0_QMDV01_06_THROTTLED_FALSE                           0x00000000
-+#define NVA0C0_QMDV01_06_THROTTLED_TRUE                            0x00000001
-+#define NVA0C0_QMDV01_06_FP32_NAN_BEHAVIOR                         MW(376:376)
-+#define NVA0C0_QMDV01_06_FP32_NAN_BEHAVIOR_LEGACY                  0x00000000
-+#define NVA0C0_QMDV01_06_FP32_NAN_BEHAVIOR_FP64_COMPATIBLE         0x00000001
-+#define NVA0C0_QMDV01_06_FP32_F2I_NAN_BEHAVIOR                     MW(377:377)
-+#define NVA0C0_QMDV01_06_FP32_F2I_NAN_BEHAVIOR_PASS_ZERO           0x00000000
-+#define NVA0C0_QMDV01_06_FP32_F2I_NAN_BEHAVIOR_PASS_INDEFINITE     0x00000001
-+#define NVA0C0_QMDV01_06_API_VISIBLE_CALL_LIMIT                    MW(378:378)
-+#define NVA0C0_QMDV01_06_API_VISIBLE_CALL_LIMIT__32                0x00000000
-+#define NVA0C0_QMDV01_06_API_VISIBLE_CALL_LIMIT_NO_CHECK           0x00000001
-+#define NVA0C0_QMDV01_06_SHARED_MEMORY_BANK_MAPPING                MW(379:379)
-+#define NVA0C0_QMDV01_06_SHARED_MEMORY_BANK_MAPPING_FOUR_BYTES_PER_BANK 0x00000000
-+#define NVA0C0_QMDV01_06_SHARED_MEMORY_BANK_MAPPING_EIGHT_BYTES_PER_BANK 0x00000001
-+#define NVA0C0_QMDV01_06_SAMPLER_INDEX                             MW(382:382)
-+#define NVA0C0_QMDV01_06_SAMPLER_INDEX_INDEPENDENTLY               0x00000000
-+#define NVA0C0_QMDV01_06_SAMPLER_INDEX_VIA_HEADER_INDEX            0x00000001
-+#define NVA0C0_QMDV01_06_FP32_NARROW_INSTRUCTION                   MW(383:383)
-+#define NVA0C0_QMDV01_06_FP32_NARROW_INSTRUCTION_KEEP_DENORMS      0x00000000
-+#define NVA0C0_QMDV01_06_FP32_NARROW_INSTRUCTION_FLUSH_DENORMS     0x00000001
-+#define NVA0C0_QMDV01_06_CTA_RASTER_WIDTH                          MW(415:384)
-+#define NVA0C0_QMDV01_06_CTA_RASTER_HEIGHT                         MW(431:416)
-+#define NVA0C0_QMDV01_06_CTA_RASTER_DEPTH                          MW(447:432)
-+#define NVA0C0_QMDV01_06_CTA_RASTER_WIDTH_RESUME                   MW(479:448)
-+#define NVA0C0_QMDV01_06_CTA_RASTER_HEIGHT_RESUME                  MW(495:480)
-+#define NVA0C0_QMDV01_06_CTA_RASTER_DEPTH_RESUME                   MW(511:496)
-+#define NVA0C0_QMDV01_06_LAUNCH_QUOTA                              MW(535:512)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_F                            MW(542:536)
-+#define NVA0C0_QMDV01_06_LAUNCH_QUOTA_ENABLE                       MW(543:543)
-+#define NVA0C0_QMDV01_06_LAUNCH_QUOTA_ENABLE_FALSE                 0x00000000
-+#define NVA0C0_QMDV01_06_LAUNCH_QUOTA_ENABLE_TRUE                  0x00000001
-+#define NVA0C0_QMDV01_06_SHARED_MEMORY_SIZE                        MW(561:544)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_G                            MW(575:562)
-+#define NVA0C0_QMDV01_06_QMD_VERSION                               MW(579:576)
-+#define NVA0C0_QMDV01_06_QMD_MAJOR_VERSION                         MW(583:580)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_H                            MW(591:584)
-+#define NVA0C0_QMDV01_06_CTA_THREAD_DIMENSION0                     MW(607:592)
-+#define NVA0C0_QMDV01_06_CTA_THREAD_DIMENSION1                     MW(623:608)
-+#define NVA0C0_QMDV01_06_CTA_THREAD_DIMENSION2                     MW(639:624)
-+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_VALID(i)                  MW((640+(i)*1):(640+(i)*1))
-+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_VALID_FALSE               0x00000000
-+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_VALID_TRUE                0x00000001
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_I                            MW(668:648)
-+#define NVA0C0_QMDV01_06_L1_CONFIGURATION                          MW(671:669)
-+#define NVA0C0_QMDV01_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB 0x00000001
-+#define NVA0C0_QMDV01_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB 0x00000002
-+#define NVA0C0_QMDV01_06_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB 0x00000003
-+#define NVA0C0_QMDV01_06_SM_DISABLE_MASK_LOWER                     MW(703:672)
-+#define NVA0C0_QMDV01_06_SM_DISABLE_MASK_UPPER                     MW(735:704)
-+#define NVA0C0_QMDV01_06_RELEASE0_ADDRESS_LOWER                    MW(767:736)
-+#define NVA0C0_QMDV01_06_RELEASE0_ADDRESS_UPPER                    MW(775:768)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_J                            MW(783:776)
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP                     MW(790:788)
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_INC             0x00000003
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_AND             0x00000005
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_OR              0x00000006
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_K                            MW(791:791)
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_FORMAT                 MW(793:792)
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_ENABLE                 MW(794:794)
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVA0C0_QMDV01_06_RELEASE0_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVA0C0_QMDV01_06_RELEASE0_STRUCTURE_SIZE                   MW(799:799)
-+#define NVA0C0_QMDV01_06_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVA0C0_QMDV01_06_RELEASE0_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVA0C0_QMDV01_06_RELEASE0_PAYLOAD                          MW(831:800)
-+#define NVA0C0_QMDV01_06_RELEASE1_ADDRESS_LOWER                    MW(863:832)
-+#define NVA0C0_QMDV01_06_RELEASE1_ADDRESS_UPPER                    MW(871:864)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_L                            MW(879:872)
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP                     MW(886:884)
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_INC             0x00000003
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_AND             0x00000005
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_OR              0x00000006
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_M                            MW(887:887)
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_FORMAT                 MW(889:888)
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_ENABLE                 MW(890:890)
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVA0C0_QMDV01_06_RELEASE1_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVA0C0_QMDV01_06_RELEASE1_STRUCTURE_SIZE                   MW(895:895)
-+#define NVA0C0_QMDV01_06_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVA0C0_QMDV01_06_RELEASE1_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVA0C0_QMDV01_06_RELEASE1_PAYLOAD                          MW(927:896)
-+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_ADDR_LOWER(i)             MW((959+(i)*64):(928+(i)*64))
-+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_ADDR_UPPER(i)             MW((967+(i)*64):(960+(i)*64))
-+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_RESERVED_ADDR(i)          MW((973+(i)*64):(968+(i)*64))
-+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_INVALIDATE(i)             MW((974+(i)*64):(974+(i)*64))
-+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_INVALIDATE_FALSE          0x00000000
-+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_INVALIDATE_TRUE           0x00000001
-+#define NVA0C0_QMDV01_06_CONSTANT_BUFFER_SIZE(i)                   MW((991+(i)*64):(975+(i)*64))
-+#define NVA0C0_QMDV01_06_SHADER_LOCAL_MEMORY_LOW_SIZE              MW(1463:1440)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_N                            MW(1466:1464)
-+#define NVA0C0_QMDV01_06_BARRIER_COUNT                             MW(1471:1467)
-+#define NVA0C0_QMDV01_06_SHADER_LOCAL_MEMORY_HIGH_SIZE             MW(1495:1472)
-+#define NVA0C0_QMDV01_06_REGISTER_COUNT                            MW(1503:1496)
-+#define NVA0C0_QMDV01_06_SHADER_LOCAL_MEMORY_CRS_SIZE              MW(1527:1504)
-+#define NVA0C0_QMDV01_06_SASS_VERSION                              MW(1535:1528)
-+#define NVA0C0_QMDV01_06_HW_ONLY_INNER_GET                         MW(1566:1536)
-+#define NVA0C0_QMDV01_06_HW_ONLY_REQUIRE_SCHEDULING_PCAS           MW(1567:1567)
-+#define NVA0C0_QMDV01_06_HW_ONLY_INNER_PUT                         MW(1598:1568)
-+#define NVA0C0_QMDV01_06_HW_ONLY_SCHEDULE_ON_PUT_UPDATE_ENABLE     MW(1599:1599)
-+#define NVA0C0_QMDV01_06_QUEUE_ENTRIES_PER_CTA_MINUS_ONE           MW(1606:1600)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_Q                            MW(1609:1607)
-+#define NVA0C0_QMDV01_06_COALESCE_WAITING_PERIOD                   MW(1617:1610)
-+#define NVA0C0_QMDV01_06_QMD_RESERVED_R                            MW(1631:1618)
-+#define NVA0C0_QMDV01_06_QMD_SPARE_D                               MW(1663:1632)
-+#define NVA0C0_QMDV01_06_QMD_SPARE_E                               MW(1695:1664)
-+#define NVA0C0_QMDV01_06_QMD_SPARE_F                               MW(1727:1696)
-+#define NVA0C0_QMDV01_06_QMD_SPARE_G                               MW(1759:1728)
-+#define NVA0C0_QMDV01_06_QMD_SPARE_H                               MW(1791:1760)
-+#define NVA0C0_QMDV01_06_QMD_SPARE_I                               MW(1823:1792)
-+#define NVA0C0_QMDV01_06_QMD_SPARE_J                               MW(1855:1824)
-+#define NVA0C0_QMDV01_06_QMD_SPARE_K                               MW(1887:1856)
-+#define NVA0C0_QMDV01_06_QMD_SPARE_L                               MW(1919:1888)
-+#define NVA0C0_QMDV01_06_QMD_SPARE_M                               MW(1951:1920)
-+#define NVA0C0_QMDV01_06_QMD_SPARE_N                               MW(1983:1952)
-+#define NVA0C0_QMDV01_06_DEBUG_ID_UPPER                            MW(2015:1984)
-+#define NVA0C0_QMDV01_06_DEBUG_ID_LOWER                            MW(2047:2016)
-+
-+
-+/*
-+** Queue Meta Data, Version 01_07
-+ */
-+
-+#define NVA0C0_QMDV01_07_OUTER_PUT                                 MW(30:0)
-+#define NVA0C0_QMDV01_07_OUTER_OVERFLOW                            MW(31:31)
-+#define NVA0C0_QMDV01_07_OUTER_GET                                 MW(62:32)
-+#define NVA0C0_QMDV01_07_OUTER_STICKY_OVERFLOW                     MW(63:63)
-+#define NVA0C0_QMDV01_07_INNER_GET                                 MW(94:64)
-+#define NVA0C0_QMDV01_07_INNER_OVERFLOW                            MW(95:95)
-+#define NVA0C0_QMDV01_07_INNER_PUT                                 MW(126:96)
-+#define NVA0C0_QMDV01_07_INNER_STICKY_OVERFLOW                     MW(127:127)
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_A_A                          MW(159:128)
-+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_POINTER                     MW(191:160)
-+#define NVA0C0_QMDV01_07_QMD_GROUP_ID                              MW(197:192)
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_A                            MW(200:198)
-+#define NVA0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST      MW(201:201)
-+#define NVA0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
-+#define NVA0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
-+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0                 MW(202:202)
-+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0_FALSE           0x00000000
-+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0_TRUE            0x00000001
-+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1                 MW(203:203)
-+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1_FALSE           0x00000000
-+#define NVA0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1_TRUE            0x00000001
-+#define NVA0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS                   MW(204:204)
-+#define NVA0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS_FALSE             0x00000000
-+#define NVA0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS_TRUE              0x00000001
-+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE             MW(205:205)
-+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE       0x00000000
-+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE        0x00000001
-+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_TYPE                        MW(206:206)
-+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_TYPE_QUEUE                  0x00000000
-+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_TYPE_GRID                   0x00000001
-+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY                  MW(207:207)
-+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY_FALSE            0x00000000
-+#define NVA0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY_TRUE             0x00000001
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_B                            MW(223:208)
-+#define NVA0C0_QMDV01_07_CIRCULAR_QUEUE_SIZE                       MW(248:224)
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_C                            MW(249:249)
-+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE           MW(250:250)
-+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE     0x00000000
-+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE      0x00000001
-+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE          MW(251:251)
-+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE    0x00000000
-+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE     0x00000001
-+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE             MW(252:252)
-+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE_FALSE       0x00000000
-+#define NVA0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE_TRUE        0x00000001
-+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE              MW(253:253)
-+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE_FALSE        0x00000000
-+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE_TRUE         0x00000001
-+#define NVA0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE              MW(254:254)
-+#define NVA0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE_FALSE        0x00000000
-+#define NVA0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE_TRUE         0x00000001
-+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE          MW(255:255)
-+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE    0x00000000
-+#define NVA0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE     0x00000001
-+#define NVA0C0_QMDV01_07_PROGRAM_OFFSET                            MW(287:256)
-+#define NVA0C0_QMDV01_07_CIRCULAR_QUEUE_ADDR_LOWER                 MW(319:288)
-+#define NVA0C0_QMDV01_07_CIRCULAR_QUEUE_ADDR_UPPER                 MW(327:320)
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_D                            MW(335:328)
-+#define NVA0C0_QMDV01_07_CIRCULAR_QUEUE_ENTRY_SIZE                 MW(351:336)
-+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_ID                    MW(357:352)
-+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE       MW(365:358)
-+#define NVA0C0_QMDV01_07_RELEASE_MEMBAR_TYPE                       MW(366:366)
-+#define NVA0C0_QMDV01_07_RELEASE_MEMBAR_TYPE_FE_NONE               0x00000000
-+#define NVA0C0_QMDV01_07_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR          0x00000001
-+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE           MW(367:367)
-+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE     0x00000000
-+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE      0x00000001
-+#define NVA0C0_QMDV01_07_CWD_MEMBAR_TYPE                           MW(369:368)
-+#define NVA0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_NONE                   0x00000000
-+#define NVA0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_SYSMEMBAR              0x00000001
-+#define NVA0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_MEMBAR                 0x00000003
-+#define NVA0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS                     MW(370:370)
-+#define NVA0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS_FALSE               0x00000000
-+#define NVA0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS_TRUE                0x00000001
-+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE           MW(371:371)
-+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE     0x00000000
-+#define NVA0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE      0x00000001
-+#define NVA0C0_QMDV01_07_THROTTLED                                 MW(372:372)
-+#define NVA0C0_QMDV01_07_THROTTLED_FALSE                           0x00000000
-+#define NVA0C0_QMDV01_07_THROTTLED_TRUE                            0x00000001
-+#define NVA0C0_QMDV01_07_FP32_NAN_BEHAVIOR                         MW(376:376)
-+#define NVA0C0_QMDV01_07_FP32_NAN_BEHAVIOR_LEGACY                  0x00000000
-+#define NVA0C0_QMDV01_07_FP32_NAN_BEHAVIOR_FP64_COMPATIBLE         0x00000001
-+#define NVA0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR                     MW(377:377)
-+#define NVA0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR_PASS_ZERO           0x00000000
-+#define NVA0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR_PASS_INDEFINITE     0x00000001
-+#define NVA0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT                    MW(378:378)
-+#define NVA0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT__32                0x00000000
-+#define NVA0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT_NO_CHECK           0x00000001
-+#define NVA0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING                MW(379:379)
-+#define NVA0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING_FOUR_BYTES_PER_BANK 0x00000000
-+#define NVA0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING_EIGHT_BYTES_PER_BANK 0x00000001
-+#define NVA0C0_QMDV01_07_SAMPLER_INDEX                             MW(382:382)
-+#define NVA0C0_QMDV01_07_SAMPLER_INDEX_INDEPENDENTLY               0x00000000
-+#define NVA0C0_QMDV01_07_SAMPLER_INDEX_VIA_HEADER_INDEX            0x00000001
-+#define NVA0C0_QMDV01_07_FP32_NARROW_INSTRUCTION                   MW(383:383)
-+#define NVA0C0_QMDV01_07_FP32_NARROW_INSTRUCTION_KEEP_DENORMS      0x00000000
-+#define NVA0C0_QMDV01_07_FP32_NARROW_INSTRUCTION_FLUSH_DENORMS     0x00000001
-+#define NVA0C0_QMDV01_07_CTA_RASTER_WIDTH                          MW(415:384)
-+#define NVA0C0_QMDV01_07_CTA_RASTER_HEIGHT                         MW(431:416)
-+#define NVA0C0_QMDV01_07_CTA_RASTER_DEPTH                          MW(447:432)
-+#define NVA0C0_QMDV01_07_CTA_RASTER_WIDTH_RESUME                   MW(479:448)
-+#define NVA0C0_QMDV01_07_CTA_RASTER_HEIGHT_RESUME                  MW(495:480)
-+#define NVA0C0_QMDV01_07_CTA_RASTER_DEPTH_RESUME                   MW(511:496)
-+#define NVA0C0_QMDV01_07_QUEUE_ENTRIES_PER_CTA_MINUS_ONE           MW(518:512)
-+#define NVA0C0_QMDV01_07_COALESCE_WAITING_PERIOD                   MW(529:522)
-+#define NVA0C0_QMDV01_07_SHARED_MEMORY_SIZE                        MW(561:544)
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_G                            MW(575:562)
-+#define NVA0C0_QMDV01_07_QMD_VERSION                               MW(579:576)
-+#define NVA0C0_QMDV01_07_QMD_MAJOR_VERSION                         MW(583:580)
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_H                            MW(591:584)
-+#define NVA0C0_QMDV01_07_CTA_THREAD_DIMENSION0                     MW(607:592)
-+#define NVA0C0_QMDV01_07_CTA_THREAD_DIMENSION1                     MW(623:608)
-+#define NVA0C0_QMDV01_07_CTA_THREAD_DIMENSION2                     MW(639:624)
-+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_VALID(i)                  MW((640+(i)*1):(640+(i)*1))
-+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_VALID_FALSE               0x00000000
-+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_VALID_TRUE                0x00000001
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_I                            MW(668:648)
-+#define NVA0C0_QMDV01_07_L1_CONFIGURATION                          MW(671:669)
-+#define NVA0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB 0x00000001
-+#define NVA0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB 0x00000002
-+#define NVA0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB 0x00000003
-+#define NVA0C0_QMDV01_07_SM_DISABLE_MASK_LOWER                     MW(703:672)
-+#define NVA0C0_QMDV01_07_SM_DISABLE_MASK_UPPER                     MW(735:704)
-+#define NVA0C0_QMDV01_07_RELEASE0_ADDRESS_LOWER                    MW(767:736)
-+#define NVA0C0_QMDV01_07_RELEASE0_ADDRESS_UPPER                    MW(775:768)
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_J                            MW(783:776)
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP                     MW(790:788)
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_INC             0x00000003
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_AND             0x00000005
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_OR              0x00000006
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_K                            MW(791:791)
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT                 MW(793:792)
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE                 MW(794:794)
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVA0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVA0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE                   MW(799:799)
-+#define NVA0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVA0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVA0C0_QMDV01_07_RELEASE0_PAYLOAD                          MW(831:800)
-+#define NVA0C0_QMDV01_07_RELEASE1_ADDRESS_LOWER                    MW(863:832)
-+#define NVA0C0_QMDV01_07_RELEASE1_ADDRESS_UPPER                    MW(871:864)
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_L                            MW(879:872)
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP                     MW(886:884)
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_INC             0x00000003
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_AND             0x00000005
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_OR              0x00000006
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_M                            MW(887:887)
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT                 MW(889:888)
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE                 MW(890:890)
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVA0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVA0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE                   MW(895:895)
-+#define NVA0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVA0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVA0C0_QMDV01_07_RELEASE1_PAYLOAD                          MW(927:896)
-+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_ADDR_LOWER(i)             MW((959+(i)*64):(928+(i)*64))
-+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_ADDR_UPPER(i)             MW((967+(i)*64):(960+(i)*64))
-+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_RESERVED_ADDR(i)          MW((973+(i)*64):(968+(i)*64))
-+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE(i)             MW((974+(i)*64):(974+(i)*64))
-+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE_FALSE          0x00000000
-+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE_TRUE           0x00000001
-+#define NVA0C0_QMDV01_07_CONSTANT_BUFFER_SIZE(i)                   MW((991+(i)*64):(975+(i)*64))
-+#define NVA0C0_QMDV01_07_SHADER_LOCAL_MEMORY_LOW_SIZE              MW(1463:1440)
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_N                            MW(1466:1464)
-+#define NVA0C0_QMDV01_07_BARRIER_COUNT                             MW(1471:1467)
-+#define NVA0C0_QMDV01_07_SHADER_LOCAL_MEMORY_HIGH_SIZE             MW(1495:1472)
-+#define NVA0C0_QMDV01_07_REGISTER_COUNT                            MW(1503:1496)
-+#define NVA0C0_QMDV01_07_SHADER_LOCAL_MEMORY_CRS_SIZE              MW(1527:1504)
-+#define NVA0C0_QMDV01_07_SASS_VERSION                              MW(1535:1528)
-+#define NVA0C0_QMDV01_07_HW_ONLY_INNER_GET                         MW(1566:1536)
-+#define NVA0C0_QMDV01_07_HW_ONLY_REQUIRE_SCHEDULING_PCAS           MW(1567:1567)
-+#define NVA0C0_QMDV01_07_HW_ONLY_INNER_PUT                         MW(1598:1568)
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_P                            MW(1599:1599)
-+#define NVA0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX              MW(1629:1600)
-+#define NVA0C0_QMDV01_07_QMD_RESERVED_Q                            MW(1630:1630)
-+#define NVA0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID        MW(1631:1631)
-+#define NVA0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE  0x00000000
-+#define NVA0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE   0x00000001
-+#define NVA0C0_QMDV01_07_HW_ONLY_SKED_NEXT_QMD_POINTER             MW(1663:1632)
-+#define NVA0C0_QMDV01_07_QMD_SPARE_E                               MW(1695:1664)
-+#define NVA0C0_QMDV01_07_QMD_SPARE_F                               MW(1727:1696)
-+#define NVA0C0_QMDV01_07_QMD_SPARE_G                               MW(1759:1728)
-+#define NVA0C0_QMDV01_07_QMD_SPARE_H                               MW(1791:1760)
-+#define NVA0C0_QMDV01_07_QMD_SPARE_I                               MW(1823:1792)
-+#define NVA0C0_QMDV01_07_QMD_SPARE_J                               MW(1855:1824)
-+#define NVA0C0_QMDV01_07_QMD_SPARE_K                               MW(1887:1856)
-+#define NVA0C0_QMDV01_07_QMD_SPARE_L                               MW(1919:1888)
-+#define NVA0C0_QMDV01_07_QMD_SPARE_M                               MW(1951:1920)
-+#define NVA0C0_QMDV01_07_QMD_SPARE_N                               MW(1983:1952)
-+#define NVA0C0_QMDV01_07_DEBUG_ID_UPPER                            MW(2015:1984)
-+#define NVA0C0_QMDV01_07_DEBUG_ID_LOWER                            MW(2047:2016)
-+
-+
-+
-+#endif // #ifndef __CLA0C0QMD_H__
-diff --git a/src/gallium/drivers/nouveau/nvc0/clc0c0qmd.h b/src/gallium/drivers/nouveau/nvc0/clc0c0qmd.h
-new file mode 100644
-index 00000000000..040bdcd9dcb
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/nvc0/clc0c0qmd.h
-@@ -0,0 +1,665 @@
-+/*******************************************************************************
-+    Copyright (c) 2016 NVIDIA Corporation
-+
-+    Permission is hereby granted, free of charge, to any person obtaining a copy
-+    of this software and associated documentation files (the "Software"), to
-+    deal in the Software without restriction, including without limitation the
-+    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-+    sell copies of the Software, and to permit persons to whom the Software is
-+    furnished to do so, subject to the following conditions:
-+
-+        The above copyright notice and this permission notice shall be
-+        included in all copies or substantial portions of the Software.
-+
-+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-+    DEALINGS IN THE SOFTWARE.
-+
-+*******************************************************************************/
-+
-+/* AUTO GENERATED FILE -- DO NOT EDIT */
-+
-+#ifndef __CLC0C0QMD_H__
-+#define __CLC0C0QMD_H__
-+
-+/*
-+** Queue Meta Data, Version 01_07
-+ */
-+
-+// The below C preprocessor definitions describe "multi-word" structures, where
-+// fields may have bit numbers beyond 32.  For example, MW(127:96) means
-+// the field is in bits 0-31 of word number 3 of the structure.  The "MW(X:Y)"
-+// syntax is to distinguish from similar "X:Y" single-word definitions: the
-+// macros historically used for single-word definitions would fail with
-+// multi-word definitions.
-+//
-+// See nvmisc.h:DRF_VAL_MW() in the source code of the kernel
-+// interface layer of nvidia.ko for an example of how to manipulate
-+// these MW(X:Y) definitions.
-+
-+#define NVC0C0_QMDV01_07_OUTER_PUT                                 MW(30:0)
-+#define NVC0C0_QMDV01_07_OUTER_OVERFLOW                            MW(31:31)
-+#define NVC0C0_QMDV01_07_OUTER_GET                                 MW(62:32)
-+#define NVC0C0_QMDV01_07_OUTER_STICKY_OVERFLOW                     MW(63:63)
-+#define NVC0C0_QMDV01_07_INNER_GET                                 MW(94:64)
-+#define NVC0C0_QMDV01_07_INNER_OVERFLOW                            MW(95:95)
-+#define NVC0C0_QMDV01_07_INNER_PUT                                 MW(126:96)
-+#define NVC0C0_QMDV01_07_INNER_STICKY_OVERFLOW                     MW(127:127)
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_A_A                          MW(159:128)
-+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_POINTER                     MW(191:160)
-+#define NVC0C0_QMDV01_07_QMD_GROUP_ID                              MW(197:192)
-+#define NVC0C0_QMDV01_07_SM_GLOBAL_CACHING_ENABLE                  MW(198:198)
-+#define NVC0C0_QMDV01_07_RUN_CTA_IN_ONE_SM_PARTITION               MW(199:199)
-+#define NVC0C0_QMDV01_07_RUN_CTA_IN_ONE_SM_PARTITION_FALSE         0x00000000
-+#define NVC0C0_QMDV01_07_RUN_CTA_IN_ONE_SM_PARTITION_TRUE          0x00000001
-+#define NVC0C0_QMDV01_07_IS_QUEUE                                  MW(200:200)
-+#define NVC0C0_QMDV01_07_IS_QUEUE_FALSE                            0x00000000
-+#define NVC0C0_QMDV01_07_IS_QUEUE_TRUE                             0x00000001
-+#define NVC0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST      MW(201:201)
-+#define NVC0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
-+#define NVC0C0_QMDV01_07_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
-+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0                 MW(202:202)
-+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0_FALSE           0x00000000
-+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE0_TRUE            0x00000001
-+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1                 MW(203:203)
-+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1_FALSE           0x00000000
-+#define NVC0C0_QMDV01_07_SEMAPHORE_RELEASE_ENABLE1_TRUE            0x00000001
-+#define NVC0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS                   MW(204:204)
-+#define NVC0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS_FALSE             0x00000000
-+#define NVC0C0_QMDV01_07_REQUIRE_SCHEDULING_PCAS_TRUE              0x00000001
-+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE             MW(205:205)
-+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE       0x00000000
-+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE        0x00000001
-+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_TYPE                        MW(206:206)
-+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_TYPE_QUEUE                  0x00000000
-+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_TYPE_GRID                   0x00000001
-+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY                  MW(207:207)
-+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY_FALSE            0x00000000
-+#define NVC0C0_QMDV01_07_DEPENDENT_QMD_FIELD_COPY_TRUE             0x00000001
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_B                            MW(223:208)
-+#define NVC0C0_QMDV01_07_CIRCULAR_QUEUE_SIZE                       MW(248:224)
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_C                            MW(249:249)
-+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE           MW(250:250)
-+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE     0x00000000
-+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE      0x00000001
-+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE          MW(251:251)
-+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE    0x00000000
-+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE     0x00000001
-+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE             MW(252:252)
-+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE_FALSE       0x00000000
-+#define NVC0C0_QMDV01_07_INVALIDATE_TEXTURE_DATA_CACHE_TRUE        0x00000001
-+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE              MW(253:253)
-+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE_FALSE        0x00000000
-+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_DATA_CACHE_TRUE         0x00000001
-+#define NVC0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE              MW(254:254)
-+#define NVC0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE_FALSE        0x00000000
-+#define NVC0C0_QMDV01_07_INVALIDATE_INSTRUCTION_CACHE_TRUE         0x00000001
-+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE          MW(255:255)
-+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE    0x00000000
-+#define NVC0C0_QMDV01_07_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE     0x00000001
-+#define NVC0C0_QMDV01_07_PROGRAM_OFFSET                            MW(287:256)
-+#define NVC0C0_QMDV01_07_CIRCULAR_QUEUE_ADDR_LOWER                 MW(319:288)
-+#define NVC0C0_QMDV01_07_CIRCULAR_QUEUE_ADDR_UPPER                 MW(327:320)
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_D                            MW(335:328)
-+#define NVC0C0_QMDV01_07_CIRCULAR_QUEUE_ENTRY_SIZE                 MW(351:336)
-+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_ID                    MW(357:352)
-+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE       MW(365:358)
-+#define NVC0C0_QMDV01_07_RELEASE_MEMBAR_TYPE                       MW(366:366)
-+#define NVC0C0_QMDV01_07_RELEASE_MEMBAR_TYPE_FE_NONE               0x00000000
-+#define NVC0C0_QMDV01_07_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR          0x00000001
-+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE           MW(367:367)
-+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE     0x00000000
-+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE      0x00000001
-+#define NVC0C0_QMDV01_07_CWD_MEMBAR_TYPE                           MW(369:368)
-+#define NVC0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_NONE                   0x00000000
-+#define NVC0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_SYSMEMBAR              0x00000001
-+#define NVC0C0_QMDV01_07_CWD_MEMBAR_TYPE_L1_MEMBAR                 0x00000003
-+#define NVC0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS                     MW(370:370)
-+#define NVC0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS_FALSE               0x00000000
-+#define NVC0C0_QMDV01_07_SEQUENTIALLY_RUN_CTAS_TRUE                0x00000001
-+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE           MW(371:371)
-+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE     0x00000000
-+#define NVC0C0_QMDV01_07_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE      0x00000001
-+#define NVC0C0_QMDV01_07_THROTTLED                                 MW(372:372)
-+#define NVC0C0_QMDV01_07_THROTTLED_FALSE                           0x00000000
-+#define NVC0C0_QMDV01_07_THROTTLED_TRUE                            0x00000001
-+#define NVC0C0_QMDV01_07_FP32_NAN_BEHAVIOR                         MW(376:376)
-+#define NVC0C0_QMDV01_07_FP32_NAN_BEHAVIOR_LEGACY                  0x00000000
-+#define NVC0C0_QMDV01_07_FP32_NAN_BEHAVIOR_FP64_COMPATIBLE         0x00000001
-+#define NVC0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR                     MW(377:377)
-+#define NVC0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR_PASS_ZERO           0x00000000
-+#define NVC0C0_QMDV01_07_FP32_F2I_NAN_BEHAVIOR_PASS_INDEFINITE     0x00000001
-+#define NVC0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT                    MW(378:378)
-+#define NVC0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT__32                0x00000000
-+#define NVC0C0_QMDV01_07_API_VISIBLE_CALL_LIMIT_NO_CHECK           0x00000001
-+#define NVC0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING                MW(379:379)
-+#define NVC0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING_FOUR_BYTES_PER_BANK 0x00000000
-+#define NVC0C0_QMDV01_07_SHARED_MEMORY_BANK_MAPPING_EIGHT_BYTES_PER_BANK 0x00000001
-+#define NVC0C0_QMDV01_07_SAMPLER_INDEX                             MW(382:382)
-+#define NVC0C0_QMDV01_07_SAMPLER_INDEX_INDEPENDENTLY               0x00000000
-+#define NVC0C0_QMDV01_07_SAMPLER_INDEX_VIA_HEADER_INDEX            0x00000001
-+#define NVC0C0_QMDV01_07_FP32_NARROW_INSTRUCTION                   MW(383:383)
-+#define NVC0C0_QMDV01_07_FP32_NARROW_INSTRUCTION_KEEP_DENORMS      0x00000000
-+#define NVC0C0_QMDV01_07_FP32_NARROW_INSTRUCTION_FLUSH_DENORMS     0x00000001
-+#define NVC0C0_QMDV01_07_CTA_RASTER_WIDTH                          MW(415:384)
-+#define NVC0C0_QMDV01_07_CTA_RASTER_HEIGHT                         MW(431:416)
-+#define NVC0C0_QMDV01_07_CTA_RASTER_DEPTH                          MW(447:432)
-+#define NVC0C0_QMDV01_07_CTA_RASTER_WIDTH_RESUME                   MW(479:448)
-+#define NVC0C0_QMDV01_07_CTA_RASTER_HEIGHT_RESUME                  MW(495:480)
-+#define NVC0C0_QMDV01_07_CTA_RASTER_DEPTH_RESUME                   MW(511:496)
-+#define NVC0C0_QMDV01_07_QUEUE_ENTRIES_PER_CTA_MINUS_ONE           MW(518:512)
-+#define NVC0C0_QMDV01_07_COALESCE_WAITING_PERIOD                   MW(529:522)
-+#define NVC0C0_QMDV01_07_SHARED_MEMORY_SIZE                        MW(561:544)
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_G                            MW(575:562)
-+#define NVC0C0_QMDV01_07_QMD_VERSION                               MW(579:576)
-+#define NVC0C0_QMDV01_07_QMD_MAJOR_VERSION                         MW(583:580)
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_H                            MW(591:584)
-+#define NVC0C0_QMDV01_07_CTA_THREAD_DIMENSION0                     MW(607:592)
-+#define NVC0C0_QMDV01_07_CTA_THREAD_DIMENSION1                     MW(623:608)
-+#define NVC0C0_QMDV01_07_CTA_THREAD_DIMENSION2                     MW(639:624)
-+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_VALID(i)                  MW((640+(i)*1):(640+(i)*1))
-+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_VALID_FALSE               0x00000000
-+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_VALID_TRUE                0x00000001
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_I                            MW(668:648)
-+#define NVC0C0_QMDV01_07_L1_CONFIGURATION                          MW(671:669)
-+#define NVC0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB 0x00000001
-+#define NVC0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB 0x00000002
-+#define NVC0C0_QMDV01_07_L1_CONFIGURATION_DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB 0x00000003
-+#define NVC0C0_QMDV01_07_SM_DISABLE_MASK_LOWER                     MW(703:672)
-+#define NVC0C0_QMDV01_07_SM_DISABLE_MASK_UPPER                     MW(735:704)
-+#define NVC0C0_QMDV01_07_RELEASE0_ADDRESS_LOWER                    MW(767:736)
-+#define NVC0C0_QMDV01_07_RELEASE0_ADDRESS_UPPER                    MW(775:768)
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_J                            MW(783:776)
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP                     MW(790:788)
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_INC             0x00000003
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_AND             0x00000005
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_OR              0x00000006
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_K                            MW(791:791)
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT                 MW(793:792)
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE                 MW(794:794)
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVC0C0_QMDV01_07_RELEASE0_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVC0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE                   MW(799:799)
-+#define NVC0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVC0C0_QMDV01_07_RELEASE0_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVC0C0_QMDV01_07_RELEASE0_PAYLOAD                          MW(831:800)
-+#define NVC0C0_QMDV01_07_RELEASE1_ADDRESS_LOWER                    MW(863:832)
-+#define NVC0C0_QMDV01_07_RELEASE1_ADDRESS_UPPER                    MW(871:864)
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_L                            MW(879:872)
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP                     MW(886:884)
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_INC             0x00000003
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_AND             0x00000005
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_OR              0x00000006
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_M                            MW(887:887)
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT                 MW(889:888)
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE                 MW(890:890)
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVC0C0_QMDV01_07_RELEASE1_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVC0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE                   MW(895:895)
-+#define NVC0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVC0C0_QMDV01_07_RELEASE1_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVC0C0_QMDV01_07_RELEASE1_PAYLOAD                          MW(927:896)
-+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_ADDR_LOWER(i)             MW((959+(i)*64):(928+(i)*64))
-+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_ADDR_UPPER(i)             MW((967+(i)*64):(960+(i)*64))
-+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_RESERVED_ADDR(i)          MW((973+(i)*64):(968+(i)*64))
-+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE(i)             MW((974+(i)*64):(974+(i)*64))
-+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE_FALSE          0x00000000
-+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_INVALIDATE_TRUE           0x00000001
-+#define NVC0C0_QMDV01_07_CONSTANT_BUFFER_SIZE(i)                   MW((991+(i)*64):(975+(i)*64))
-+#define NVC0C0_QMDV01_07_SHADER_LOCAL_MEMORY_LOW_SIZE              MW(1463:1440)
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_N                            MW(1466:1464)
-+#define NVC0C0_QMDV01_07_BARRIER_COUNT                             MW(1471:1467)
-+#define NVC0C0_QMDV01_07_SHADER_LOCAL_MEMORY_HIGH_SIZE             MW(1495:1472)
-+#define NVC0C0_QMDV01_07_REGISTER_COUNT                            MW(1503:1496)
-+#define NVC0C0_QMDV01_07_SHADER_LOCAL_MEMORY_CRS_SIZE              MW(1527:1504)
-+#define NVC0C0_QMDV01_07_SASS_VERSION                              MW(1535:1528)
-+#define NVC0C0_QMDV01_07_HW_ONLY_INNER_GET                         MW(1566:1536)
-+#define NVC0C0_QMDV01_07_HW_ONLY_REQUIRE_SCHEDULING_PCAS           MW(1567:1567)
-+#define NVC0C0_QMDV01_07_HW_ONLY_INNER_PUT                         MW(1598:1568)
-+#define NVC0C0_QMDV01_07_HW_ONLY_SCG_TYPE                          MW(1599:1599)
-+#define NVC0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX              MW(1629:1600)
-+#define NVC0C0_QMDV01_07_QMD_RESERVED_Q                            MW(1630:1630)
-+#define NVC0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID        MW(1631:1631)
-+#define NVC0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE  0x00000000
-+#define NVC0C0_QMDV01_07_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE   0x00000001
-+#define NVC0C0_QMDV01_07_HW_ONLY_SKED_NEXT_QMD_POINTER             MW(1663:1632)
-+#define NVC0C0_QMDV01_07_QMD_SPARE_E                               MW(1695:1664)
-+#define NVC0C0_QMDV01_07_QMD_SPARE_F                               MW(1727:1696)
-+#define NVC0C0_QMDV01_07_QMD_SPARE_G                               MW(1759:1728)
-+#define NVC0C0_QMDV01_07_QMD_SPARE_H                               MW(1791:1760)
-+#define NVC0C0_QMDV01_07_QMD_SPARE_I                               MW(1823:1792)
-+#define NVC0C0_QMDV01_07_QMD_SPARE_J                               MW(1855:1824)
-+#define NVC0C0_QMDV01_07_QMD_SPARE_K                               MW(1887:1856)
-+#define NVC0C0_QMDV01_07_QMD_SPARE_L                               MW(1919:1888)
-+#define NVC0C0_QMDV01_07_QMD_SPARE_M                               MW(1951:1920)
-+#define NVC0C0_QMDV01_07_QMD_SPARE_N                               MW(1983:1952)
-+#define NVC0C0_QMDV01_07_DEBUG_ID_UPPER                            MW(2015:1984)
-+#define NVC0C0_QMDV01_07_DEBUG_ID_LOWER                            MW(2047:2016)
-+
-+
-+/*
-+** Queue Meta Data, Version 02_00
-+ */
-+
-+#define NVC0C0_QMDV02_00_OUTER_PUT                                 MW(30:0)
-+#define NVC0C0_QMDV02_00_OUTER_OVERFLOW                            MW(31:31)
-+#define NVC0C0_QMDV02_00_OUTER_GET                                 MW(62:32)
-+#define NVC0C0_QMDV02_00_OUTER_STICKY_OVERFLOW                     MW(63:63)
-+#define NVC0C0_QMDV02_00_INNER_GET                                 MW(94:64)
-+#define NVC0C0_QMDV02_00_INNER_OVERFLOW                            MW(95:95)
-+#define NVC0C0_QMDV02_00_INNER_PUT                                 MW(126:96)
-+#define NVC0C0_QMDV02_00_INNER_STICKY_OVERFLOW                     MW(127:127)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_A_A                          MW(159:128)
-+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_POINTER                     MW(191:160)
-+#define NVC0C0_QMDV02_00_QMD_GROUP_ID                              MW(197:192)
-+#define NVC0C0_QMDV02_00_SM_GLOBAL_CACHING_ENABLE                  MW(198:198)
-+#define NVC0C0_QMDV02_00_RUN_CTA_IN_ONE_SM_PARTITION               MW(199:199)
-+#define NVC0C0_QMDV02_00_RUN_CTA_IN_ONE_SM_PARTITION_FALSE         0x00000000
-+#define NVC0C0_QMDV02_00_RUN_CTA_IN_ONE_SM_PARTITION_TRUE          0x00000001
-+#define NVC0C0_QMDV02_00_IS_QUEUE                                  MW(200:200)
-+#define NVC0C0_QMDV02_00_IS_QUEUE_FALSE                            0x00000000
-+#define NVC0C0_QMDV02_00_IS_QUEUE_TRUE                             0x00000001
-+#define NVC0C0_QMDV02_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST      MW(201:201)
-+#define NVC0C0_QMDV02_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
-+#define NVC0C0_QMDV02_00_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
-+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE0                 MW(202:202)
-+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE0_FALSE           0x00000000
-+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE0_TRUE            0x00000001
-+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE1                 MW(203:203)
-+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE1_FALSE           0x00000000
-+#define NVC0C0_QMDV02_00_SEMAPHORE_RELEASE_ENABLE1_TRUE            0x00000001
-+#define NVC0C0_QMDV02_00_REQUIRE_SCHEDULING_PCAS                   MW(204:204)
-+#define NVC0C0_QMDV02_00_REQUIRE_SCHEDULING_PCAS_FALSE             0x00000000
-+#define NVC0C0_QMDV02_00_REQUIRE_SCHEDULING_PCAS_TRUE              0x00000001
-+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_SCHEDULE_ENABLE             MW(205:205)
-+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE       0x00000000
-+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE        0x00000001
-+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_TYPE                        MW(206:206)
-+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_TYPE_QUEUE                  0x00000000
-+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_TYPE_GRID                   0x00000001
-+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_FIELD_COPY                  MW(207:207)
-+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_FIELD_COPY_FALSE            0x00000000
-+#define NVC0C0_QMDV02_00_DEPENDENT_QMD_FIELD_COPY_TRUE             0x00000001
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_B                            MW(223:208)
-+#define NVC0C0_QMDV02_00_CIRCULAR_QUEUE_SIZE                       MW(248:224)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_C                            MW(249:249)
-+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_HEADER_CACHE           MW(250:250)
-+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE     0x00000000
-+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE      0x00000001
-+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_SAMPLER_CACHE          MW(251:251)
-+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE    0x00000000
-+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE     0x00000001
-+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_DATA_CACHE             MW(252:252)
-+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_DATA_CACHE_FALSE       0x00000000
-+#define NVC0C0_QMDV02_00_INVALIDATE_TEXTURE_DATA_CACHE_TRUE        0x00000001
-+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_DATA_CACHE              MW(253:253)
-+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_DATA_CACHE_FALSE        0x00000000
-+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_DATA_CACHE_TRUE         0x00000001
-+#define NVC0C0_QMDV02_00_INVALIDATE_INSTRUCTION_CACHE              MW(254:254)
-+#define NVC0C0_QMDV02_00_INVALIDATE_INSTRUCTION_CACHE_FALSE        0x00000000
-+#define NVC0C0_QMDV02_00_INVALIDATE_INSTRUCTION_CACHE_TRUE         0x00000001
-+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_CONSTANT_CACHE          MW(255:255)
-+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE    0x00000000
-+#define NVC0C0_QMDV02_00_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE     0x00000001
-+#define NVC0C0_QMDV02_00_PROGRAM_OFFSET                            MW(287:256)
-+#define NVC0C0_QMDV02_00_CIRCULAR_QUEUE_ADDR_LOWER                 MW(319:288)
-+#define NVC0C0_QMDV02_00_CIRCULAR_QUEUE_ADDR_UPPER                 MW(327:320)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_D                            MW(335:328)
-+#define NVC0C0_QMDV02_00_CIRCULAR_QUEUE_ENTRY_SIZE                 MW(351:336)
-+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_ID                    MW(357:352)
-+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE       MW(365:358)
-+#define NVC0C0_QMDV02_00_RELEASE_MEMBAR_TYPE                       MW(366:366)
-+#define NVC0C0_QMDV02_00_RELEASE_MEMBAR_TYPE_FE_NONE               0x00000000
-+#define NVC0C0_QMDV02_00_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR          0x00000001
-+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_INCR_ENABLE           MW(367:367)
-+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE     0x00000000
-+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE      0x00000001
-+#define NVC0C0_QMDV02_00_CWD_MEMBAR_TYPE                           MW(369:368)
-+#define NVC0C0_QMDV02_00_CWD_MEMBAR_TYPE_L1_NONE                   0x00000000
-+#define NVC0C0_QMDV02_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR              0x00000001
-+#define NVC0C0_QMDV02_00_CWD_MEMBAR_TYPE_L1_MEMBAR                 0x00000003
-+#define NVC0C0_QMDV02_00_SEQUENTIALLY_RUN_CTAS                     MW(370:370)
-+#define NVC0C0_QMDV02_00_SEQUENTIALLY_RUN_CTAS_FALSE               0x00000000
-+#define NVC0C0_QMDV02_00_SEQUENTIALLY_RUN_CTAS_TRUE                0x00000001
-+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_DECR_ENABLE           MW(371:371)
-+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE     0x00000000
-+#define NVC0C0_QMDV02_00_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE      0x00000001
-+#define NVC0C0_QMDV02_00_THROTTLED                                 MW(372:372)
-+#define NVC0C0_QMDV02_00_THROTTLED_FALSE                           0x00000000
-+#define NVC0C0_QMDV02_00_THROTTLED_TRUE                            0x00000001
-+#define NVC0C0_QMDV02_00_API_VISIBLE_CALL_LIMIT                    MW(378:378)
-+#define NVC0C0_QMDV02_00_API_VISIBLE_CALL_LIMIT__32                0x00000000
-+#define NVC0C0_QMDV02_00_API_VISIBLE_CALL_LIMIT_NO_CHECK           0x00000001
-+#define NVC0C0_QMDV02_00_SAMPLER_INDEX                             MW(382:382)
-+#define NVC0C0_QMDV02_00_SAMPLER_INDEX_INDEPENDENTLY               0x00000000
-+#define NVC0C0_QMDV02_00_SAMPLER_INDEX_VIA_HEADER_INDEX            0x00000001
-+#define NVC0C0_QMDV02_00_CTA_RASTER_WIDTH                          MW(415:384)
-+#define NVC0C0_QMDV02_00_CTA_RASTER_HEIGHT                         MW(431:416)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED13A                           MW(447:432)
-+#define NVC0C0_QMDV02_00_CTA_RASTER_DEPTH                          MW(463:448)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED14A                           MW(479:464)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED15A                           MW(511:480)
-+#define NVC0C0_QMDV02_00_QUEUE_ENTRIES_PER_CTA_MINUS_ONE           MW(518:512)
-+#define NVC0C0_QMDV02_00_COALESCE_WAITING_PERIOD                   MW(529:522)
-+#define NVC0C0_QMDV02_00_SHARED_MEMORY_SIZE                        MW(561:544)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_G                            MW(575:562)
-+#define NVC0C0_QMDV02_00_QMD_VERSION                               MW(579:576)
-+#define NVC0C0_QMDV02_00_QMD_MAJOR_VERSION                         MW(583:580)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_H                            MW(591:584)
-+#define NVC0C0_QMDV02_00_CTA_THREAD_DIMENSION0                     MW(607:592)
-+#define NVC0C0_QMDV02_00_CTA_THREAD_DIMENSION1                     MW(623:608)
-+#define NVC0C0_QMDV02_00_CTA_THREAD_DIMENSION2                     MW(639:624)
-+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_VALID(i)                  MW((640+(i)*1):(640+(i)*1))
-+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_VALID_FALSE               0x00000000
-+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_VALID_TRUE                0x00000001
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_I                            MW(671:648)
-+#define NVC0C0_QMDV02_00_SM_DISABLE_MASK_LOWER                     MW(703:672)
-+#define NVC0C0_QMDV02_00_SM_DISABLE_MASK_UPPER                     MW(735:704)
-+#define NVC0C0_QMDV02_00_RELEASE0_ADDRESS_LOWER                    MW(767:736)
-+#define NVC0C0_QMDV02_00_RELEASE0_ADDRESS_UPPER                    MW(775:768)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_J                            MW(783:776)
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP                     MW(790:788)
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_INC             0x00000003
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_AND             0x00000005
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_OR              0x00000006
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_K                            MW(791:791)
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_FORMAT                 MW(793:792)
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_ENABLE                 MW(794:794)
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVC0C0_QMDV02_00_RELEASE0_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVC0C0_QMDV02_00_RELEASE0_STRUCTURE_SIZE                   MW(799:799)
-+#define NVC0C0_QMDV02_00_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVC0C0_QMDV02_00_RELEASE0_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVC0C0_QMDV02_00_RELEASE0_PAYLOAD                          MW(831:800)
-+#define NVC0C0_QMDV02_00_RELEASE1_ADDRESS_LOWER                    MW(863:832)
-+#define NVC0C0_QMDV02_00_RELEASE1_ADDRESS_UPPER                    MW(871:864)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_L                            MW(879:872)
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP                     MW(886:884)
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_INC             0x00000003
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_AND             0x00000005
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_OR              0x00000006
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_M                            MW(887:887)
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_FORMAT                 MW(889:888)
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_ENABLE                 MW(890:890)
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVC0C0_QMDV02_00_RELEASE1_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVC0C0_QMDV02_00_RELEASE1_STRUCTURE_SIZE                   MW(895:895)
-+#define NVC0C0_QMDV02_00_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVC0C0_QMDV02_00_RELEASE1_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVC0C0_QMDV02_00_RELEASE1_PAYLOAD                          MW(927:896)
-+#define NVC0C0_QMDV02_00_SHADER_LOCAL_MEMORY_LOW_SIZE              MW(951:928)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_N                            MW(954:952)
-+#define NVC0C0_QMDV02_00_BARRIER_COUNT                             MW(959:955)
-+#define NVC0C0_QMDV02_00_SHADER_LOCAL_MEMORY_HIGH_SIZE             MW(983:960)
-+#define NVC0C0_QMDV02_00_REGISTER_COUNT                            MW(991:984)
-+#define NVC0C0_QMDV02_00_SHADER_LOCAL_MEMORY_CRS_SIZE              MW(1015:992)
-+#define NVC0C0_QMDV02_00_SASS_VERSION                              MW(1023:1016)
-+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_ADDR_LOWER(i)             MW((1055+(i)*64):(1024+(i)*64))
-+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_ADDR_UPPER(i)             MW((1072+(i)*64):(1056+(i)*64))
-+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_RESERVED_ADDR(i)          MW((1073+(i)*64):(1073+(i)*64))
-+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_INVALIDATE(i)             MW((1074+(i)*64):(1074+(i)*64))
-+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_INVALIDATE_FALSE          0x00000000
-+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_INVALIDATE_TRUE           0x00000001
-+#define NVC0C0_QMDV02_00_CONSTANT_BUFFER_SIZE_SHIFTED4(i)          MW((1087+(i)*64):(1075+(i)*64))
-+#define NVC0C0_QMDV02_00_HW_ONLY_INNER_GET                         MW(1566:1536)
-+#define NVC0C0_QMDV02_00_HW_ONLY_REQUIRE_SCHEDULING_PCAS           MW(1567:1567)
-+#define NVC0C0_QMDV02_00_HW_ONLY_INNER_PUT                         MW(1598:1568)
-+#define NVC0C0_QMDV02_00_HW_ONLY_SCG_TYPE                          MW(1599:1599)
-+#define NVC0C0_QMDV02_00_HW_ONLY_SPAN_LIST_HEAD_INDEX              MW(1629:1600)
-+#define NVC0C0_QMDV02_00_QMD_RESERVED_Q                            MW(1630:1630)
-+#define NVC0C0_QMDV02_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID        MW(1631:1631)
-+#define NVC0C0_QMDV02_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE  0x00000000
-+#define NVC0C0_QMDV02_00_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE   0x00000001
-+#define NVC0C0_QMDV02_00_HW_ONLY_SKED_NEXT_QMD_POINTER             MW(1663:1632)
-+#define NVC0C0_QMDV02_00_CTA_RASTER_WIDTH_RESUME                   MW(1695:1664)
-+#define NVC0C0_QMDV02_00_CTA_RASTER_HEIGHT_RESUME                  MW(1711:1696)
-+#define NVC0C0_QMDV02_00_CTA_RASTER_DEPTH_RESUME                   MW(1727:1712)
-+#define NVC0C0_QMDV02_00_QMD_SPARE_G                               MW(1759:1728)
-+#define NVC0C0_QMDV02_00_QMD_SPARE_H                               MW(1791:1760)
-+#define NVC0C0_QMDV02_00_QMD_SPARE_I                               MW(1823:1792)
-+#define NVC0C0_QMDV02_00_QMD_SPARE_J                               MW(1855:1824)
-+#define NVC0C0_QMDV02_00_QMD_SPARE_K                               MW(1887:1856)
-+#define NVC0C0_QMDV02_00_QMD_SPARE_L                               MW(1919:1888)
-+#define NVC0C0_QMDV02_00_QMD_SPARE_M                               MW(1951:1920)
-+#define NVC0C0_QMDV02_00_QMD_SPARE_N                               MW(1983:1952)
-+#define NVC0C0_QMDV02_00_DEBUG_ID_UPPER                            MW(2015:1984)
-+#define NVC0C0_QMDV02_00_DEBUG_ID_LOWER                            MW(2047:2016)
-+
-+
-+/*
-+** Queue Meta Data, Version 02_01
-+ */
-+
-+#define NVC0C0_QMDV02_01_OUTER_PUT                                 MW(30:0)
-+#define NVC0C0_QMDV02_01_OUTER_OVERFLOW                            MW(31:31)
-+#define NVC0C0_QMDV02_01_OUTER_GET                                 MW(62:32)
-+#define NVC0C0_QMDV02_01_OUTER_STICKY_OVERFLOW                     MW(63:63)
-+#define NVC0C0_QMDV02_01_INNER_GET                                 MW(94:64)
-+#define NVC0C0_QMDV02_01_INNER_OVERFLOW                            MW(95:95)
-+#define NVC0C0_QMDV02_01_INNER_PUT                                 MW(126:96)
-+#define NVC0C0_QMDV02_01_INNER_STICKY_OVERFLOW                     MW(127:127)
-+#define NVC0C0_QMDV02_01_QMD_GROUP_ID                              MW(133:128)
-+#define NVC0C0_QMDV02_01_SM_GLOBAL_CACHING_ENABLE                  MW(134:134)
-+#define NVC0C0_QMDV02_01_RUN_CTA_IN_ONE_SM_PARTITION               MW(135:135)
-+#define NVC0C0_QMDV02_01_RUN_CTA_IN_ONE_SM_PARTITION_FALSE         0x00000000
-+#define NVC0C0_QMDV02_01_RUN_CTA_IN_ONE_SM_PARTITION_TRUE          0x00000001
-+#define NVC0C0_QMDV02_01_IS_QUEUE                                  MW(136:136)
-+#define NVC0C0_QMDV02_01_IS_QUEUE_FALSE                            0x00000000
-+#define NVC0C0_QMDV02_01_IS_QUEUE_TRUE                             0x00000001
-+#define NVC0C0_QMDV02_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST      MW(137:137)
-+#define NVC0C0_QMDV02_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
-+#define NVC0C0_QMDV02_01_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
-+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE0                 MW(138:138)
-+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE0_FALSE           0x00000000
-+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE0_TRUE            0x00000001
-+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE1                 MW(139:139)
-+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE1_FALSE           0x00000000
-+#define NVC0C0_QMDV02_01_SEMAPHORE_RELEASE_ENABLE1_TRUE            0x00000001
-+#define NVC0C0_QMDV02_01_REQUIRE_SCHEDULING_PCAS                   MW(140:140)
-+#define NVC0C0_QMDV02_01_REQUIRE_SCHEDULING_PCAS_FALSE             0x00000000
-+#define NVC0C0_QMDV02_01_REQUIRE_SCHEDULING_PCAS_TRUE              0x00000001
-+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_SCHEDULE_ENABLE             MW(141:141)
-+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE       0x00000000
-+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE        0x00000001
-+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_TYPE                        MW(142:142)
-+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_TYPE_QUEUE                  0x00000000
-+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_TYPE_GRID                   0x00000001
-+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_FIELD_COPY                  MW(143:143)
-+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_FIELD_COPY_FALSE            0x00000000
-+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_FIELD_COPY_TRUE             0x00000001
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_B                            MW(159:144)
-+#define NVC0C0_QMDV02_01_CIRCULAR_QUEUE_SIZE                       MW(184:160)
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_C                            MW(185:185)
-+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_HEADER_CACHE           MW(186:186)
-+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE     0x00000000
-+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE      0x00000001
-+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_SAMPLER_CACHE          MW(187:187)
-+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE    0x00000000
-+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE     0x00000001
-+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_DATA_CACHE             MW(188:188)
-+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_DATA_CACHE_FALSE       0x00000000
-+#define NVC0C0_QMDV02_01_INVALIDATE_TEXTURE_DATA_CACHE_TRUE        0x00000001
-+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_DATA_CACHE              MW(189:189)
-+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_DATA_CACHE_FALSE        0x00000000
-+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_DATA_CACHE_TRUE         0x00000001
-+#define NVC0C0_QMDV02_01_INVALIDATE_INSTRUCTION_CACHE              MW(190:190)
-+#define NVC0C0_QMDV02_01_INVALIDATE_INSTRUCTION_CACHE_FALSE        0x00000000
-+#define NVC0C0_QMDV02_01_INVALIDATE_INSTRUCTION_CACHE_TRUE         0x00000001
-+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_CONSTANT_CACHE          MW(191:191)
-+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE    0x00000000
-+#define NVC0C0_QMDV02_01_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE     0x00000001
-+#define NVC0C0_QMDV02_01_CTA_RASTER_WIDTH_RESUME                   MW(223:192)
-+#define NVC0C0_QMDV02_01_CTA_RASTER_HEIGHT_RESUME                  MW(239:224)
-+#define NVC0C0_QMDV02_01_CTA_RASTER_DEPTH_RESUME                   MW(255:240)
-+#define NVC0C0_QMDV02_01_PROGRAM_OFFSET                            MW(287:256)
-+#define NVC0C0_QMDV02_01_CIRCULAR_QUEUE_ADDR_LOWER                 MW(319:288)
-+#define NVC0C0_QMDV02_01_CIRCULAR_QUEUE_ADDR_UPPER                 MW(327:320)
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_D                            MW(335:328)
-+#define NVC0C0_QMDV02_01_CIRCULAR_QUEUE_ENTRY_SIZE                 MW(351:336)
-+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_ID                    MW(357:352)
-+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE       MW(365:358)
-+#define NVC0C0_QMDV02_01_RELEASE_MEMBAR_TYPE                       MW(366:366)
-+#define NVC0C0_QMDV02_01_RELEASE_MEMBAR_TYPE_FE_NONE               0x00000000
-+#define NVC0C0_QMDV02_01_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR          0x00000001
-+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_INCR_ENABLE           MW(367:367)
-+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE     0x00000000
-+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE      0x00000001
-+#define NVC0C0_QMDV02_01_CWD_MEMBAR_TYPE                           MW(369:368)
-+#define NVC0C0_QMDV02_01_CWD_MEMBAR_TYPE_L1_NONE                   0x00000000
-+#define NVC0C0_QMDV02_01_CWD_MEMBAR_TYPE_L1_SYSMEMBAR              0x00000001
-+#define NVC0C0_QMDV02_01_CWD_MEMBAR_TYPE_L1_MEMBAR                 0x00000003
-+#define NVC0C0_QMDV02_01_SEQUENTIALLY_RUN_CTAS                     MW(370:370)
-+#define NVC0C0_QMDV02_01_SEQUENTIALLY_RUN_CTAS_FALSE               0x00000000
-+#define NVC0C0_QMDV02_01_SEQUENTIALLY_RUN_CTAS_TRUE                0x00000001
-+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_DECR_ENABLE           MW(371:371)
-+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE     0x00000000
-+#define NVC0C0_QMDV02_01_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE      0x00000001
-+#define NVC0C0_QMDV02_01_THROTTLED                                 MW(372:372)
-+#define NVC0C0_QMDV02_01_THROTTLED_FALSE                           0x00000000
-+#define NVC0C0_QMDV02_01_THROTTLED_TRUE                            0x00000001
-+#define NVC0C0_QMDV02_01_API_VISIBLE_CALL_LIMIT                    MW(378:378)
-+#define NVC0C0_QMDV02_01_API_VISIBLE_CALL_LIMIT__32                0x00000000
-+#define NVC0C0_QMDV02_01_API_VISIBLE_CALL_LIMIT_NO_CHECK           0x00000001
-+#define NVC0C0_QMDV02_01_SAMPLER_INDEX                             MW(382:382)
-+#define NVC0C0_QMDV02_01_SAMPLER_INDEX_INDEPENDENTLY               0x00000000
-+#define NVC0C0_QMDV02_01_SAMPLER_INDEX_VIA_HEADER_INDEX            0x00000001
-+#define NVC0C0_QMDV02_01_CTA_RASTER_WIDTH                          MW(415:384)
-+#define NVC0C0_QMDV02_01_CTA_RASTER_HEIGHT                         MW(431:416)
-+#define NVC0C0_QMDV02_01_QMD_RESERVED13A                           MW(447:432)
-+#define NVC0C0_QMDV02_01_CTA_RASTER_DEPTH                          MW(463:448)
-+#define NVC0C0_QMDV02_01_QMD_RESERVED14A                           MW(479:464)
-+#define NVC0C0_QMDV02_01_DEPENDENT_QMD_POINTER                     MW(511:480)
-+#define NVC0C0_QMDV02_01_QUEUE_ENTRIES_PER_CTA_MINUS_ONE           MW(518:512)
-+#define NVC0C0_QMDV02_01_COALESCE_WAITING_PERIOD                   MW(529:522)
-+#define NVC0C0_QMDV02_01_SHARED_MEMORY_SIZE                        MW(561:544)
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_G                            MW(575:562)
-+#define NVC0C0_QMDV02_01_QMD_VERSION                               MW(579:576)
-+#define NVC0C0_QMDV02_01_QMD_MAJOR_VERSION                         MW(583:580)
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_H                            MW(591:584)
-+#define NVC0C0_QMDV02_01_CTA_THREAD_DIMENSION0                     MW(607:592)
-+#define NVC0C0_QMDV02_01_CTA_THREAD_DIMENSION1                     MW(623:608)
-+#define NVC0C0_QMDV02_01_CTA_THREAD_DIMENSION2                     MW(639:624)
-+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_VALID(i)                  MW((640+(i)*1):(640+(i)*1))
-+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_VALID_FALSE               0x00000000
-+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_VALID_TRUE                0x00000001
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_I                            MW(671:648)
-+#define NVC0C0_QMDV02_01_SM_DISABLE_MASK_LOWER                     MW(703:672)
-+#define NVC0C0_QMDV02_01_SM_DISABLE_MASK_UPPER                     MW(735:704)
-+#define NVC0C0_QMDV02_01_RELEASE0_ADDRESS_LOWER                    MW(767:736)
-+#define NVC0C0_QMDV02_01_RELEASE0_ADDRESS_UPPER                    MW(775:768)
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_J                            MW(783:776)
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP                     MW(790:788)
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_INC             0x00000003
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_AND             0x00000005
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_OR              0x00000006
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_K                            MW(791:791)
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_FORMAT                 MW(793:792)
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_ENABLE                 MW(794:794)
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVC0C0_QMDV02_01_RELEASE0_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVC0C0_QMDV02_01_RELEASE0_STRUCTURE_SIZE                   MW(799:799)
-+#define NVC0C0_QMDV02_01_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVC0C0_QMDV02_01_RELEASE0_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVC0C0_QMDV02_01_RELEASE0_PAYLOAD                          MW(831:800)
-+#define NVC0C0_QMDV02_01_RELEASE1_ADDRESS_LOWER                    MW(863:832)
-+#define NVC0C0_QMDV02_01_RELEASE1_ADDRESS_UPPER                    MW(871:864)
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_L                            MW(879:872)
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP                     MW(886:884)
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_INC             0x00000003
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_AND             0x00000005
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_OR              0x00000006
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_M                            MW(887:887)
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_FORMAT                 MW(889:888)
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_ENABLE                 MW(890:890)
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVC0C0_QMDV02_01_RELEASE1_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVC0C0_QMDV02_01_RELEASE1_STRUCTURE_SIZE                   MW(895:895)
-+#define NVC0C0_QMDV02_01_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVC0C0_QMDV02_01_RELEASE1_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVC0C0_QMDV02_01_RELEASE1_PAYLOAD                          MW(927:896)
-+#define NVC0C0_QMDV02_01_SHADER_LOCAL_MEMORY_LOW_SIZE              MW(951:928)
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_N                            MW(954:952)
-+#define NVC0C0_QMDV02_01_BARRIER_COUNT                             MW(959:955)
-+#define NVC0C0_QMDV02_01_SHADER_LOCAL_MEMORY_HIGH_SIZE             MW(983:960)
-+#define NVC0C0_QMDV02_01_REGISTER_COUNT                            MW(991:984)
-+#define NVC0C0_QMDV02_01_SHADER_LOCAL_MEMORY_CRS_SIZE              MW(1015:992)
-+#define NVC0C0_QMDV02_01_SASS_VERSION                              MW(1023:1016)
-+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_ADDR_LOWER(i)             MW((1055+(i)*64):(1024+(i)*64))
-+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_ADDR_UPPER(i)             MW((1072+(i)*64):(1056+(i)*64))
-+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_RESERVED_ADDR(i)          MW((1073+(i)*64):(1073+(i)*64))
-+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_INVALIDATE(i)             MW((1074+(i)*64):(1074+(i)*64))
-+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_INVALIDATE_FALSE          0x00000000
-+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_INVALIDATE_TRUE           0x00000001
-+#define NVC0C0_QMDV02_01_CONSTANT_BUFFER_SIZE_SHIFTED4(i)          MW((1087+(i)*64):(1075+(i)*64))
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_R                            MW(1567:1536)
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_S                            MW(1599:1568)
-+#define NVC0C0_QMDV02_01_HW_ONLY_INNER_GET                         MW(1630:1600)
-+#define NVC0C0_QMDV02_01_HW_ONLY_REQUIRE_SCHEDULING_PCAS           MW(1631:1631)
-+#define NVC0C0_QMDV02_01_HW_ONLY_INNER_PUT                         MW(1662:1632)
-+#define NVC0C0_QMDV02_01_HW_ONLY_SCG_TYPE                          MW(1663:1663)
-+#define NVC0C0_QMDV02_01_HW_ONLY_SPAN_LIST_HEAD_INDEX              MW(1693:1664)
-+#define NVC0C0_QMDV02_01_QMD_RESERVED_Q                            MW(1694:1694)
-+#define NVC0C0_QMDV02_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID        MW(1695:1695)
-+#define NVC0C0_QMDV02_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE  0x00000000
-+#define NVC0C0_QMDV02_01_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE   0x00000001
-+#define NVC0C0_QMDV02_01_HW_ONLY_SKED_NEXT_QMD_POINTER             MW(1727:1696)
-+#define NVC0C0_QMDV02_01_QMD_SPARE_G                               MW(1759:1728)
-+#define NVC0C0_QMDV02_01_QMD_SPARE_H                               MW(1791:1760)
-+#define NVC0C0_QMDV02_01_QMD_SPARE_I                               MW(1823:1792)
-+#define NVC0C0_QMDV02_01_QMD_SPARE_J                               MW(1855:1824)
-+#define NVC0C0_QMDV02_01_QMD_SPARE_K                               MW(1887:1856)
-+#define NVC0C0_QMDV02_01_QMD_SPARE_L                               MW(1919:1888)
-+#define NVC0C0_QMDV02_01_QMD_SPARE_M                               MW(1951:1920)
-+#define NVC0C0_QMDV02_01_QMD_SPARE_N                               MW(1983:1952)
-+#define NVC0C0_QMDV02_01_DEBUG_ID_UPPER                            MW(2015:1984)
-+#define NVC0C0_QMDV02_01_DEBUG_ID_LOWER                            MW(2047:2016)
-+
-+
-+
-+#endif // #ifndef __CLC0C0QMD_H__
-diff --git a/src/gallium/drivers/nouveau/nvc0/clc3c0qmd.h b/src/gallium/drivers/nouveau/nvc0/clc3c0qmd.h
-new file mode 100644
-index 00000000000..588cc639d32
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/nvc0/clc3c0qmd.h
-@@ -0,0 +1,245 @@
-+/*******************************************************************************
-+    Copyright (c) 2001-2010 NVIDIA Corporation
-+
-+    Permission is hereby granted, free of charge, to any person obtaining a copy
-+    of this software and associated documentation files (the "Software"), to
-+    deal in the Software without restriction, including without limitation the
-+    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-+    sell copies of the Software, and to permit persons to whom the Software is
-+    furnished to do so, subject to the following conditions:
-+
-+    The above copyright notice and this permission notice shall be
-+    included in all copies or substantial portions of the Software.
-+
-+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-+    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-+    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-+    DEALINGS IN THE SOFTWARE.
-+
-+*******************************************************************************/
-+
-+/* AUTO GENERATED FILE -- DO NOT EDIT */
-+
-+#ifndef __CLC3C0QMD_H__
-+#define __CLC3C0QMD_H__
-+
-+/*
-+** Queue Meta Data, Version 02_02
-+ */
-+
-+// The below C preprocessor definitions describe "multi-word" structures, where
-+// fields may have bit numbers beyond 32.  For example, MW(127:96) means
-+// the field is in bits 0-31 of word number 3 of the structure.  The "MW(X:Y)"
-+// syntax is to distinguish from similar "X:Y" single-word definitions: the
-+// macros historically used for single-word definitions would fail with
-+// multi-word definitions.
-+//
-+// See nvmisc.h:DRF_VAL_MW() in the source code of the kernel
-+// interface layer of nvidia.ko for an example of how to manipulate
-+// these MW(X:Y) definitions.
-+
-+#define NVC3C0_QMDV02_02_OUTER_PUT                                 MW(30:0)
-+#define NVC3C0_QMDV02_02_OUTER_OVERFLOW                            MW(31:31)
-+#define NVC3C0_QMDV02_02_OUTER_GET                                 MW(62:32)
-+#define NVC3C0_QMDV02_02_OUTER_STICKY_OVERFLOW                     MW(63:63)
-+#define NVC3C0_QMDV02_02_INNER_GET                                 MW(94:64)
-+#define NVC3C0_QMDV02_02_INNER_OVERFLOW                            MW(95:95)
-+#define NVC3C0_QMDV02_02_INNER_PUT                                 MW(126:96)
-+#define NVC3C0_QMDV02_02_INNER_STICKY_OVERFLOW                     MW(127:127)
-+#define NVC3C0_QMDV02_02_QMD_GROUP_ID                              MW(133:128)
-+#define NVC3C0_QMDV02_02_SM_GLOBAL_CACHING_ENABLE                  MW(134:134)
-+#define NVC3C0_QMDV02_02_RUN_CTA_IN_ONE_SM_PARTITION               MW(135:135)
-+#define NVC3C0_QMDV02_02_RUN_CTA_IN_ONE_SM_PARTITION_FALSE         0x00000000
-+#define NVC3C0_QMDV02_02_RUN_CTA_IN_ONE_SM_PARTITION_TRUE          0x00000001
-+#define NVC3C0_QMDV02_02_IS_QUEUE                                  MW(136:136)
-+#define NVC3C0_QMDV02_02_IS_QUEUE_FALSE                            0x00000000
-+#define NVC3C0_QMDV02_02_IS_QUEUE_TRUE                             0x00000001
-+#define NVC3C0_QMDV02_02_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST      MW(137:137)
-+#define NVC3C0_QMDV02_02_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_FALSE 0x00000000
-+#define NVC3C0_QMDV02_02_ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST_TRUE 0x00000001
-+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE0                 MW(138:138)
-+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE0_FALSE           0x00000000
-+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE0_TRUE            0x00000001
-+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE1                 MW(139:139)
-+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE1_FALSE           0x00000000
-+#define NVC3C0_QMDV02_02_SEMAPHORE_RELEASE_ENABLE1_TRUE            0x00000001
-+#define NVC3C0_QMDV02_02_REQUIRE_SCHEDULING_PCAS                   MW(140:140)
-+#define NVC3C0_QMDV02_02_REQUIRE_SCHEDULING_PCAS_FALSE             0x00000000
-+#define NVC3C0_QMDV02_02_REQUIRE_SCHEDULING_PCAS_TRUE              0x00000001
-+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_SCHEDULE_ENABLE             MW(141:141)
-+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_SCHEDULE_ENABLE_FALSE       0x00000000
-+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_SCHEDULE_ENABLE_TRUE        0x00000001
-+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_TYPE                        MW(142:142)
-+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_TYPE_QUEUE                  0x00000000
-+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_TYPE_GRID                   0x00000001
-+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_FIELD_COPY                  MW(143:143)
-+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_FIELD_COPY_FALSE            0x00000000
-+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_FIELD_COPY_TRUE             0x00000001
-+#define NVC3C0_QMDV02_02_QMD_RESERVED_B                            MW(159:144)
-+#define NVC3C0_QMDV02_02_CIRCULAR_QUEUE_SIZE                       MW(184:160)
-+#define NVC3C0_QMDV02_02_QMD_RESERVED_C                            MW(185:185)
-+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_HEADER_CACHE           MW(186:186)
-+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_HEADER_CACHE_FALSE     0x00000000
-+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_HEADER_CACHE_TRUE      0x00000001
-+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_SAMPLER_CACHE          MW(187:187)
-+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_SAMPLER_CACHE_FALSE    0x00000000
-+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_SAMPLER_CACHE_TRUE     0x00000001
-+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_DATA_CACHE             MW(188:188)
-+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_DATA_CACHE_FALSE       0x00000000
-+#define NVC3C0_QMDV02_02_INVALIDATE_TEXTURE_DATA_CACHE_TRUE        0x00000001
-+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_DATA_CACHE              MW(189:189)
-+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_DATA_CACHE_FALSE        0x00000000
-+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_DATA_CACHE_TRUE         0x00000001
-+#define NVC3C0_QMDV02_02_INVALIDATE_INSTRUCTION_CACHE              MW(190:190)
-+#define NVC3C0_QMDV02_02_INVALIDATE_INSTRUCTION_CACHE_FALSE        0x00000000
-+#define NVC3C0_QMDV02_02_INVALIDATE_INSTRUCTION_CACHE_TRUE         0x00000001
-+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_CONSTANT_CACHE          MW(191:191)
-+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_CONSTANT_CACHE_FALSE    0x00000000
-+#define NVC3C0_QMDV02_02_INVALIDATE_SHADER_CONSTANT_CACHE_TRUE     0x00000001
-+#define NVC3C0_QMDV02_02_CTA_RASTER_WIDTH_RESUME                   MW(223:192)
-+#define NVC3C0_QMDV02_02_CTA_RASTER_HEIGHT_RESUME                  MW(239:224)
-+#define NVC3C0_QMDV02_02_CTA_RASTER_DEPTH_RESUME                   MW(255:240)
-+#define NVC3C0_QMDV02_02_PROGRAM_OFFSET                            MW(287:256)
-+#define NVC3C0_QMDV02_02_CIRCULAR_QUEUE_ADDR_LOWER                 MW(319:288)
-+#define NVC3C0_QMDV02_02_CIRCULAR_QUEUE_ADDR_UPPER                 MW(327:320)
-+#define NVC3C0_QMDV02_02_QMD_RESERVED_D                            MW(335:328)
-+#define NVC3C0_QMDV02_02_CIRCULAR_QUEUE_ENTRY_SIZE                 MW(351:336)
-+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_ID                    MW(357:352)
-+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_DELTA_MINUS_ONE       MW(365:358)
-+#define NVC3C0_QMDV02_02_RELEASE_MEMBAR_TYPE                       MW(366:366)
-+#define NVC3C0_QMDV02_02_RELEASE_MEMBAR_TYPE_FE_NONE               0x00000000
-+#define NVC3C0_QMDV02_02_RELEASE_MEMBAR_TYPE_FE_SYSMEMBAR          0x00000001
-+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_INCR_ENABLE           MW(367:367)
-+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_INCR_ENABLE_FALSE     0x00000000
-+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_INCR_ENABLE_TRUE      0x00000001
-+#define NVC3C0_QMDV02_02_CWD_MEMBAR_TYPE                           MW(369:368)
-+#define NVC3C0_QMDV02_02_CWD_MEMBAR_TYPE_L1_NONE                   0x00000000
-+#define NVC3C0_QMDV02_02_CWD_MEMBAR_TYPE_L1_SYSMEMBAR              0x00000001
-+#define NVC3C0_QMDV02_02_CWD_MEMBAR_TYPE_L1_MEMBAR                 0x00000003
-+#define NVC3C0_QMDV02_02_SEQUENTIALLY_RUN_CTAS                     MW(370:370)
-+#define NVC3C0_QMDV02_02_SEQUENTIALLY_RUN_CTAS_FALSE               0x00000000
-+#define NVC3C0_QMDV02_02_SEQUENTIALLY_RUN_CTAS_TRUE                0x00000001
-+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_DECR_ENABLE           MW(371:371)
-+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_DECR_ENABLE_FALSE     0x00000000
-+#define NVC3C0_QMDV02_02_CWD_REFERENCE_COUNT_DECR_ENABLE_TRUE      0x00000001
-+#define NVC3C0_QMDV02_02_API_VISIBLE_CALL_LIMIT                    MW(378:378)
-+#define NVC3C0_QMDV02_02_API_VISIBLE_CALL_LIMIT__32                0x00000000
-+#define NVC3C0_QMDV02_02_API_VISIBLE_CALL_LIMIT_NO_CHECK           0x00000001
-+#define NVC3C0_QMDV02_02_SAMPLER_INDEX                             MW(382:382)
-+#define NVC3C0_QMDV02_02_SAMPLER_INDEX_INDEPENDENTLY               0x00000000
-+#define NVC3C0_QMDV02_02_SAMPLER_INDEX_VIA_HEADER_INDEX            0x00000001
-+#define NVC3C0_QMDV02_02_CTA_RASTER_WIDTH                          MW(415:384)
-+#define NVC3C0_QMDV02_02_CTA_RASTER_HEIGHT                         MW(431:416)
-+#define NVC3C0_QMDV02_02_QMD_RESERVED13A                           MW(447:432)
-+#define NVC3C0_QMDV02_02_CTA_RASTER_DEPTH                          MW(463:448)
-+#define NVC3C0_QMDV02_02_QMD_RESERVED14A                           MW(479:464)
-+#define NVC3C0_QMDV02_02_DEPENDENT_QMD_POINTER                     MW(511:480)
-+#define NVC3C0_QMDV02_02_QUEUE_ENTRIES_PER_CTA_MINUS_ONE           MW(518:512)
-+#define NVC3C0_QMDV02_02_COALESCE_WAITING_PERIOD                   MW(529:522)
-+#define NVC3C0_QMDV02_02_SHARED_MEMORY_SIZE                        MW(561:544)
-+#define NVC3C0_QMDV02_02_MIN_SM_CONFIG_SHARED_MEM_SIZE             MW(568:562)
-+#define NVC3C0_QMDV02_02_MAX_SM_CONFIG_SHARED_MEM_SIZE             MW(575:569)
-+#define NVC3C0_QMDV02_02_QMD_VERSION                               MW(579:576)
-+#define NVC3C0_QMDV02_02_QMD_MAJOR_VERSION                         MW(583:580)
-+#define NVC3C0_QMDV02_02_QMD_RESERVED_H                            MW(591:584)
-+#define NVC3C0_QMDV02_02_CTA_THREAD_DIMENSION0                     MW(607:592)
-+#define NVC3C0_QMDV02_02_CTA_THREAD_DIMENSION1                     MW(623:608)
-+#define NVC3C0_QMDV02_02_CTA_THREAD_DIMENSION2                     MW(639:624)
-+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_VALID(i)                  MW((640+(i)*1):(640+(i)*1))
-+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_VALID_FALSE               0x00000000
-+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_VALID_TRUE                0x00000001
-+#define NVC3C0_QMDV02_02_REGISTER_COUNT_V                          MW(656:648)
-+#define NVC3C0_QMDV02_02_TARGET_SM_CONFIG_SHARED_MEM_SIZE          MW(663:657)
-+#define NVC3C0_QMDV02_02_FREE_CTA_SLOTS_EMPTY_SM                   MW(671:664)
-+#define NVC3C0_QMDV02_02_SM_DISABLE_MASK_LOWER                     MW(703:672)
-+#define NVC3C0_QMDV02_02_SM_DISABLE_MASK_UPPER                     MW(735:704)
-+#define NVC3C0_QMDV02_02_RELEASE0_ADDRESS_LOWER                    MW(767:736)
-+#define NVC3C0_QMDV02_02_RELEASE0_ADDRESS_UPPER                    MW(775:768)
-+#define NVC3C0_QMDV02_02_QMD_RESERVED_J                            MW(783:776)
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP                     MW(790:788)
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_INC             0x00000003
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_AND             0x00000005
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_OR              0x00000006
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVC3C0_QMDV02_02_QMD_RESERVED_K                            MW(791:791)
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_FORMAT                 MW(793:792)
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_ENABLE                 MW(794:794)
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVC3C0_QMDV02_02_RELEASE0_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVC3C0_QMDV02_02_RELEASE0_STRUCTURE_SIZE                   MW(799:799)
-+#define NVC3C0_QMDV02_02_RELEASE0_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVC3C0_QMDV02_02_RELEASE0_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVC3C0_QMDV02_02_RELEASE0_PAYLOAD                          MW(831:800)
-+#define NVC3C0_QMDV02_02_RELEASE1_ADDRESS_LOWER                    MW(863:832)
-+#define NVC3C0_QMDV02_02_RELEASE1_ADDRESS_UPPER                    MW(871:864)
-+#define NVC3C0_QMDV02_02_QMD_RESERVED_L                            MW(879:872)
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP                     MW(886:884)
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_ADD             0x00000000
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_MIN             0x00000001
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_MAX             0x00000002
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_INC             0x00000003
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_DEC             0x00000004
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_AND             0x00000005
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_OR              0x00000006
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_OP_RED_XOR             0x00000007
-+#define NVC3C0_QMDV02_02_QMD_RESERVED_M                            MW(887:887)
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_FORMAT                 MW(889:888)
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_FORMAT_UNSIGNED_32     0x00000000
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_FORMAT_SIGNED_32       0x00000001
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_ENABLE                 MW(890:890)
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_ENABLE_FALSE           0x00000000
-+#define NVC3C0_QMDV02_02_RELEASE1_REDUCTION_ENABLE_TRUE            0x00000001
-+#define NVC3C0_QMDV02_02_RELEASE1_STRUCTURE_SIZE                   MW(895:895)
-+#define NVC3C0_QMDV02_02_RELEASE1_STRUCTURE_SIZE_FOUR_WORDS        0x00000000
-+#define NVC3C0_QMDV02_02_RELEASE1_STRUCTURE_SIZE_ONE_WORD          0x00000001
-+#define NVC3C0_QMDV02_02_RELEASE1_PAYLOAD                          MW(927:896)
-+#define NVC3C0_QMDV02_02_SHADER_LOCAL_MEMORY_LOW_SIZE              MW(951:928)
-+#define NVC3C0_QMDV02_02_QMD_RESERVED_N                            MW(954:952)
-+#define NVC3C0_QMDV02_02_BARRIER_COUNT                             MW(959:955)
-+#define NVC3C0_QMDV02_02_SHADER_LOCAL_MEMORY_HIGH_SIZE             MW(983:960)
-+#define NVC3C0_QMDV02_02_REGISTER_COUNT                            MW(991:984)
-+#define NVC3C0_QMDV02_02_SHADER_LOCAL_MEMORY_CRS_SIZE              MW(1015:992)
-+#define NVC3C0_QMDV02_02_SASS_VERSION                              MW(1023:1016)
-+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_ADDR_LOWER(i)             MW((1055+(i)*64):(1024+(i)*64))
-+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_ADDR_UPPER(i)             MW((1072+(i)*64):(1056+(i)*64))
-+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_RESERVED_ADDR(i)          MW((1073+(i)*64):(1073+(i)*64))
-+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_INVALIDATE(i)             MW((1074+(i)*64):(1074+(i)*64))
-+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_INVALIDATE_FALSE          0x00000000
-+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_INVALIDATE_TRUE           0x00000001
-+#define NVC3C0_QMDV02_02_CONSTANT_BUFFER_SIZE_SHIFTED4(i)          MW((1087+(i)*64):(1075+(i)*64))
-+#define NVC3C0_QMDV02_02_PROGRAM_ADDRESS_LOWER                     MW(1567:1536)
-+#define NVC3C0_QMDV02_02_PROGRAM_ADDRESS_UPPER                     MW(1584:1568)
-+#define NVC3C0_QMDV02_02_QMD_RESERVED_S                            MW(1599:1585)
-+#define NVC3C0_QMDV02_02_HW_ONLY_INNER_GET                         MW(1630:1600)
-+#define NVC3C0_QMDV02_02_HW_ONLY_REQUIRE_SCHEDULING_PCAS           MW(1631:1631)
-+#define NVC3C0_QMDV02_02_HW_ONLY_INNER_PUT                         MW(1662:1632)
-+#define NVC3C0_QMDV02_02_HW_ONLY_SCG_TYPE                          MW(1663:1663)
-+#define NVC3C0_QMDV02_02_HW_ONLY_SPAN_LIST_HEAD_INDEX              MW(1693:1664)
-+#define NVC3C0_QMDV02_02_QMD_RESERVED_Q                            MW(1694:1694)
-+#define NVC3C0_QMDV02_02_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID        MW(1695:1695)
-+#define NVC3C0_QMDV02_02_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_FALSE  0x00000000
-+#define NVC3C0_QMDV02_02_HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID_TRUE   0x00000001
-+#define NVC3C0_QMDV02_02_HW_ONLY_SKED_NEXT_QMD_POINTER             MW(1727:1696)
-+#define NVC3C0_QMDV02_02_QMD_SPARE_G                               MW(1759:1728)
-+#define NVC3C0_QMDV02_02_QMD_SPARE_H                               MW(1791:1760)
-+#define NVC3C0_QMDV02_02_QMD_SPARE_I                               MW(1823:1792)
-+#define NVC3C0_QMDV02_02_QMD_SPARE_J                               MW(1855:1824)
-+#define NVC3C0_QMDV02_02_QMD_SPARE_K                               MW(1887:1856)
-+#define NVC3C0_QMDV02_02_QMD_SPARE_L                               MW(1919:1888)
-+#define NVC3C0_QMDV02_02_QMD_SPARE_M                               MW(1951:1920)
-+#define NVC3C0_QMDV02_02_QMD_SPARE_N                               MW(1983:1952)
-+#define NVC3C0_QMDV02_02_DEBUG_ID_UPPER                            MW(2015:1984)
-+#define NVC3C0_QMDV02_02_DEBUG_ID_LOWER                            MW(2047:2016)
-+
-+
-+
-+#endif // #ifndef __CLC3C0QMD_H__
-diff --git a/src/gallium/drivers/nouveau/nvc0/drf.h b/src/gallium/drivers/nouveau/nvc0/drf.h
-new file mode 100644
-index 00000000000..bf95c8c3185
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/nvc0/drf.h
-@@ -0,0 +1,119 @@
-+/*
-+ * Copyright 2019 Red Hat Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+#ifndef __NVHW_DRF_H__
-+#define __NVHW_DRF_H__
-+
-+/* Helpers common to all DRF accessors. */
-+#define DRF_LO(drf)    (0 ? drf)
-+#define DRF_HI(drf)    (1 ? drf)
-+#define DRF_BITS(drf)  (DRF_HI(drf) - DRF_LO(drf) + 1)
-+#define DRF_MASK(drf)  (~0ULL >> (64 - DRF_BITS(drf)))
-+#define DRF_SMASK(drf) (DRF_MASK(drf) << DRF_LO(drf))
-+
-+/* Helpers for DRF-MW accessors. */
-+#define DRF_MX_MW(drf)      drf
-+#define DRF_MX(drf)         DRF_MX_##drf
-+#define DRF_MW(drf)         DRF_MX(drf)
-+#define DRF_MW_SPANS(o,drf) (DRF_LW_IDX((o),drf) != DRF_HW_IDX((o),drf))
-+#define DRF_MW_SIZE(o)      (sizeof((o)[0]) * 8)
-+
-+#define DRF_LW_IDX(o,drf)   (DRF_LO(DRF_MW(drf)) / DRF_MW_SIZE(o))
-+#define DRF_LW_LO(o,drf)    (DRF_LO(DRF_MW(drf)) % DRF_MW_SIZE(o))
-+#define DRF_LW_HI(o,drf)    (DRF_MW_SPANS((o),drf) ? (DRF_MW_SIZE(o) - 1) : DRF_HW_HI((o),drf))
-+#define DRF_LW_BITS(o,drf)  (DRF_LW_HI((o),drf) - DRF_LW_LO((o),drf) + 1)
-+#define DRF_LW_MASK(o,drf)  (~0ULL >> (64 - DRF_LW_BITS((o),drf)))
-+#define DRF_LW_SMASK(o,drf) (DRF_LW_MASK((o),drf) << DRF_LW_LO((o),drf))
-+#define DRF_LW_GET(o,drf)   (((o)[DRF_LW_IDX((o),drf)] >> DRF_LW_LO((o),drf)) & DRF_LW_MASK((o),drf))
-+#define DRF_LW_VAL(o,drf,v) (((v) & DRF_LW_MASK((o),drf)) << DRF_LW_LO((o),drf))
-+#define DRF_LW_CLR(o,drf)   ((o)[DRF_LW_IDX((o),drf)] & ~DRF_LW_SMASK((o),drf))
-+#define DRF_LW_SET(o,drf,v) (DRF_LW_CLR((o),drf) | DRF_LW_VAL((o),drf,(v)))
-+
-+#define DRF_HW_IDX(o,drf)   (DRF_HI(DRF_MW(drf)) / DRF_MW_SIZE(o))
-+#define DRF_HW_LO(o,drf)    0
-+#define DRF_HW_HI(o,drf)    (DRF_HI(DRF_MW(drf)) % DRF_MW_SIZE(o))
-+#define DRF_HW_BITS(o,drf)  (DRF_HW_HI((o),drf) - DRF_HW_LO((o),drf) + 1)
-+#define DRF_HW_MASK(o,drf)  (~0ULL >> (64 - DRF_HW_BITS((o),drf)))
-+#define DRF_HW_SMASK(o,drf) (DRF_HW_MASK((o),drf) << DRF_HW_LO((o),drf))
-+#define DRF_HW_GET(o,drf)   ((o)[DRF_HW_IDX(o,drf)] & DRF_HW_SMASK((o),drf))
-+#define DRF_HW_VAL(o,drf,v) (((long long)(v) >> DRF_LW_BITS((o),drf)) & DRF_HW_SMASK((o),drf))
-+#define DRF_HW_CLR(o,drf)   ((o)[DRF_HW_IDX((o),drf)] & ~DRF_HW_SMASK((o),drf))
-+#define DRF_HW_SET(o,drf,v) (DRF_HW_CLR((o),drf) | DRF_HW_VAL((o),drf,(v)))
-+
-+/* DRF accessors. */
-+#define NVVAL_X(drf,v) (((v) & DRF_MASK(drf)) << DRF_LO(drf))
-+#define NVVAL_N(X,d,r,f,  v) NVVAL_X(d##_##r##_##f, (v))
-+#define NVVAL_I(X,d,r,f,i,v) NVVAL_X(d##_##r##_##f(i), (v))
-+#define NVVAL_(X,_1,_2,_3,_4,_5,IMPL,...) IMPL
-+#define NVVAL(A...) NVVAL_(X, ##A, NVVAL_I, NVVAL_N)(X, ##A)
-+
-+#define NVDEF_N(X,d,r,f,  v) NVVAL_X(d##_##r##_##f, d##_##r##_##f##_##v)
-+#define NVDEF_I(X,d,r,f,i,v) NVVAL_X(d##_##r##_##f(i), d##_##r##_##f##_##v)
-+#define NVDEF_(X,_1,_2,_3,_4,_5,IMPL,...) IMPL
-+#define NVDEF(A...) NVDEF_(X, ##A, NVDEF_I, NVDEF_N)(X, ##A)
-+
-+#define NVVAL_GET_X(o,drf) (((o) >> DRF_LO(drf)) & DRF_MASK(drf))
-+#define NVVAL_GET_N(X,o,d,r,f  ) NVVAL_GET_X(o, d##_##r##_##f)
-+#define NVVAL_GET_I(X,o,d,r,f,i) NVVAL_GET_X(o, d##_##r##_##f(i))
-+#define NVVAL_GET_(X,_1,_2,_3,_4,_5,IMPL,...) IMPL
-+#define NVVAL_GET(A...) NVVAL_GET_(X, ##A, NVVAL_GET_I, NVVAL_GET_N)(X, ##A)
-+
-+#define NVVAL_SET_X(o,drf,v) (((o) & ~DRF_SMASK(drf)) | NVVAL_X(drf, (v)))
-+#define NVVAL_SET_N(X,o,d,r,f,  v) NVVAL_SET_X(o, d##_##r##_##f, (v))
-+#define NVVAL_SET_I(X,o,d,r,f,i,v) NVVAL_SET_X(o, d##_##r##_##f(i), (v))
-+#define NVVAL_SET_(X,_1,_2,_3,_4,_5,_6,IMPL,...) IMPL
-+#define NVVAL_SET(A...) NVVAL_SET_(X, ##A, NVVAL_SET_I, NVVAL_SET_N)(X, ##A)
-+
-+#define NVDEF_SET_N(X,o,d,r,f,  v)                                             \
-+	NVVAL_SET_X(o, d##_##r##_##f,    d##_##r##_##f##_##v)
-+#define NVDEF_SET_I(X,o,d,r,f,i,v)                                             \
-+	NVVAL_SET_X(o, d##_##r##_##f(i), d##_##r##_##f##_##v)
-+#define NVDEF_SET_(X,_1,_2,_3,_4,_5,_6,IMPL,...) IMPL
-+#define NVDEF_SET(A...) NVDEF_SET_(X, ##A, NVDEF_SET_I, NVDEF_SET_N)(X, ##A)
-+
-+/* DRF-MW accessors. */
-+#define NVVAL_MW_GET_X(o,drf)                                                  \
-+	((DRF_MW_SPANS((o),drf) ?                                              \
-+	  (DRF_HW_GET((o),drf) << DRF_LW_BITS((o),drf)) : 0) | DRF_LW_GET((o),drf))
-+#define NVVAL_MW_GET_N(X,o,d,r,f  ) NVVAL_MW_GET_X((o), d##_##r##_##f)
-+#define NVVAL_MW_GET_I(X,o,d,r,f,i) NVVAL_MW_GET_X((o), d##_##r##_##f(i))
-+#define NVVAL_MW_GET_(X,_1,_2,_3,_4,_5,IMPL,...) IMPL
-+#define NVVAL_MW_GET(A...) NVVAL_MW_GET_(X, ##A, NVVAL_MW_GET_I, NVVAL_MW_GET_N)(X, ##A)
-+
-+#define NVVAL_MW_SET_X(o,drf,v) do {                                           \
-+	(o)[DRF_LW_IDX((o),drf)] = DRF_LW_SET((o),drf,(v));                    \
-+	if (DRF_MW_SPANS((o),drf))                                             \
-+		(o)[DRF_HW_IDX((o),drf)] = DRF_HW_SET((o),drf,(v));            \
-+} while(0)
-+#define NVVAL_MW_SET_N(X,o,d,r,f,  v) NVVAL_MW_SET_X((o), d##_##r##_##f, (v))
-+#define NVVAL_MW_SET_I(X,o,d,r,f,i,v) NVVAL_MW_SET_X((o), d##_##r##_##f(i), (v))
-+#define NVVAL_MW_SET_(X,_1,_2,_3,_4,_5,_6,IMPL,...) IMPL
-+#define NVVAL_MW_SET(A...)                                                     \
-+	NVVAL_MW_SET_(X, ##A, NVVAL_MW_SET_I, NVVAL_MW_SET_N)(X, ##A)
-+
-+#define NVDEF_MW_SET_N(X,o,d,r,f,  v)                                          \
-+	NVVAL_MW_SET_X(o, d##_##r##_##f,    d##_##r##_##f##_##v)
-+#define NVDEF_MW_SET_I(X,o,d,r,f,i,v)                                          \
-+	NVVAL_MW_SET_X(o, d##_##r##_##f(i), d##_##r##_##f##_##v)
-+#define NVDEF_MW_SET_(X,_1,_2,_3,_4,_5,_6,IMPL,...) IMPL
-+#define NVDEF_MW_SET(A...)                                                     \
-+	NVDEF_MW_SET_(X, ##A, NVDEF_MW_SET_I, NVDEF_MW_SET_N)(X, ##A)
-+#endif
-diff --git a/src/gallium/drivers/nouveau/nvc0/mme/comc597.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/comc597.mme.h
-new file mode 100644
-index 00000000000..390741cbd04
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/nvc0/mme/comc597.mme.h
-@@ -0,0 +1,904 @@
-+#define NV_MME_PRED_MODE_UUUU                0
-+#define NV_MME_PRED_MODE_TTTT                1
-+#define NV_MME_PRED_MODE_FFFF                2
-+#define NV_MME_PRED_MODE_TTUU                3
-+#define NV_MME_PRED_MODE_FFUU                4
-+#define NV_MME_PRED_MODE_TFUU                5
-+#define NV_MME_PRED_MODE_TUUU                6
-+#define NV_MME_PRED_MODE_FUUU                7
-+#define NV_MME_PRED_MODE_UUTT                8
-+#define NV_MME_PRED_MODE_UUTF                9
-+#define NV_MME_PRED_MODE_UUTU                10
-+#define NV_MME_PRED_MODE_UUFT                11
-+#define NV_MME_PRED_MODE_UUFF                12
-+#define NV_MME_PRED_MODE_UUFU                13
-+#define NV_MME_PRED_MODE_UUUT                14
-+#define NV_MME_PRED_MODE_UUUF                15
-+
-+#define NV_MME_REG_R0                       0
-+#define NV_MME_REG_R1                       1
-+#define NV_MME_REG_R2                       2
-+#define NV_MME_REG_R3                       3
-+#define NV_MME_REG_R4                       4
-+#define NV_MME_REG_R5                       5
-+#define NV_MME_REG_R6                       6
-+#define NV_MME_REG_R7                       7
-+#define NV_MME_REG_R8                       8
-+#define NV_MME_REG_R9                       9
-+#define NV_MME_REG_R10                      10
-+#define NV_MME_REG_R11                      11
-+#define NV_MME_REG_R12                      12
-+#define NV_MME_REG_R13                      13
-+#define NV_MME_REG_R14                      14
-+#define NV_MME_REG_R15                      15
-+#define NV_MME_REG_R16                      16
-+#define NV_MME_REG_R17                      17
-+#define NV_MME_REG_R18                      18
-+#define NV_MME_REG_R19                      19
-+#define NV_MME_REG_R20                      20
-+#define NV_MME_REG_R21                      21
-+#define NV_MME_REG_R22                      22
-+#define NV_MME_REG_R23                      23
-+#define NV_MME_REG_ZERO                     24
-+#define NV_MME_REG_IMMED                    25
-+#define NV_MME_REG_IMMEDPAIR                26
-+#define NV_MME_REG_IMMED32                  27
-+#define NV_MME_REG_LOAD0                    28
-+#define NV_MME_REG_LOAD1                    29
-+
-+#define NV_MME_ALU_ADD                    0
-+#define NV_MME_ALU_ADDC                   1
-+#define NV_MME_ALU_SUB                    2
-+#define NV_MME_ALU_SUBB                   3
-+#define NV_MME_ALU_MUL                    4
-+#define NV_MME_ALU_MULH                   5
-+#define NV_MME_ALU_MULU                   6
-+#define NV_MME_ALU_EXTENDED               7
-+#define NV_MME_ALU_CLZ                    8
-+#define NV_MME_ALU_SLL                    9
-+#define NV_MME_ALU_SRL                    10
-+#define NV_MME_ALU_SRA                    11
-+#define NV_MME_ALU_AND                    12
-+#define NV_MME_ALU_NAND                   13
-+#define NV_MME_ALU_OR                     14
-+#define NV_MME_ALU_XOR                    15
-+#define NV_MME_ALU_MERGE                  16
-+#define NV_MME_ALU_SLT                    17
-+#define NV_MME_ALU_SLTU                   18
-+#define NV_MME_ALU_SLE                    19
-+#define NV_MME_ALU_SLEU                   20
-+#define NV_MME_ALU_SEQ                    21
-+#define NV_MME_ALU_STATE                  22
-+#define NV_MME_ALU_LOOP                   23
-+#define NV_MME_ALU_JAL                    24
-+#define NV_MME_ALU_BLT                    25
-+#define NV_MME_ALU_BLTU                   26
-+#define NV_MME_ALU_BLE                    27
-+#define NV_MME_ALU_BLEU                   28
-+#define NV_MME_ALU_BEQ                    29
-+#define NV_MME_ALU_DREAD                  30
-+#define NV_MME_ALU_DWRITE                 31
-+
-+#define NV_MME_OUT_NONE                 0
-+#define NV_MME_OUT_ALU0                 1
-+#define NV_MME_OUT_ALU1                 2
-+#define NV_MME_OUT_LOAD0                3
-+#define NV_MME_OUT_LOAD1                4
-+#define NV_MME_OUT_IMMED0               5
-+#define NV_MME_OUT_IMMED1               6
-+#define NV_MME_OUT_RESERVED             7
-+#define NV_MME_OUT_IMMEDHIGH0           8
-+#define NV_MME_OUT_IMMEDHIGH1           9
-+#define NV_MME_OUT_IMMED32_0            10
-+
-+#define MME_BITS(en,pm,pr,o0,d0,a0,b0,i0,o1,d1,a1,b1,i1,m0,e0,m1,e1)           \
-+   ((e1) << (92 - 64) | (m1) << (89 - 64) |                                    \
-+    (e0) << (85 - 64) | (m0) << (82 - 64) |                                    \
-+    (i1) << (66 - 64) | (b1) >> (64 - 61)),                                    \
-+   (((b1) & 7)  << (61 - 32) | (a1) << (56 - 32) |                             \
-+    (d1) << (51 - 32) | (o1) << (46 - 32) |                                    \
-+    (i0) >> (32 - 30)),                                                        \
-+   (((i0) & 3) << 30 | (b0) << 25 | (a0) << 20 | (d0) << 15 | (o0) << 10 |     \
-+    (pr) << 5 | (pm) << 1 | (en))
-+
-+#define MME_INSN(en,o0,d0,a0,b0,i0,m0,e0,o1,d1,a1,b1,i1,m1,e1)                 \
-+   MME_BITS((en), NV_MME_PRED_MODE_UUUU, NV_MME_REG_ZERO,                      \
-+            NV_MME_ALU_##o0, NV_MME_REG_##d0,                               \
-+            NV_MME_REG_##a0, NV_MME_REG_##b0, (i0),                            \
-+            NV_MME_ALU_##o1, NV_MME_REG_##d1,                               \
-+            NV_MME_REG_##a1, NV_MME_REG_##b1, (i1),                            \
-+            NV_MME_OUT_##m0, NV_MME_OUT_##e0,                                  \
-+            NV_MME_OUT_##m1, NV_MME_OUT_##e1)
-+
-+uint32_t mmec597_per_instance_bf[] = {
-+// r1 = load();      // count
-+// r3 = load();      // mask
-+// mthd(0x1880, 1);  // VERTEX_ARRAY_PER_INSTANCE[0]
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,  (1<<12)|0x1880/4, IMMED0,   NONE,
-+                 ADD,   R3, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+// while (HW_LOOP_COUNT < r1) {
-+//    send(r3 & 1);
-+//    r3 >>= 1;
-+// }
-+   MME_INSN(0,  LOOP, ZERO,    R1,  ZERO,            0x0003,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   AND, ZERO,    R3, IMMED,                 1,   NONE,   ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   SRL,   R3,    R3, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(1,   ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+};
-+
-+uint32_t mmec597_vertex_array_select[] = {
-+// r1 = load();            // array
-+// r2 = load();            // limit hi
-+// r3 = load();            // limit lo
-+// r4 = load();            // start hi
-+// r5 = load();            // start lo
-+// r6 = (r1 & 0x1f) << 2;
-+// r7 = (r1 & 0x1f) << 1;
-+// mthd(0x1c04 + r6, 1);   // VERTEX_ARRAY_START_HIGH[]
-+// send(r4);
-+// send(r5);
-+// mthd(0x0600 + r7, 1);   // VERTEX_ARRAY_LIMIT_HIGH[]
-+// send(r2);
-+// send(r3);
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+                 ADD,   R2, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD,   R3, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+                 ADD,   R4, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD,   R5, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+               MERGE,   R6,  ZERO,    R1,  (2<<10)|(5<<5)|0,   NONE,   NONE),
-+   MME_INSN(0, MERGE,   R7,  ZERO,    R1,  (1<<10)|(5<<5)|0,   ALU1,   NONE,
-+                 ADD, ZERO,    R6, IMMED,  (1<<12)|0x1c04/4,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO,    R4,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD, ZERO,    R5,  ZERO,                 0,   NONE,   ALU1),
-+   MME_INSN(1,   ADD, ZERO,    R7, IMMED,  (1<<12)|0x0600/4,   ALU0,   ALU1,
-+                 ADD, ZERO,    R2,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO,    R3,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+};
-+
-+uint32_t mmec597_blend_enables[] = {
-+// r1 = load();         // enable mask
-+// mthd(0x1360, 1);     // NVC0_3D_BLEND_ENABLE[]
-+// send((r1 >> 0) & 1);
-+// send((r1 >> 1) & 1);
-+// send((r1 >> 2) & 1);
-+// send((r1 >> 3) & 1);
-+// send((r1 >> 4) & 1);
-+// send((r1 >> 5) & 1);
-+// send((r1 >> 6) & 1);
-+// send((r1 >> 7) & 1);
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,                 0, IMMED1,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,  (1<<12)|0x1360/4,   NONE,   NONE),
-+   MME_INSN(0, MERGE, ZERO,  ZERO,    R1,  (0<<10)|(1<<5)|0,   NONE,   ALU0,
-+               MERGE, ZERO,  ZERO,    R1,  (0<<10)|(1<<5)|1,   NONE,   ALU1),
-+   MME_INSN(0, MERGE, ZERO,  ZERO,    R1,  (0<<10)|(1<<5)|2,   NONE,   ALU0,
-+               MERGE, ZERO,  ZERO,    R1,  (0<<10)|(1<<5)|3,   NONE,   ALU1),
-+   MME_INSN(1, MERGE, ZERO,  ZERO,    R1,  (0<<10)|(1<<5)|4,   NONE,   ALU0,
-+               MERGE, ZERO,  ZERO,    R1,  (0<<10)|(1<<5)|5,   NONE,   ALU1),
-+   MME_INSN(0, MERGE, ZERO,  ZERO,    R1,  (0<<10)|(1<<5)|6,   NONE,   ALU0,
-+               MERGE, ZERO,  ZERO,    R1,  (0<<10)|(1<<5)|7,   NONE,   ALU1),
-+};
-+
-+uint32_t mmec597_poly_mode_front[] = {
-+// r1 = load();
-+// mthd(0x0dac,0);      // POLYGON_MODE_FRONT
-+// send(r1);
-+// r2 = read(0x0db0);   // POLYGON_MODE_BACK
-+// r3 = read(0x20c0);   // SP_SELECT[3]
-+// r7 = r1 | r2;
-+// r4 = read(0x2100);   // SP_SELECT[4]
-+// r6 = 0x60;
-+// r7 = r7 & 1;
-+// if (r7 != 0)
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,  (0<<12)|0x0dac/4, IMMED0,   ALU0,
-+               STATE,   R2, IMMED,  ZERO,          0x0db0/4,   NONE,   NONE),
-+   MME_INSN(0, STATE,   R3, IMMED,  ZERO,          0x20c0/4,   NONE,   NONE,
-+                  OR,   R7,    R1,    R2,                 0,   NONE,   NONE),
-+   MME_INSN(0, STATE,   R4, IMMED,  ZERO,          0x2100/4,   NONE,   NONE,
-+                 ADD,   R6, IMMED,  ZERO,              0x60,   NONE,   NONE),
-+   MME_INSN(0,   AND,   R7,    R7, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R7,  ZERO,    (2<<14)|0x0002,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r6 = 0x200;
-+   MME_INSN(0,   ADD,   R6, IMMED,  ZERO,             0x200,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// r7 = r3 | r4;
-+// r7 = r7 & 1;
-+// if (r7 != 0)
-+   MME_INSN(0,    OR,   R7,    R3,    R4,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   AND,   R7,    R7, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R7,  ZERO,    (2<<14)|0x0002,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r6 = 0;
-+   MME_INSN(0,   ADD,   R6,  ZERO,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// mthd(0x02ec, 0);
-+// send(r6);
-+   MME_INSN(1,   ADD, ZERO,  ZERO,  ZERO,  (0<<12)|0x02ec/4, IMMED0,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO,    R6,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+};
-+
-+uint32_t mmec597_poly_mode_back[] = {
-+// r1 = load();
-+// mthd(0x0db0,0);      // POLYGON_MODE_BACK
-+// send(r1);
-+// r2 = read(0x0dac);   // POLYGON_MODE_FRONT
-+// r3 = read(0x20c0);   // SP_SELECT[3]
-+// r7 = r1 | r2;
-+// r4 = read(0x2100);   // SP_SELECT[4]
-+// r6 = 0x60;
-+// r7 = r7 & 1;
-+// if (r7 != 0)
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,  (0<<12)|0x0db0/4, IMMED0,   ALU0,
-+               STATE,   R2, IMMED,  ZERO,          0x0dac/4,   NONE,   NONE),
-+   MME_INSN(0, STATE,   R3, IMMED,  ZERO,          0x20c0/4,   NONE,   NONE,
-+                  OR,   R7,    R1,    R2,                 0,   NONE,   NONE),
-+   MME_INSN(0, STATE,   R4, IMMED,  ZERO,          0x2100/4,   NONE,   NONE,
-+                 ADD,   R6, IMMED,  ZERO,              0x60,   NONE,   NONE),
-+   MME_INSN(0,   AND,   R7,    R7, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R7,  ZERO,    (2<<14)|0x0002,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r6 = 0x200;
-+   MME_INSN(0,   ADD,   R6, IMMED,  ZERO,             0x200,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// r7 = r3 | r4;
-+// r7 = r7 & 1;
-+// if (r7 != 0)
-+   MME_INSN(0,    OR,   R7,    R3,    R4,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   AND,   R7,    R7, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R7,  ZERO,    (2<<14)|0x0002,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r6 = 0;
-+   MME_INSN(0,   ADD,   R6,  ZERO,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// mthd(0x02ec, 0);
-+// send(r6);
-+   MME_INSN(1,   ADD, ZERO,  ZERO,  ZERO,  (0<<12)|0x02ec/4, IMMED0,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO,    R6,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+};
-+
-+uint32_t mmec597_gp_select[] = {
-+// r1 = load();
-+// mthd(0x2100,0);      // SP_SELECT[4]
-+// send(r1);
-+// r2 = read(0x0dac);   // POLYGON_MODE_FRONT
-+// r3 = read(0x0db0);   // POLYGON_MODE_BACK
-+// r7 = r2 | r3;
-+// r4 = read(0x20c0);   // SP_SELECT[3]
-+// r6 = 0x60;
-+// r7 = r7 & 1;
-+// if (r7 != 0)
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,  (0<<12)|0x2100/4, IMMED0,   ALU0,
-+               STATE,   R2, IMMED,  ZERO,          0x0dac/4,   NONE,   NONE),
-+   MME_INSN(0, STATE,   R3, IMMED,  ZERO,          0x0db0/4,   NONE,   NONE,
-+                  OR,   R7,    R2,    R3,                 0,   NONE,   NONE),
-+   MME_INSN(0, STATE,   R4, IMMED,  ZERO,          0x20c0/4,   NONE,   NONE,
-+                 ADD,   R6, IMMED,  ZERO,              0x60,   NONE,   NONE),
-+   MME_INSN(0,   AND,   R7,    R7, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R7,  ZERO,    (2<<14)|0x0002,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r6 = 0x200;
-+   MME_INSN(0,   ADD,   R6, IMMED,  ZERO,             0x200,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// r7 = r1 | r4;
-+// r7 = r7 & 1;
-+// if (r7 != 0)
-+   MME_INSN(0,    OR,   R7,    R1,    R4,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   AND,   R7,    R7, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R7,  ZERO,    (2<<14)|0x0002,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r6 = 0;
-+   MME_INSN(0,   ADD,   R6,  ZERO,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// mthd(0x02ec, 0);
-+// send(r6);
-+   MME_INSN(1,   ADD, ZERO,  ZERO,  ZERO,  (0<<12)|0x02ec/4, IMMED0,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO,    R6,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+};
-+
-+uint32_t mmec597_tep_select[] = {
-+// r1 = load();
-+// mthd(0x20c0,0);      // SP_SELECT[3]
-+// send(r1);
-+// r2 = read(0x0dac);   // POLYGON_MODE_FRONT
-+// r3 = read(0x0db0);   // POLYGON_MODE_BACK
-+// r7 = r2 | r3;
-+// r4 = read(0x2100);   // SP_SELECT[4]
-+// r6 = 0x60;
-+// r7 = r7 & 1;
-+// if (r7 != 0)
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,  (0<<12)|0x20c0/4, IMMED0,   ALU0,
-+               STATE,   R2, IMMED,  ZERO,          0x0dac/4,   NONE,   NONE),
-+   MME_INSN(0, STATE,   R3, IMMED,  ZERO,          0x0db0/4,   NONE,   NONE,
-+                  OR,   R7,    R2,    R3,                 0,   NONE,   NONE),
-+   MME_INSN(0, STATE,   R4, IMMED,  ZERO,          0x2100/4,   NONE,   NONE,
-+                 ADD,   R6, IMMED,  ZERO,              0x60,   NONE,   NONE),
-+   MME_INSN(0,   AND,   R7,    R7, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R7,  ZERO,    (2<<14)|0x0002,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r6 = 0x200;
-+   MME_INSN(0,   ADD,   R6, IMMED,  ZERO,             0x200,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// r7 = r1 | r4;
-+// r7 = r7 & 1;
-+// if (r7 != 0)
-+   MME_INSN(0,    OR,   R7,    R1,    R4,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   AND,   R7,    R7, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R7,  ZERO,    (2<<14)|0x0002,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r6 = 0;
-+   MME_INSN(0,   ADD,   R6,  ZERO,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// mthd(0x02ec, 0);
-+// send(r6);
-+   MME_INSN(1,   ADD, ZERO,  ZERO,  ZERO,  (0<<12)|0x02ec/4, IMMED0,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO,    R6,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+};
-+
-+uint32_t mmec597_draw_arrays_indirect[] = {
-+// r1 = load();         // mode
-+// r5 = read(0x1438);   // VB_INSTANCE_BASE
-+// r6 = load();         // start_drawid
-+// r7 = load();         // numparams
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,                0,   NONE,   NONE,
-+                 ADD,   R6, LOAD1,  ZERO,                0,   NONE,   NONE),
-+   MME_INSN(0,   ADD,   R7, LOAD0,  ZERO,                0,   NONE,   NONE,
-+               STATE,   R5, IMMED,  ZERO,         0x1438/4,   NONE,   NONE),
-+// while (HW_LOOP_COUNT < r7) {
-+//    r2 = load();      // count
-+//    r3 = load();      // instance_count
-+//    mthd(0x0d74, 0);  // VERTEX_BUFFER_FIRST
-+//    send(load());     // start
-+//    r4 = load();      // start_instance
-+//    if (r3) {
-+   MME_INSN(0,  LOOP, ZERO,    R7,  ZERO,            0x000c,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD,   R2, LOAD0,  ZERO,          0x0d74/4, IMMED0,   NONE,
-+                 ADD,   R3, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO, LOAD0,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD,   R4, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R3,  ZERO,    (2<<14)|0x0008,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//       mthd(0x238c, 1);     // CB_POS
-+//       send(256 + 160);
-+//       send(0);             // base_vertex
-+//       send(r4);            // start_instance
-+//       send(r6);            // draw id
-+//       mthd(0x1438, 0);     // VB_INSTANCE_BASE
-+//       send(r4);
-+//       r1 = r1 & ~(1<<26);  // clear INSTANCE_NEXT
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,  (1<<12)|0x238c/4, IMMED0, IMMED1,
-+                 ADD, ZERO,  ZERO,  ZERO,         256 + 160,   NONE,   ALU0),
-+   MME_INSN(0,   ADD, ZERO,    R4,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD, ZERO,    R6,  ZERO,                 0,   NONE,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,    R4,  ZERO,          0x1438/4, IMMED0,   ALU0,
-+               MERGE,   R1,    R1,  ZERO, (26<<10)|(1<<5)|0,   NONE,   NONE),
-+//       do {
-+//          mthd(0x1618, 0);  // VERTEX_BEGIN_GL
-+//          send(r1);         // mode
-+//          mthd(0x0d78, 0);  // VERTEX_BUFFER_COUNT
-+//          send(r2);         // count
-+//          mthd(0x1614, 0);  // VERTEX_END_GL
-+//          send(0);
-+//          r1 |= (1<<26);    // set INSTANCE_NEXT
-+//       } while(--r3);
-+//    }
-+   MME_INSN(0,   ADD, ZERO,    R1,  ZERO,          0x1618/4, IMMED0,   ALU0,
-+                 ADD, ZERO,    R2,  ZERO,          0x0d78/4, IMMED1,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,          0x1614/4, IMMED0,   ALU0,
-+                 ADD,   R4, IMMED,  ZERO,                 1,   NONE,   NONE),
-+   MME_INSN(0, MERGE,   R1,    R1,    R4, (26<<10)|(1<<5)|0,   NONE,   NONE,
-+                 SUB,   R3,    R3, IMMED,                 1,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R3,  ZERO,    (1<<14)|0x3ffd,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r6 = r6 + 1;
-+// };
-+   MME_INSN(0,   ADD,   R6,    R6, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// mthd(0x1438, 0);  // restore VB_INSTANCE_BASE
-+// send(r5);
-+   MME_INSN(1,   ADD, ZERO,  ZERO,  ZERO,          0x1438/4, IMMED0,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO,    R5,  ZERO,                 0,   NONE,      ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+};
-+
-+uint32_t mmec597_draw_elts_indirect[] = {
-+// r1 = load();         // mode
-+// r8 = read(0x1434);   // VB_ELEMENT_BASE
-+// r9 = read(0x1438);   // VB_INSTANCE_BASE
-+// r6 = load();         // start_drawid
-+// r7 = load();         // numparams
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+               STATE,   R8, IMMED,  ZERO,          0x1434/4,   NONE,   NONE),
-+   MME_INSN(0, STATE,   R9, IMMED,  ZERO,          0x1438/4,   NONE,   NONE,
-+                 ADD,   R6, LOAD0,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD,   R7, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// while (HW_LOOP_COUNT < r7) {
-+//    r3 = load();      // count
-+//    r2 = load();      // instance_count
-+//    mthd(0x17dc, 0);  // INDEX_BATCH_FIRST
-+//    send(load());     // start
-+//    r4 = load();      // index_bias
-+//    mthd(0x238c, 1);  // CB_POS
-+//    send(256 + 160);
-+//    send(r4);         // index_bias
-+//    r5 = load();      // start_instance
-+//    if (r2) {
-+   MME_INSN(0,  LOOP, ZERO,    R7,  ZERO,            0x000d,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD,   R3, LOAD0,  ZERO,          0x17dc/4, IMMED0,   NONE,
-+                 ADD,   R2, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO, LOAD0,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD,   R4, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,  (1<<12)|0x238c/4, IMMED0, IMMED1,
-+                 ADD, ZERO,    R4,  ZERO,         256 + 160,   NONE,   ALU1),
-+   MME_INSN(0,   BEQ, ZERO,    R2,  ZERO,    (2<<14)|0x0008,   NONE,   NONE,
-+                 ADD,   R5, LOAD0,  ZERO,                 0,   NONE,   NONE),
-+//       send(r5);         // start_instance
-+//       send(r6);         // draw_id
-+//       mthd(0x1434, 1);  // VB_ELEMENT_BASE
-+//       send(r4);         // index_bias
-+//       send(r5);         // start_instance
-+//       mthd(0x1118, 0);  // VERTEX_ID_BASE
-+//       send(r4);         // index_bias
-+//       r1 &= ~(1 << 26); // clear INSTANCE_NEXT
-+   MME_INSN(0,   ADD, ZERO,    R5,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD, ZERO,    R6,  ZERO,                 0,   NONE,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,    R4,  ZERO,  (1<<12)|0x1434/4, IMMED0,   ALU0,
-+                 ADD, ZERO,    R5,  ZERO,                 0,   NONE,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,    R4,  ZERO,          0x1118/4, IMMED0,   ALU0,
-+               MERGE,   R1,    R1,  ZERO, (26<<10)|(1<<5)|0,   NONE,   NONE),
-+//       do {
-+//          mthd(0x1618, 0);  // VERTEX_BEGIN_GL
-+//          send(r1);         // mode
-+//          mthd(0x17e0, 0);  // INDEX_BATCH_COUNT
-+//          send(r3);         // count
-+//          mthd(0x1614, 0);  // VERTEX_END_GL
-+//          send(0);
-+//          r1 |= (1 << 26);  // set INSTANCE_NEXT
-+//       } while (--r2);
-+//    }
-+   MME_INSN(0,   ADD, ZERO,    R1,  ZERO,          0x1618/4, IMMED0,   ALU0,
-+                 ADD, ZERO,    R3,  ZERO,          0x17e0/4, IMMED1,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,          0x1614/4, IMMED0,   ALU0,
-+                 ADD,   R4, IMMED,  ZERO,                 1,   NONE,   NONE),
-+   MME_INSN(0, MERGE,   R1,    R1,    R4, (26<<10)|(1<<5)|0,   NONE,   NONE,
-+                 SUB,   R2,    R2, IMMED,                 1,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R2,  ZERO,    (1<<14)|0x3ffd,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//   r6 = r6 + 1;
-+// };
-+   MME_INSN(0,   ADD,   R6,    R6, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// mthd(0x1434, 1);
-+// send(r8);         // restore VB_ELEMENT_BASE
-+// send(r9);         // restore VB_INSTANCE_BASE
-+// mthd(0x1118, 0);
-+// send(r8);         // restore VERTEX_ID_BASE
-+   MME_INSN(1,   ADD, ZERO,    R8,  ZERO,  (1<<12)|0x1434/4, IMMED0,   ALU0,
-+                 ADD, ZERO,    R9,  ZERO,                 0,   NONE,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,    R8,  ZERO,          0x1118/4, IMMED0,   ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+};
-+
-+uint32_t mmec597_draw_arrays_indirect_count[] = {
-+// r1 = load();         // mode
-+// r6 = load();         // start_drawid
-+// r7 = load();         // numparams
-+// r5 = load();         // totaldraws
-+// r8 = read(0x1438);   // VB_INSTANCE_BASE
-+// r5 = r5 - r6;        // remaining draws
-+// if (r5 > r7)
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+                 ADD,   R6, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD,   R7, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+                 ADD,   R5, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0, STATE,   R8, IMMED,  ZERO,          0x1438/4,   NONE,   NONE,
-+                 SUB,   R5,    R5,    R6,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BLE, ZERO,    R5,    R7,    (2<<14)|0x0002,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r5 = r7;
-+   MME_INSN(0,   ADD,   R5,    R7,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// if (r5 >= 0) {
-+   MME_INSN(0,   BLT, ZERO,    R5,  ZERO,    (2<<14)|0x000e,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    while (HW_LOOP_COUNT < r5) {
-+//       r2 = load();      // count
-+//       r3 = load();      // instance_count
-+//       mthd(0x0d74, 0);  // VERTEX_BUFFER_FIRST
-+//       send(load());     // start
-+//       r4 = load();      // start_instance
-+//       if (r3) {
-+   MME_INSN(0,  LOOP, ZERO,    R5,  ZERO,            0x000c,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD,   R2, LOAD0,  ZERO,          0x0d74/4, IMMED0,   NONE,
-+                 ADD,   R3, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO, LOAD0,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD,   R4, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R3,  ZERO,    (2<<14)|0x0008,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//          mthd(0x238c, 1);  // CB_POS
-+//          send(256 + 160);
-+//          send(0);          // base_vertex
-+//          send(r4);         // start_instance
-+//          send(r6);         // draw_id
-+//          mthd(0x1438, 0);  // VB_INSTANCE_BASE
-+//          send(r4);
-+//          r1 &= ~(1 << 26); // clear INSTANCE_NEXT
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,  (1<<12)|0x238c/4, IMMED0, IMMED1,
-+                 ADD, ZERO,  ZERO,  ZERO,           256+160,   NONE,   ALU0),
-+   MME_INSN(0,   ADD, ZERO,    R4,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD, ZERO,    R6,  ZERO,                 0,   NONE,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,    R4,  ZERO,          0x1438/4, IMMED0,   ALU0,
-+               MERGE,   R1,    R1,  ZERO, (26<<10)|(1<<5)|0,   NONE,   NONE),
-+//          do {
-+//             mthd(0x1618, 0);  // VERTEX_BEGIN_GL
-+//             send(r1);         // mode
-+//             mthd(0x0d78, 0);  // VERTEX_BUFFER_COUNT
-+//             send(r2);
-+//             mthd(0x1614, 0);  // VERTEX_END_GL
-+//             send(0);
-+//             r1 |= (1 << 26);  // set INSTANCE_NEXT
-+//          } while (--r3);
-+//       }
-+   MME_INSN(0,   ADD, ZERO,    R1,  ZERO,          0x1618/4, IMMED0,   ALU0,
-+                 ADD, ZERO,    R2,  ZERO,          0x0d78/4, IMMED1,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,          0x1614/4, IMMED0,   ALU0,
-+                 ADD,   R4, IMMED,  ZERO,                 1,   NONE,   NONE),
-+   MME_INSN(0, MERGE,   R1,    R1,    R4, (26<<10)|(1<<5)|0,   NONE,   NONE,
-+                 SUB,   R3,    R3, IMMED,                 1,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R3,  ZERO,    (1<<14)|0x3ffd,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//       r6 = r6 + 1;   // draw_id++
-+//    }
-+   MME_INSN(0,   ADD,   R6,    R6, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r7 = r7 - r5;  // unneeded params
-+// }
-+   MME_INSN(0,   SUB,   R7,    R7,    R5,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// while (HW_LOOP_COUNT < r7) {
-+//    load();
-+//    load();
-+//    load();
-+//    load();
-+// }
-+   MME_INSN(0,  LOOP, ZERO,    R7,  ZERO,            0x0003,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+// exit mthd(0x1438, 0);   // VB_INSTANCE_BASE
-+// send(r8);
-+   MME_INSN(1,   ADD, ZERO,  ZERO,  ZERO,          0x1438/4, IMMED0,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO,    R8,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+};
-+
-+uint32_t mmec597_draw_elts_indirect_count[] = {
-+// r8 = read(0x1434);
-+// r1 = load();
-+// r9 = read(0x1438);
-+// r6 = load();
-+// r7 = load();
-+// r5 = load();
-+// r5 = r5 - r6;
-+// if (r5 > r7)
-+   MME_INSN(0, STATE,   R8, IMMED,  ZERO,          0x1434/4,   NONE,   NONE,
-+                 ADD,   R1, LOAD0,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0, STATE,   R9, IMMED,  ZERO,          0x1438/4,   NONE,   NONE,
-+                 ADD,   R6, LOAD0,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD,   R7, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+                 ADD,   R5, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   SUB,   R5,    R5,    R6,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BLE, ZERO,    R5,    R7,    (2<<14)|0x0002,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r5 = r7;
-+   MME_INSN(0,   ADD,   R5,    R7,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// if (r5 >= 0) {
-+   MME_INSN(0,   BLT, ZERO,    R5,  ZERO,    (2<<14)|0x000f,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    while (HW_LOOP_COUNT < r5) {
-+//       r3 = load();
-+//       r2 = load();
-+//       mthd(0x17dc, 0);
-+//       send(load());
-+//       r4 = load();
-+//       mthd(0x238c, 1);
-+//       send(256 + 160);
-+//       send(r4);
-+//       r10 = load();
-+//       if (r2) {
-+   MME_INSN(0,  LOOP, ZERO,    R5,  ZERO,            0x000d,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD,   R3, LOAD0,  ZERO,  (0<<12)|0x17dc/4, IMMED0,   NONE,
-+                 ADD,   R2, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO, LOAD0,  ZERO,  (1<<12)|0x238c/4,   NONE,   ALU0,
-+                 ADD,   R4, LOAD1,  ZERO,         256 + 160, IMMED0, IMMED1),
-+   MME_INSN(0,   ADD, ZERO,    R4,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD,  R10, LOAD0,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R2,  ZERO,    (2<<14)|0x0008,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//          send(r10);
-+//          send(r6);
-+//          mthd(0x1434, 1);
-+//          send(r4);
-+//          send(r10);
-+//          mthd(0x1118, 0);
-+//          send(r4);
-+//          r1 &= ~(1 << 26);
-+   MME_INSN(0,   ADD, ZERO,   R10,  ZERO,                 0,   NONE,   ALU0,
-+                 ADD, ZERO,    R6,  ZERO,                 0,   NONE,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,    R4,  ZERO,  (1<<12)|0x1434/4, IMMED0,   ALU0,
-+                 ADD, ZERO,   R10,  ZERO,                 0,   NONE,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,    R4,  ZERO,  (0<<12)|0x1118/4, IMMED0,   ALU0,
-+               MERGE,   R1,    R1,  ZERO, (26<<10)|(1<<5)|0,   NONE,   NONE),
-+//          do {
-+//             mthd(0x1618, 0);
-+//             send(r1);
-+//             mthd(0x17e0, 0);
-+//             send(r3);
-+//             mthd(0x1614, 0);
-+//             send(0);
-+//             r1 |= (1 << 26);
-+//          } while (--r2);
-+//       }
-+   MME_INSN(0,   ADD, ZERO,    R1,  ZERO,          0x1618/4, IMMED0,   ALU0,
-+                 ADD, ZERO,    R3,  ZERO,          0x17e0/4, IMMED1,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,          0x1614/4, IMMED0,   ALU0,
-+                 ADD,   R4, IMMED,  ZERO,                 1,   NONE,   NONE),
-+   MME_INSN(0, MERGE,   R1,    R1,    R4, (26<<10)|(1<<5)|0,   NONE,   NONE,
-+                 SUB,   R2,    R2, IMMED,                 1,   NONE,   NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R2,  ZERO,    (1<<14)|0x3ffd,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//       r6 = r6 + 1;
-+//    }
-+   MME_INSN(0,   ADD,   R6,    R6, IMMED,                 1,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+//    r7 = r7 - r5; // unneeded params
-+// }
-+   MME_INSN(0,   SUB,   R7,    R7,    R5,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// while (HW_LOOP_COUNT < r7) {
-+//    r2 = load();
-+//    r2 = load();
-+//    r2 = load();
-+//    r2 = load();
-+//    r2 = load();
-+// }
-+   MME_INSN(0,  LOOP, ZERO,    R7,  ZERO,            0x0004,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO, LOAD1,  ZERO,                 0,   NONE,   NONE),
-+   MME_INSN(0,   ADD, ZERO, LOAD0,  ZERO,                 0,   NONE,   NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+// mthd(0x1434, 1);
-+// send(r8);
-+// send(r9);
-+// exit mthd(0x1118, 0);
-+// send(r8);
-+   MME_INSN(1,   ADD, ZERO,    R8,  ZERO,  (1<<12)|0x1434/4, IMMED0,   ALU0,
-+                 ADD, ZERO,    R9,  ZERO,                 0,   NONE,   ALU1),
-+   MME_INSN(0,   ADD, ZERO,    R8,  ZERO,  (0<<12)|0x1118/4, IMMED0,   ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,   NONE),
-+};
-+
-+uint32_t mmec597_query_buffer_write[] = {
-+// r1 = load();   // clamp value
-+// r2 = load();   // end value (lo)
-+// r3 = load();   // end value (hi)
-+// r4 = load();   // start value (lo)
-+// r5 = load();   // start value (hi)
-+// r8 = load();   // desired sequence
-+// r9 = load();   // actual sequence
-+// r7 = load();   // query address (hi)
-+// r6 = load();   // query address (lo)
-+// if (r9 >= r8) {
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,                 0,   NONE,      NONE,
-+                 ADD,   R2, LOAD1,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD,   R3, LOAD0,  ZERO,                 0,   NONE,      NONE,
-+                 ADD,   R4, LOAD1,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD,   R5, LOAD0,  ZERO,                 0,   NONE,      NONE,
-+                 ADD,   R8, LOAD1,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD,   R9, LOAD0,  ZERO,                 0,   NONE,      NONE,
-+                 ADD,   R7, LOAD1,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD,   R6, LOAD0,  ZERO,                 0,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   BLT, ZERO,    R9,    R8,    (2<<14)|0x000e,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+//    [r3,r2] = [r3,r2] - [r5,r4];
-+//    if (r1) {
-+   MME_INSN(0,   SUB,   R2,    R2,    R4,                 0,   NONE,      NONE,
-+                SUBB,   R3,    R3,    R5,                 0,   NONE,      NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R1,  ZERO,    (2<<14)|0x0004,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+//       if (r3 != 0 || r1 < r2)
-+//          r2 = r1;
-+//    }
-+   MME_INSN(0,   BEQ, ZERO,    R3,  ZERO,    (1<<14)|0x0002,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,  BLTU, ZERO,    R1,    R2,    (1<<14)|0x0002,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD,   R2,    R1,  ZERO,                 0,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+//    mthd(0x1b00, 1);
-+//    send(r7);
-+//    send(r6);
-+//    send(r2)
-+//    send(0x10000000);
-+//    if (!r1) {
-+   MME_INSN(0,   ADD, ZERO,    R7,  ZERO,  (1<<12)|0x1b00/4, IMMED0,      ALU0,
-+                 ADD, ZERO,    R6,  ZERO,                 0,   NONE,      ALU1),
-+   MME_INSN(0,   ADD, ZERO,    R2,  ZERO,                 0,   NONE,      ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,            0x1000,   NONE, IMMED32_0,
-+                 ADD, ZERO,  ZERO,  ZERO,            0x0000,   NONE,      NONE),
-+   MME_INSN(0,   BEQ, ZERO,    R1,  ZERO,    (1<<14)|0x0004,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+//       [r7,r6] = [r7,r6] + 4;
-+//       mthd(0x1b00, 1);
-+//       send(r7);
-+//       send(r6);
-+//       send(r3);
-+//       send(0x10000000);
-+//    }
-+   MME_INSN(0,   ADD, ZERO,    R6, IMMED,                 4, IMMED1,      ALU1,
-+                ADDC, ZERO,    R7,  ZERO,  (1<<12)|0x1b00/4,   NONE,      ALU0),
-+   MME_INSN(0,   ADD, ZERO,    R3,  ZERO,                 0,   NONE,      ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,            0x1000,   NONE, IMMED32_0,
-+                 ADD, ZERO,  ZERO,  ZERO,            0x0000,   NONE,      NONE),
-+//    mthd(0x0110, 0);
-+//    send(0);
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,  (0<<12)|0x0110/4, IMMED0,      ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+// }
-+   MME_INSN(1,   ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+};
-+
-+uint32_t mmec597_conservative_raster_state[] = {
-+// r1 = load();
-+// mthd(0x3400, 1);
-+// send(0);
-+// send(((r1 >> 8) & 7) << 23);
-+// send(0x03800000);
-+// mthd(0x2310, 1);
-+// send(0x00418800);
-+// r2 = r1 & 0xf;
-+// r3 = 16;
-+// r2 = r2 | (((r1 >> 4) & 0xf) << 8);
-+// mthd(0x0a1c, 8);
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,  (1<<12)|0x3400/4, IMMED0,    IMMED1,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0, MERGE, ZERO,  ZERO,    R1, (23<<10)|(3<<5)|8,   NONE,      ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,            0x0380,   NONE, IMMED32_0,
-+                 ADD, ZERO,  ZERO,  ZERO,            0x0000,   NONE,      NONE),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,  (1<<12)|0x2310/4, IMMED0,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,            0x0000,   NONE,      NONE),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,            0x0041,   NONE, IMMED32_0,
-+                 ADD, ZERO,  ZERO,  ZERO,            0x8800,   NONE,      NONE),
-+   MME_INSN(0,   AND,   R2,    R1, IMMED,               0xf,   NONE,      NONE,
-+                 ADD,   R3,  ZERO, IMMED,                16,   NONE,      NONE),
-+   MME_INSN(0, MERGE,   R2,    R2,    R1,  (8<<10)|(4<<5)|4, IMMED1,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,  (8<<12)|0x0a1c/4,   NONE,      NONE),
-+// while (HW_LOOP_COUNT < r3)
-+//    send(r2);
-+   MME_INSN(0,  LOOP, ZERO,    R3,  ZERO,            0x0002,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD, ZERO,    R2,  ZERO,                 0,   NONE,      ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+// mthd(0x1148, 0);
-+// send(1);
-+   MME_INSN(1,   ADD, ZERO,  ZERO,  ZERO,  (0<<12)|0x1148/4, IMMED0,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,                 1,   NONE,    IMMED1,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+};
-+
-+uint32_t mmec597_compute_counter[] = {
-+// r0 = load();
-+// r1 = 1;
-+// r2 = 0;
-+// while (HW_LOOP_COUNT < r2) {
-+   MME_INSN(0,   ADD,   R0, LOAD0,  ZERO,                 0,   NONE,      NONE,
-+                 ADD,   R1, IMMED,  ZERO,                 1,   NONE,      NONE),
-+   MME_INSN(0,  LOOP, ZERO,    R0,  ZERO,            0x0003,   NONE,      NONE,
-+                 ADD,   R2,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+//    r3 = load();
-+//    [r1,r0] *= r3;
-+// }
-+   MME_INSN(0,   ADD,   R3, LOAD0,  ZERO,                 0,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,  MULU,   R1,    R1,    R3,                 0,   NONE,      NONE,
-+                MULH,   R2,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+// r3 = read(0x3410);
-+// r4 = read(0x3414);
-+// [r4,r3] += [r2,r1];
-+// mthd(0x3410, 1);
-+// send(r3);
-+// send(r4);
-+   MME_INSN(0, STATE, ZERO,  ZERO,  ZERO,          0x3410/4,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(1, STATE, ZERO,  ZERO,  ZERO,          0x3414/4,   NONE,      NONE,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD,   R3,    R3,    R1,  (1<<12)|0x3410/4, IMMED0,      ALU0,
-+                ADDC,   R4,    R4,    R2,                 0,   NONE,      ALU1),
-+};
-+
-+uint32_t mmec597_compute_counter_to_query[] = {
-+// r1 = load();
-+// r3 = read(0x3410);
-+// r2 = load();
-+// r4 = read(0x3414);
-+// [r2,r1] = [r2,r1] + [r4,r3];
-+// mthd(0x1b00, 1);
-+// r3 = load();
-+// send(r3);
-+// r4 = load();
-+// send(r4);
-+// send(r1);
-+// send(0x10000000);
-+   MME_INSN(0,   ADD,   R1, LOAD0,  ZERO,                 0,   NONE,      NONE,
-+               STATE,   R3, IMMED,  ZERO,          0x3410/4,   NONE,      NONE),
-+   MME_INSN(0,   ADD,   R2, LOAD0,  ZERO,                 0,   NONE,      NONE,
-+               STATE,   R4, IMMED,  ZERO,          0x3414/4,   NONE,      NONE),
-+   MME_INSN(0,   ADD,   R1,    R1,    R3,  (1<<12)|0x1b00/4, IMMED0,      NONE,
-+                ADDC,   R2,    R2,    R4,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD,   R3, LOAD0,  ZERO,                 0,   NONE,      ALU0,
-+                 ADD,   R4, LOAD1,  ZERO,                 0,   NONE,      ALU1),
-+   MME_INSN(0,   ADD, ZERO,    R1,  ZERO,                 0,   NONE,      ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,            0x1000,   NONE, IMMED32_0,
-+                 ADD, ZERO,  ZERO,  ZERO,            0x0000,   NONE,      NONE),
-+// [r3,r4] = [r3,r4] + 4;
-+// mthd(0x1b00, 1);
-+// send(r3);
-+// send(r4);
-+// send(r2);
-+// send(0x10000000);
-+   MME_INSN(0,   ADD, ZERO,    R4, IMMED,                 4, IMMED1,      ALU1,
-+                ADDC, ZERO,    R3,  ZERO,  (1<<12)|0x1b00/4,   NONE,      ALU0),
-+   MME_INSN(1,   ADD, ZERO,    R2,  ZERO,                 0,   NONE,      ALU0,
-+                 ADD, ZERO,  ZERO,  ZERO,                 0,   NONE,      NONE),
-+   MME_INSN(0,   ADD, ZERO,  ZERO,  ZERO,            0x1000,   NONE, IMMED32_0,
-+                 ADD, ZERO,  ZERO,  ZERO,            0x0000,   NONE,      NONE),
-+};
-diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h b/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
-index 221bab3105b..539bdc75022 100644
---- a/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
-+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
-@@ -157,6 +157,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- #define NVC0_3D_UNK0220__ESIZE					0x00000004
- #define NVC0_3D_UNK0220__LEN					0x00000028
- 
-+#define TU102_3D_INDEX_ARRAY_LIMIT_HIGH				0x00000238
-+
-+#define TU102_3D_INDEX_ARRAY_LIMIT_LOW				0x0000023c
-+
-+#define TU102_3D_SET_COLOR_RENDER_TO_ZETA_SURFACE		0x000002b8
-+
- #define NVC0_3D_UNK02C0					0x000002c0
- 
- #define NVC0_3D_UNK02C4					0x000002c4
-@@ -278,6 +284,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- #define NVC0_3D_UNK0400__ESIZE					0x00000004
- #define NVC0_3D_UNK0400__LEN					0x000000c0
- 
-+#define TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(i0)		       (0x00000600 + 0x8*(i0))
-+#define TU102_3D_VERTEX_ARRAY_LIMIT_LOW(i0)		       (0x00000604 + 0x8*(i0))
-+
- #define NVC0_3D_TFB_STREAM(i0)				       (0x00000700 + 0x10*(i0))
- #define NVC0_3D_TFB_STREAM__ESIZE				0x00000010
- #define NVC0_3D_TFB_STREAM__LEN				0x00000004
-@@ -1787,6 +1796,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- #define NVC0_3D_SP_UNK14__ESIZE				0x00000004
- #define NVC0_3D_SP_UNK14__LEN					0x00000004
- 
-+#define GV100_3D_SP_ADDRESS_HIGH(i0)			       (0x00002014 + 0x40*(i0))
-+#define GV100_3D_SP_ADDRESS_LOW(i0)			       (0x00002018 + 0x40*(i0))
-+
- #define NVC0_3D_TEX_LIMITS(i0)				       (0x00002200 + 0x10*(i0))
- #define NVC0_3D_TEX_LIMITS__ESIZE				0x00000010
- #define NVC0_3D_TEX_LIMITS__LEN				0x00000005
-diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
-index c897e4e8b97..69131fa22d3 100644
---- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
-+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
-@@ -37,6 +37,55 @@ nvc0_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz, bool is_3d)
-    return nv50_tex_choose_tile_dims_helper(nx, ny, nz, is_3d);
- }
- 
-+static uint32_t
-+tu102_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed)
-+{
-+   uint32_t kind;
-+
-+   if (unlikely(mt->base.base.bind & PIPE_BIND_CURSOR))
-+      return 0;
-+   if (unlikely(mt->base.base.flags & NOUVEAU_RESOURCE_FLAG_LINEAR))
-+      return 0;
-+
-+   switch (mt->base.base.format) {
-+   case PIPE_FORMAT_Z16_UNORM:
-+      if (compressed)
-+         kind = 0x0b; // NV_MMU_PTE_KIND_Z16_COMPRESSIBLE_DISABLE_PLC
-+      else
-+         kind = 0x01; // NV_MMU_PTE_KIND_Z16
-+      break;
-+   case PIPE_FORMAT_X8Z24_UNORM:
-+   case PIPE_FORMAT_S8X24_UINT:
-+   case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-+      if (compressed)
-+         kind = 0x0e; // NV_MMU_PTE_KIND_Z24S8_COMPRESSIBLE_DISABLE_PLC
-+      else
-+         kind = 0x05; // NV_MMU_PTE_KIND_Z24S8
-+      break;
-+   case PIPE_FORMAT_X24S8_UINT:
-+   case PIPE_FORMAT_Z24X8_UNORM:
-+   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-+      if (compressed)
-+         kind = 0x0c; // NV_MMU_PTE_KIND_S8Z24_COMPRESSIBLE_DISABLE_PLC
-+      else
-+         kind = 0x03; // NV_MMU_PTE_KIND_S8Z24
-+      break;
-+   case PIPE_FORMAT_X32_S8X24_UINT:
-+   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-+      if (compressed)
-+         kind = 0x0d; // NV_MMU_PTE_KIND_ZF32_X24S8_COMPRESSIBLE_DISABLE_PLC
-+      else
-+         kind = 0x04; // NV_MMU_PTE_KIND_ZF32_X24S8
-+      break;
-+   case PIPE_FORMAT_Z32_FLOAT:
-+   default:
-+      kind = 0x06;
-+      break;
-+   }
-+
-+   return kind;
-+}
-+
- static uint32_t
- nvc0_mt_choose_storage_type(struct nv50_miptree *mt, bool compressed)
- {
-@@ -357,7 +406,10 @@ nvc0_miptree_create(struct pipe_screen *pscreen,
-    if (pt->bind & PIPE_BIND_LINEAR)
-       pt->flags |= NOUVEAU_RESOURCE_FLAG_LINEAR;
- 
--   bo_config.nvc0.memtype = nvc0_mt_choose_storage_type(mt, compressed);
-+   if (dev->chipset < 0x160)
-+      bo_config.nvc0.memtype = nvc0_mt_choose_storage_type(mt, compressed);
-+   else
-+      bo_config.nvc0.memtype = tu102_mt_choose_storage_type(mt, compressed);
- 
-    if (!nvc0_miptree_init_ms_mode(mt)) {
-       FREE(mt);
-diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
-index 32aa82d168c..d2b2de47c8d 100644
---- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
-+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
-@@ -645,7 +645,10 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
-    prog->code_size = info->bin.codeSize;
-    prog->relocs = info->bin.relocData;
-    prog->fixups = info->bin.fixupData;
--   prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1));
-+   if (info->target >= NVISA_GV100_CHIPSET)
-+      prog->num_gprs = MIN2(info->bin.maxGPR + 5, 256); //XXX: why?
-+   else
-+      prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1));
-    prog->cp.smem_size = info->bin.smemSize;
-    prog->num_barriers = info->numBarriers;
- 
-@@ -734,7 +737,14 @@ nvc0_program_alloc_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
-    struct nvc0_screen *screen = nvc0->screen;
-    const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
-    int ret;
--   uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
-+   uint32_t size = prog->code_size;
-+
-+   if (!is_cp) {
-+      if (screen->eng3d->oclass < TU102_3D_CLASS)
-+         size += GF100_SHADER_HEADER_SIZE;
-+      else
-+         size += TU102_SHADER_HEADER_SIZE;
-+   }
- 
-    /* On Fermi, SP_START_ID must be aligned to 0x40.
-     * On Kepler, the first instruction must be aligned to 0x80 because
-@@ -750,7 +760,8 @@ nvc0_program_alloc_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
-    prog->code_base = prog->mem->start;
- 
-    if (!is_cp) {
--      if (screen->base.class_3d >= NVE4_3D_CLASS) {
-+      if (screen->base.class_3d >= NVE4_3D_CLASS &&
-+          screen->base.class_3d < TU102_3D_CLASS) {
-          switch (prog->mem->start & 0xff) {
-          case 0x40: prog->code_base += 0x70; break;
-          case 0x80: prog->code_base += 0x30; break;
-@@ -777,7 +788,16 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
- {
-    struct nvc0_screen *screen = nvc0->screen;
-    const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
--   uint32_t code_pos = prog->code_base + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
-+   uint32_t code_pos = prog->code_base;
-+   uint32_t size_sph = 0;
-+
-+   if (!is_cp) {
-+      if (screen->eng3d->oclass < TU102_3D_CLASS)
-+         size_sph = GF100_SHADER_HEADER_SIZE;
-+      else
-+         size_sph = TU102_SHADER_HEADER_SIZE;
-+   }
-+   code_pos += size_sph;
- 
-    if (prog->relocs)
-       nv50_ir_relocate_code(prog->relocs, prog->code, code_pos,
-@@ -803,8 +823,7 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
- 
-    if (!is_cp)
-       nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base,
--                           NV_VRAM_DOMAIN(&screen->base),
--                           NVC0_SHADER_HEADER_SIZE, prog->hdr);
-+                           NV_VRAM_DOMAIN(&screen->base), size_sph, prog->hdr);
- 
-    nvc0->base.push_data(&nvc0->base, screen->text, code_pos,
-                         NV_VRAM_DOMAIN(&screen->base), prog->code_size,
-@@ -817,7 +836,14 @@ nvc0_program_upload(struct nvc0_context *nvc0, struct nvc0_program *prog)
-    struct nvc0_screen *screen = nvc0->screen;
-    const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
-    int ret;
--   uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
-+   uint32_t size = prog->code_size;
-+
-+   if (!is_cp) {
-+      if (screen->eng3d->oclass < TU102_3D_CLASS)
-+         size += GF100_SHADER_HEADER_SIZE;
-+      else
-+         size += TU102_SHADER_HEADER_SIZE;
-+   }
- 
-    ret = nvc0_program_alloc_code(nvc0, prog);
-    if (ret) {
-@@ -874,8 +900,7 @@ nvc0_program_upload(struct nvc0_context *nvc0, struct nvc0_program *prog)
-             BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(FLUSH), 1);
-             PUSH_DATA (nvc0->base.pushbuf, NVC0_COMPUTE_FLUSH_CODE);
-          } else {
--            BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(SP_START_ID(i)), 1);
--            PUSH_DATA (nvc0->base.pushbuf, progs[i]->code_base);
-+            nvc0_program_sp_start_id(nvc0, i, progs[i]);
-          }
-       }
-    }
-@@ -953,7 +978,7 @@ nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
-    unsigned base = 0;
-    unsigned i;
-    if (prog->type != PIPE_SHADER_COMPUTE)
--      base = NVC0_SHADER_HEADER_SIZE;
-+      base = GF100_SHADER_HEADER_SIZE;
-    for (i = 0; i < prog->cp.num_syms; ++i)
-       if (syms[i].label == label)
-          return prog->code_base + base + syms[i].offset;
-diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
-index 5684207aa54..2c465b342e9 100644
---- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
-+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
-@@ -15,7 +15,9 @@ struct nvc0_transform_feedback_state {
- };
- 
- 
--#define NVC0_SHADER_HEADER_SIZE (20 * 4)
-+#define GF100_SHADER_HEADER_SIZE (20 * 4)
-+#define TU102_SHADER_HEADER_SIZE (32 * 4)
-+#define NVC0_MAX_SHADER_HEADER_SIZE TU102_SHADER_HEADER_SIZE
- 
- struct nvc0_program {
-    struct pipe_shader_state pipe;
-@@ -30,7 +32,7 @@ struct nvc0_program {
-    unsigned code_size;
-    unsigned parm_size; /* size of non-bindable uniforms (c0[]) */
- 
--   uint32_t hdr[20];
-+   uint32_t hdr[NVC0_MAX_SHADER_HEADER_SIZE/4];
-    uint32_t flags[2];
- 
-    struct {
-@@ -72,4 +74,6 @@ struct nvc0_program {
-    struct nouveau_heap *mem;
- };
- 
-+void
-+nvc0_program_sp_start_id(struct nvc0_context *, int, struct nvc0_program *);
- #endif
-diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
-index 7abbf762af2..07d74ddd50c 100644
---- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
-+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
-@@ -27,15 +27,17 @@
- #include "util/format/u_format_s3tc.h"
- #include "util/u_screen.h"
- #include "pipe/p_screen.h"
--#include "compiler/nir/nir.h"
- 
- #include "nouveau_vp3_video.h"
- 
-+#include "codegen/nv50_ir_driver.h"
-+
- #include "nvc0/nvc0_context.h"
- #include "nvc0/nvc0_screen.h"
- 
- #include "nvc0/mme/com9097.mme.h"
- #include "nvc0/mme/com90c0.mme.h"
-+#include "nvc0/mme/comc597.mme.h"
- 
- #include "nv50/g80_texture.xml.h"
- 
-@@ -443,8 +445,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen,
-    case PIPE_SHADER_CAP_PREFERRED_IR:
-       return screen->prefer_nir ? PIPE_SHADER_IR_NIR : PIPE_SHADER_IR_TGSI;
-    case PIPE_SHADER_CAP_SUPPORTED_IRS: {
--      uint32_t irs = 1 << PIPE_SHADER_IR_TGSI |
--                     1 << PIPE_SHADER_IR_NIR;
-+      uint32_t irs = 1 << PIPE_SHADER_IR_NIR |
-+         ((class_3d >= GV100_3D_CLASS) ? 0 : 1 << PIPE_SHADER_IR_TGSI);
-       if (screen->force_enable_cl)
-          irs |= 1 << PIPE_SHADER_IR_NIR_SERIALIZED;
-       return irs;
-@@ -467,6 +469,14 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen,
-    case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
-       return shader != PIPE_SHADER_FRAGMENT;
-    case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
-+      /* HW doesn't support indirect addressing of fragment program inputs
-+       * on Volta.  The binary driver generates a function to handle every
-+       * possible indirection, and indirectly calls the function to handle
-+       * this instead.
-+       */
-+      if (class_3d >= GV100_3D_CLASS)
-+         return shader != PIPE_SHADER_FRAGMENT;
-+      return 1;
-    case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
-    case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
-       return 1;
-@@ -717,6 +727,26 @@ nvc0_graph_set_macro(struct nvc0_screen *screen, uint32_t m, unsigned pos,
-    return pos + size;
- }
- 
-+static int
-+tu102_graph_set_macro(struct nvc0_screen *screen, uint32_t m, unsigned pos,
-+                     unsigned size, const uint32_t *data)
-+{
-+   struct nouveau_pushbuf *push = screen->base.pushbuf;
-+
-+   size /= 4;
-+
-+   assert((pos + size) <= 0x800);
-+
-+   BEGIN_NVC0(push, SUBC_3D(NVC0_GRAPH_MACRO_ID), 2);
-+   PUSH_DATA (push, (m - 0x3800) / 8);
-+   PUSH_DATA (push, pos);
-+   BEGIN_1IC0(push, SUBC_3D(NVC0_GRAPH_MACRO_UPLOAD_POS), size + 1);
-+   PUSH_DATA (push, pos);
-+   PUSH_DATAp(push, data, size);
-+
-+   return pos + (size / 3);
-+}
-+
- static void
- nvc0_magic_3d_init(struct nouveau_pushbuf *push, uint16_t obj_class)
- {
-@@ -728,8 +758,10 @@ nvc0_magic_3d_init(struct nouveau_pushbuf *push, uint16_t obj_class)
-    BEGIN_NVC0(push, SUBC_3D(0x10ec), 2);
-    PUSH_DATA (push, 0xff);
-    PUSH_DATA (push, 0xff);
--   BEGIN_NVC0(push, SUBC_3D(0x074c), 1);
--   PUSH_DATA (push, 0x3f);
-+   if (obj_class < GV100_3D_CLASS) {
-+      BEGIN_NVC0(push, SUBC_3D(0x074c), 1);
-+      PUSH_DATA (push, 0x3f);
-+   }
- 
-    BEGIN_NVC0(push, SUBC_3D(0x16a8), 1);
-    PUSH_DATA (push, (3 << 16) | 3);
-@@ -761,8 +793,10 @@ nvc0_magic_3d_init(struct nouveau_pushbuf *push, uint16_t obj_class)
-    BEGIN_NVC0(push, SUBC_3D(0x0300), 1);
-    PUSH_DATA (push, 3);
- 
--   BEGIN_NVC0(push, SUBC_3D(0x02d0), 1);
--   PUSH_DATA (push, 0x3fffff);
-+   if (obj_class < GV100_3D_CLASS) {
-+      BEGIN_NVC0(push, SUBC_3D(0x02d0), 1);
-+      PUSH_DATA (push, 0x3fffff);
-+   }
-    BEGIN_NVC0(push, SUBC_3D(0x0fdc), 1);
-    PUSH_DATA (push, 1);
-    BEGIN_NVC0(push, SUBC_3D(0x19c0), 1);
-@@ -822,6 +856,8 @@ nvc0_screen_init_compute(struct nvc0_screen *screen)
-    case 0x110:
-    case 0x120:
-    case 0x130:
-+   case 0x140:
-+   case 0x160:
-       return nve4_screen_compute_setup(screen, screen->base.pushbuf);
-    default:
-       return -1;
-@@ -893,13 +929,15 @@ nvc0_screen_resize_text_area(struct nvc0_screen *screen, uint64_t size)
-    nouveau_heap_init(&screen->text_heap, 0, size - 0x100);
- 
-    /* update the code segment setup */
--   BEGIN_NVC0(push, NVC0_3D(CODE_ADDRESS_HIGH), 2);
--   PUSH_DATAh(push, screen->text->offset);
--   PUSH_DATA (push, screen->text->offset);
--   if (screen->compute) {
--      BEGIN_NVC0(push, NVC0_CP(CODE_ADDRESS_HIGH), 2);
-+   if (screen->eng3d->oclass < GV100_3D_CLASS) {
-+      BEGIN_NVC0(push, NVC0_3D(CODE_ADDRESS_HIGH), 2);
-       PUSH_DATAh(push, screen->text->offset);
-       PUSH_DATA (push, screen->text->offset);
-+      if (screen->compute) {
-+         BEGIN_NVC0(push, NVC0_CP(CODE_ADDRESS_HIGH), 2);
-+         PUSH_DATAh(push, screen->text->offset);
-+         PUSH_DATA (push, screen->text->offset);
-+      }
-    }
- 
-    return 0;
-@@ -939,74 +977,14 @@ nvc0_screen_bind_cb_3d(struct nvc0_screen *screen, bool *can_serialize,
-    IMMED_NVC0(push, NVC0_3D(CB_BIND(stage)), (index << 4) | (size >= 0));
- }
- 
--static const nir_shader_compiler_options nir_options = {
--   .lower_fdiv = false,
--   .lower_ffma = false,
--   .fuse_ffma = false, /* nir doesn't track mad vs fma */
--   .lower_flrp32 = true,
--   .lower_flrp64 = true,
--   .lower_fpow = false,
--   .lower_fsat = false,
--   .lower_fsqrt = false, // TODO: only before gm200
--   .lower_fmod = true,
--   .lower_bitfield_extract = false,
--   .lower_bitfield_extract_to_shifts = false,
--   .lower_bitfield_insert = false,
--   .lower_bitfield_insert_to_shifts = false,
--   .lower_bitfield_reverse = false,
--   .lower_bit_count = false,
--   .lower_ifind_msb = false,
--   .lower_find_lsb = false,
--   .lower_uadd_carry = true, // TODO
--   .lower_usub_borrow = true, // TODO
--   .lower_mul_high = false,
--   .lower_negate = false,
--   .lower_sub = true,
--   .lower_scmp = true, // TODO: not implemented yet
--   .lower_idiv = true,
--   .lower_isign = false, // TODO
--   .fdot_replicates = false, // TODO
--   .lower_ffloor = false, // TODO
--   .lower_ffract = true,
--   .lower_fceil = false, // TODO
--   .lower_ldexp = true,
--   .lower_pack_half_2x16 = true,
--   .lower_pack_unorm_2x16 = true,
--   .lower_pack_snorm_2x16 = true,
--   .lower_pack_unorm_4x8 = true,
--   .lower_pack_snorm_4x8 = true,
--   .lower_unpack_half_2x16 = true,
--   .lower_unpack_unorm_2x16 = true,
--   .lower_unpack_snorm_2x16 = true,
--   .lower_unpack_unorm_4x8 = true,
--   .lower_unpack_snorm_4x8 = true,
--   .lower_extract_byte = true,
--   .lower_extract_word = true,
--   .lower_all_io_to_temps = false,
--   .vertex_id_zero_based = false,
--   .lower_base_vertex = false,
--   .lower_helper_invocation = false,
--   .lower_cs_local_index_from_id = true,
--   .lower_cs_local_id_from_index = false,
--   .lower_device_index_to_zero = false, // TODO
--   .lower_wpos_pntc = false, // TODO
--   .lower_hadd = true, // TODO
--   .lower_add_sat = true, // TODO
--   .use_interpolated_input_intrinsics = true,
--   .lower_mul_2x32_64 = true, // TODO
--   .max_unroll_iterations = 32,
--   .lower_int64_options = nir_lower_ufind_msb64|nir_lower_divmod64, // TODO
--   .lower_doubles_options = nir_lower_dmod, // TODO
--   .lower_to_scalar = true,
--};
--
- static const void *
- nvc0_screen_get_compiler_options(struct pipe_screen *pscreen,
-                                  enum pipe_shader_ir ir,
-                                  enum pipe_shader_type shader)
- {
-+   struct nvc0_screen *screen = nvc0_screen(pscreen);
-    if (ir == PIPE_SHADER_IR_NIR)
--      return &nir_options;
-+      return nv50_ir_nir_shader_compiler_options(screen->base.device->chipset);
-    return NULL;
- }
- 
-@@ -1038,6 +1016,8 @@ nvc0_screen_create(struct nouveau_device *dev)
-    case 0x110:
-    case 0x120:
-    case 0x130:
-+   case 0x140:
-+   case 0x160:
-       break;
-    default:
-       return NULL;
-@@ -1104,16 +1084,19 @@ nvc0_screen_create(struct nouveau_device *dev)
-    screen->base.fence.emit = nvc0_screen_fence_emit;
-    screen->base.fence.update = nvc0_screen_fence_update;
- 
-+   if (dev->chipset < 0x140) {
-+      ret = nouveau_object_new(chan, (dev->chipset < 0xe0) ? 0x1f906e : 0x906e,
-+                               NVIF_CLASS_SW_GF100, NULL, 0, &screen->nvsw);
-+      if (ret)
-+         FAIL_SCREEN_INIT("Error creating SW object: %d\n", ret);
- 
--   ret = nouveau_object_new(chan, (dev->chipset < 0xe0) ? 0x1f906e : 0x906e,
--                            NVIF_CLASS_SW_GF100, NULL, 0, &screen->nvsw);
--   if (ret)
--      FAIL_SCREEN_INIT("Error creating SW object: %d\n", ret);
--
--   BEGIN_NVC0(push, SUBC_SW(NV01_SUBCHAN_OBJECT), 1);
--   PUSH_DATA (push, screen->nvsw->handle);
-+      BEGIN_NVC0(push, SUBC_SW(NV01_SUBCHAN_OBJECT), 1);
-+      PUSH_DATA (push, screen->nvsw->handle);
-+   }
- 
-    switch (dev->chipset & ~0xf) {
-+   case 0x160:
-+   case 0x140:
-    case 0x130:
-    case 0x120:
-    case 0x110:
-@@ -1167,6 +1150,12 @@ nvc0_screen_create(struct nouveau_device *dev)
-    PUSH_DATA (push, screen->fence.bo->offset + 16);
- 
-    switch (dev->chipset & ~0xf) {
-+   case 0x160:
-+      obj_class = TU102_3D_CLASS;
-+      break;
-+   case 0x140:
-+      obj_class = GV100_3D_CLASS;
-+      break;
-    case 0x130:
-       switch (dev->chipset) {
-       case 0x130:
-@@ -1414,25 +1403,47 @@ nvc0_screen_create(struct nouveau_device *dev)
-       PUSH_DATA (push, 16384 << 16);
-    }
- 
-+   if (screen->eng3d->oclass < TU102_3D_CLASS) {
- #define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n);
- 
--   i = 0;
--   MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_PER_INSTANCE, mme9097_per_instance_bf);
--   MK_MACRO(NVC0_3D_MACRO_BLEND_ENABLES, mme9097_blend_enables);
--   MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_SELECT, mme9097_vertex_array_select);
--   MK_MACRO(NVC0_3D_MACRO_TEP_SELECT, mme9097_tep_select);
--   MK_MACRO(NVC0_3D_MACRO_GP_SELECT, mme9097_gp_select);
--   MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_FRONT, mme9097_poly_mode_front);
--   MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, mme9097_poly_mode_back);
--   MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT, mme9097_draw_arrays_indirect);
--   MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mme9097_draw_elts_indirect);
--   MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count);
--   MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count);
--   MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mme9097_query_buffer_write);
--   MK_MACRO(NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE, mme9097_conservative_raster_state);
--   MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER, mme9097_compute_counter);
--   MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY, mme9097_compute_counter_to_query);
--   MK_MACRO(NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT, mme90c0_launch_grid_indirect);
-+      i = 0;
-+      MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_PER_INSTANCE, mme9097_per_instance_bf);
-+      MK_MACRO(NVC0_3D_MACRO_BLEND_ENABLES, mme9097_blend_enables);
-+      MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_SELECT, mme9097_vertex_array_select);
-+      MK_MACRO(NVC0_3D_MACRO_TEP_SELECT, mme9097_tep_select);
-+      MK_MACRO(NVC0_3D_MACRO_GP_SELECT, mme9097_gp_select);
-+      MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_FRONT, mme9097_poly_mode_front);
-+      MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, mme9097_poly_mode_back);
-+      MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT, mme9097_draw_arrays_indirect);
-+      MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mme9097_draw_elts_indirect);
-+      MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count);
-+      MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count);
-+      MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mme9097_query_buffer_write);
-+      MK_MACRO(NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE, mme9097_conservative_raster_state);
-+      MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER, mme9097_compute_counter);
-+      MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY, mme9097_compute_counter_to_query);
-+      MK_MACRO(NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT, mme90c0_launch_grid_indirect);
-+   } else {
-+#undef MK_MACRO
-+#define MK_MACRO(m, n) i = tu102_graph_set_macro(screen, m, i, sizeof(n), n);
-+
-+      i = 0;
-+      MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_PER_INSTANCE, mmec597_per_instance_bf);
-+      MK_MACRO(NVC0_3D_MACRO_BLEND_ENABLES, mmec597_blend_enables);
-+      MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_SELECT, mmec597_vertex_array_select);
-+      MK_MACRO(NVC0_3D_MACRO_TEP_SELECT, mmec597_tep_select);
-+      MK_MACRO(NVC0_3D_MACRO_GP_SELECT, mmec597_gp_select);
-+      MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_FRONT, mmec597_poly_mode_front);
-+      MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, mmec597_poly_mode_back);
-+      MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT, mmec597_draw_arrays_indirect);
-+      MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT, mmec597_draw_elts_indirect);
-+      MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mmec597_draw_arrays_indirect_count);
-+      MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mmec597_draw_elts_indirect_count);
-+      MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mmec597_query_buffer_write);
-+      MK_MACRO(NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE, mmec597_conservative_raster_state);
-+      MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER, mmec597_compute_counter);
-+      MK_MACRO(NVC0_3D_MACRO_COMPUTE_COUNTER_TO_QUERY, mmec597_compute_counter_to_query);
-+   }
- 
-    BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1);
-    PUSH_DATA (push, 1);
-diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
-index b7e0c8a930f..490026b2c00 100644
---- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
-+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
-@@ -64,6 +64,22 @@ nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
-    return true; /* stream output info only */
- }
- 
-+void
-+nvc0_program_sp_start_id(struct nvc0_context *nvc0, int stage,
-+                         struct nvc0_program *prog)
-+{
-+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-+
-+   if (nvc0->screen->eng3d->oclass < GV100_3D_CLASS) {
-+      BEGIN_NVC0(push, NVC0_3D(SP_START_ID(stage)), 1);
-+      PUSH_DATA (push, prog->code_base);
-+   } else {
-+      BEGIN_NVC0(push, SUBC_3D(GV100_3D_SP_ADDRESS_HIGH(stage)), 2);
-+      PUSH_DATAh(push, nvc0->screen->text->offset + prog->code_base);
-+      PUSH_DATA (push, nvc0->screen->text->offset + prog->code_base);
-+   }
-+}
-+
- void
- nvc0_vertprog_validate(struct nvc0_context *nvc0)
- {
-@@ -74,9 +90,9 @@ nvc0_vertprog_validate(struct nvc0_context *nvc0)
-          return;
-    nvc0_program_update_context_state(nvc0, vp, 0);
- 
--   BEGIN_NVC0(push, NVC0_3D(SP_SELECT(1)), 2);
-+   BEGIN_NVC0(push, NVC0_3D(SP_SELECT(1)), 1);
-    PUSH_DATA (push, 0x11);
--   PUSH_DATA (push, vp->code_base);
-+   nvc0_program_sp_start_id(nvc0, 1, vp);
-    BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(1)), 1);
-    PUSH_DATA (push, vp->num_gprs);
- 
-@@ -152,9 +168,9 @@ nvc0_fragprog_validate(struct nvc0_context *nvc0)
-                  fp->fp.post_depth_coverage);
-    }
- 
--   BEGIN_NVC0(push, NVC0_3D(SP_SELECT(5)), 2);
-+   BEGIN_NVC0(push, NVC0_3D(SP_SELECT(5)), 1);
-    PUSH_DATA (push, 0x51);
--   PUSH_DATA (push, fp->code_base);
-+   nvc0_program_sp_start_id(nvc0, 5, fp);
-    BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(5)), 1);
-    PUSH_DATA (push, fp->num_gprs);
- 
-@@ -176,9 +192,9 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0)
-          BEGIN_NVC0(push, NVC0_3D(TESS_MODE), 1);
-          PUSH_DATA (push, tp->tp.tess_mode);
-       }
--      BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 2);
-+      BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1);
-       PUSH_DATA (push, 0x21);
--      PUSH_DATA (push, tp->code_base);
-+      nvc0_program_sp_start_id(nvc0, 2, tp);
-       BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1);
-       PUSH_DATA (push, tp->num_gprs);
-    } else {
-@@ -186,9 +202,9 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0)
-       /* not a whole lot we can do to handle this failure */
-       if (!nvc0_program_validate(nvc0, tp))
-          assert(!"unable to validate empty tcp");
--      BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 2);
-+      BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1);
-       PUSH_DATA (push, 0x20);
--      PUSH_DATA (push, tp->code_base);
-+      nvc0_program_sp_start_id(nvc0, 2, tp);
-    }
-    nvc0_program_update_context_state(nvc0, tp, 1);
- }
-@@ -206,8 +222,7 @@ nvc0_tevlprog_validate(struct nvc0_context *nvc0)
-       }
-       BEGIN_NVC0(push, NVC0_3D(MACRO_TEP_SELECT), 1);
-       PUSH_DATA (push, 0x31);
--      BEGIN_NVC0(push, NVC0_3D(SP_START_ID(3)), 1);
--      PUSH_DATA (push, tp->code_base);
-+      nvc0_program_sp_start_id(nvc0, 3, tp);
-       BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(3)), 1);
-       PUSH_DATA (push, tp->num_gprs);
-    } else {
-@@ -227,8 +242,7 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0)
-    if (gp && nvc0_program_validate(nvc0, gp) && gp->code_size) {
-       BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1);
-       PUSH_DATA (push, 0x41);
--      BEGIN_NVC0(push, NVC0_3D(SP_START_ID(4)), 1);
--      PUSH_DATA (push, gp->code_base);
-+      nvc0_program_sp_start_id(nvc0, 4, gp);
-       BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(4)), 1);
-       PUSH_DATA (push, gp->num_gprs);
-    } else {
-diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
-index 538effdb531..731b0b5dbf8 100644
---- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
-+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
-@@ -29,6 +29,8 @@
- #include "util/format/u_format.h"
- #include "util/u_surface.h"
- 
-+#include "tgsi/tgsi_ureg.h"
-+
- #include "os/os_thread.h"
- 
- #include "nvc0/nvc0_context.h"
-@@ -138,6 +140,11 @@ nvc0_2d_texture_set(struct nouveau_pushbuf *push, bool dst,
-       PUSH_DATA (push, bo->offset + offset);
-    }
- 
-+   if (dst) {
-+      IMMED_NVC0(push, SUBC_2D(NVC0_2D_SET_DST_COLOR_RENDER_TO_ZETA_SURFACE),
-+                 util_format_is_depth_or_stencil(pformat));
-+   }
-+
- #if 0
-    if (dst) {
-       BEGIN_NVC0(push, SUBC_2D(NVC0_2D_CLIP_X), 4);
-@@ -772,7 +779,7 @@ gm200_evaluate_depth_buffer(struct pipe_context *pipe)
- struct nvc0_blitter
- {
-    struct nvc0_program *fp[NV50_BLIT_MAX_TEXTURE_TYPES][NV50_BLIT_MODES];
--   struct nvc0_program vp;
-+   struct nvc0_program *vp;
- 
-    struct nv50_tsc_entry sampler[2]; /* nearest, bilinear */
- 
-@@ -785,6 +792,7 @@ struct nvc0_blitctx
- {
-    struct nvc0_context *nvc0;
-    struct nvc0_program *fp;
-+   struct nvc0_program *vp;
-    uint8_t mode;
-    uint16_t color_mask;
-    uint8_t filter;
-@@ -809,78 +817,27 @@ struct nvc0_blitctx
-    struct nvc0_rasterizer_stateobj rast;
- };
- 
--static void
--nvc0_blitter_make_vp(struct nvc0_blitter *blit)
-+static void *
-+nvc0_blitter_make_vp(struct pipe_context *pipe)
- {
--   static const uint32_t code_nvc0[] =
--   {
--      0xfff11c26, 0x06000080, /* vfetch b64 $r4:$r5 a[0x80] */
--      0xfff01c46, 0x06000090, /* vfetch b96 $r0:$r1:$r2 a[0x90] */
--      0x13f01c26, 0x0a7e0070, /* export b64 o[0x70] $r4:$r5 */
--      0x03f01c46, 0x0a7e0080, /* export b96 o[0x80] $r0:$r1:$r2 */
--      0x00001de7, 0x80000000, /* exit */
--   };
--   static const uint32_t code_nve4[] =
--   {
--      0x00000007, 0x20000000, /* sched */
--      0xfff11c26, 0x06000080, /* vfetch b64 $r4:$r5 a[0x80] */
--      0xfff01c46, 0x06000090, /* vfetch b96 $r0:$r1:$r2 a[0x90] */
--      0x13f01c26, 0x0a7e0070, /* export b64 o[0x70] $r4:$r5 */
--      0x03f01c46, 0x0a7e0080, /* export b96 o[0x80] $r0:$r1:$r2 */
--      0x00001de7, 0x80000000, /* exit */
--   };
--   static const uint32_t code_gk110[] =
--   {
--      0x00000000, 0x08000000, /* sched */
--      0x401ffc12, 0x7ec7fc00, /* ld b64 $r4d a[0x80] 0x0 0x0 */
--      0x481ffc02, 0x7ecbfc00, /* ld b96 $r0t a[0x90] 0x0 0x0 */
--      0x381ffc12, 0x7f07fc00, /* st b64 a[0x70] $r4d 0x0 0x0 */
--      0x401ffc02, 0x7f0bfc00, /* st b96 a[0x80] $r0t 0x0 0x0 */
--      0x001c003c, 0x18000000, /* exit */
--   };
--   static const uint32_t code_gm107[] =
--   {
--      0xe4200701, 0x001d0400, /* sched (st 0x1 wr 0x0) (st 0x1 wr 0x1) (st 0x1 wr 0x2) */
--      0x0807ff00, 0xefd87f80, /* ld b32 $r0 a[0x80] 0x0 */
--      0x0847ff01, 0xefd87f80, /* ld b32 $r1 a[0x84] 0x0 */
--      0x0907ff02, 0xefd87f80, /* ld b32 $r2 a[0x90] 0x0 */
--      0xf0200761, 0x003f8400, /* sched (st 0x1 wr 0x3) (st 0x1 wr 0x4) (st 0x1 wt 0x1) */
--      0x0947ff03, 0xefd87f80, /* ld b32 $r3 a[0x94] 0x0 */
--      0x0987ff04, 0xefd87f80, /* ld b32 $r4 a[0x98] 0x0 */
--      0x0707ff00, 0xeff07f80, /* st b32 a[0x70] $r0 0x0 */
--      0xfc2017e1, 0x011f8404, /* sched (st 0x1 wt 0x2) (st 0x1 wt 0x4) (st 0x1 wt 0x8) */
--      0x0747ff01, 0xeff07f80, /* st b32 a[0x74] $r1 0x0 */
--      0x0807ff02, 0xeff07f80, /* st b32 a[0x80] $r2 0x0 */
--      0x0847ff03, 0xeff07f80, /* st b32 a[0x84] $r3 0x0 */
--      0xfde087e1, 0x001f8000, /* sched (st 0x1 wt 0x10) (st 0xf) (st 0x0) */
--      0x0887ff04, 0xeff07f80, /* st b32 a[0x88] $r4 0x0 */
--      0x0007000f, 0xe3000000, /* exit */
--   };
--
--   blit->vp.type = PIPE_SHADER_VERTEX;
--   blit->vp.translated = true;
--   if (blit->screen->base.class_3d >= GM107_3D_CLASS) {
--      blit->vp.code = (uint32_t *)code_gm107; /* const_cast */
--      blit->vp.code_size = sizeof(code_gm107);
--   } else
--   if (blit->screen->base.class_3d >= NVF0_3D_CLASS) {
--      blit->vp.code = (uint32_t *)code_gk110; /* const_cast */
--      blit->vp.code_size = sizeof(code_gk110);
--   } else
--   if (blit->screen->base.class_3d >= NVE4_3D_CLASS) {
--      blit->vp.code = (uint32_t *)code_nve4; /* const_cast */
--      blit->vp.code_size = sizeof(code_nve4);
--   } else {
--      blit->vp.code = (uint32_t *)code_nvc0; /* const_cast */
--      blit->vp.code_size = sizeof(code_nvc0);
--   }
--   blit->vp.num_gprs = 6;
--   blit->vp.vp.edgeflag = PIPE_MAX_ATTRIBS;
-+   struct ureg_program *ureg;
-+   struct ureg_src ipos, itex;
-+   struct ureg_dst opos, otex;
-+
-+   ureg = ureg_create(PIPE_SHADER_VERTEX);
-+   if (!ureg)
-+      return NULL;
-+
-+   opos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0);
-+   ipos = ureg_DECL_vs_input(ureg, 0);
-+   otex = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0);
-+   itex = ureg_DECL_vs_input(ureg, 1);
-+
-+   ureg_MOV(ureg, ureg_writemask(opos, TGSI_WRITEMASK_XY ), ipos);
-+   ureg_MOV(ureg, ureg_writemask(otex, TGSI_WRITEMASK_XYZ), itex);
-+   ureg_END(ureg);
- 
--   blit->vp.hdr[0]  = 0x00020461; /* vertprog magic */
--   blit->vp.hdr[4]  = 0x000ff000; /* no outputs read */
--   blit->vp.hdr[6]  = 0x00000073; /* a[0x80].xy, a[0x90].xyz */
--   blit->vp.hdr[13] = 0x00073000; /* o[0x70].xy, o[0x80].xyz */
-+   return ureg_create_shader_and_destroy(ureg, pipe);
- }
- 
- static void
-@@ -910,6 +867,20 @@ nvc0_blitter_make_sampler(struct nvc0_blitter *blit)
-       G80_TSC_1_MIP_FILTER_NONE;
- }
- 
-+static void
-+nvc0_blit_select_vp(struct nvc0_blitctx *ctx)
-+{
-+   struct nvc0_blitter *blitter = ctx->nvc0->screen->blitter;
-+
-+   if (!blitter->vp) {
-+      mtx_lock(&blitter->mutex);
-+      if (!blitter->vp)
-+         blitter->vp = nvc0_blitter_make_vp(&ctx->nvc0->base.pipe);
-+      mtx_unlock(&blitter->mutex);
-+   }
-+   ctx->vp = blitter->vp;
-+}
-+
- static void
- nvc0_blit_select_fp(struct nvc0_blitctx *ctx, const struct pipe_blit_info *info)
- {
-@@ -1082,7 +1053,7 @@ nvc0_blitctx_pre_blit(struct nvc0_blitctx *ctx,
- 
-    nvc0->rast = &ctx->rast;
- 
--   nvc0->vertprog = &blitter->vp;
-+   nvc0->vertprog = ctx->vp;
-    nvc0->tctlprog = NULL;
-    nvc0->tevlprog = NULL;
-    nvc0->gmtyprog = NULL;
-@@ -1221,6 +1192,7 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
-    blit->filter = nv50_blit_get_filter(info);
-    blit->render_condition_enable = info->render_condition_enable;
- 
-+   nvc0_blit_select_vp(blit);
-    nvc0_blit_select_fp(blit, info);
-    nvc0_blitctx_pre_blit(blit, info);
- 
-@@ -1266,6 +1238,11 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
-       }
-    }
- 
-+   if (screen->eng3d->oclass >= TU102_3D_CLASS) {
-+      IMMED_NVC0(push, SUBC_3D(TU102_3D_SET_COLOR_RENDER_TO_ZETA_SURFACE),
-+                 util_format_is_depth_or_stencil(info->dst.format));
-+   }
-+
-    IMMED_NVC0(push, NVC0_3D(VIEWPORT_TRANSFORM_EN), 0);
-    IMMED_NVC0(push, NVC0_3D(VIEW_VOLUME_CLIP_CTRL), 0x2 |
-               NVC0_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_RANGE_0_1);
-@@ -1326,7 +1303,10 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
-    PUSH_DATAh(push, vtxbuf);
-    PUSH_DATA (push, vtxbuf);
-    PUSH_DATA (push, 0);
--   BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(0)), 2);
-+   if (screen->eng3d->oclass < TU102_3D_CLASS)
-+      BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(0)), 2);
-+   else
-+      BEGIN_NVC0(push, SUBC_3D(TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(0)), 2);
-    PUSH_DATAh(push, vtxbuf + length - 1);
-    PUSH_DATA (push, vtxbuf + length - 1);
- 
-@@ -1403,6 +1383,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
- 
-    /* restore viewport transform */
-    IMMED_NVC0(push, NVC0_3D(VIEWPORT_TRANSFORM_EN), 1);
-+   if (screen->eng3d->oclass >= TU102_3D_CLASS)
-+      IMMED_NVC0(push, SUBC_3D(TU102_3D_SET_COLOR_RENDER_TO_ZETA_SURFACE), 0);
- }
- 
- static void
-@@ -1697,7 +1679,6 @@ nvc0_blitter_create(struct nvc0_screen *screen)
- 
-    (void) mtx_init(&screen->blitter->mutex, mtx_plain);
- 
--   nvc0_blitter_make_vp(screen->blitter);
-    nvc0_blitter_make_sampler(screen->blitter);
- 
-    return true;
-diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
-index 92bd7eb5b8e..8287d8431b1 100644
---- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
-+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
-@@ -360,7 +360,11 @@ nvc0_validate_vertex_buffers(struct nvc0_context *nvc0)
-          PUSH_DATAh(push, res->address + offset);
-          PUSH_DATA (push, res->address + offset);
-       }
--      BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(i)), 2);
-+
-+      if (nvc0->screen->eng3d->oclass < TU102_3D_CLASS)
-+         BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(i)), 2);
-+      else
-+         BEGIN_NVC0(push, SUBC_3D(TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(i)), 2);
-       PUSH_DATAh(push, res->address + limit);
-       PUSH_DATA (push, res->address + limit);
- 
-@@ -406,7 +410,11 @@ nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0)
-       PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride);
-       PUSH_DATAh(push, buf->address + offset);
-       PUSH_DATA (push, buf->address + offset);
--      BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(b)), 2);
-+
-+      if (nvc0->screen->eng3d->oclass < TU102_3D_CLASS)
-+         BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(b)), 2);
-+      else
-+         BEGIN_NVC0(push, SUBC_3D(TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(b)), 2);
-       PUSH_DATAh(push, buf->address + limit);
-       PUSH_DATA (push, buf->address + limit);
- 
-@@ -961,12 +969,23 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
-       assert(nouveau_resource_mapped_by_gpu(&buf->base));
- 
-       PUSH_SPACE(push, 6);
--      BEGIN_NVC0(push, NVC0_3D(INDEX_ARRAY_START_HIGH), 5);
--      PUSH_DATAh(push, buf->address);
--      PUSH_DATA (push, buf->address);
--      PUSH_DATAh(push, buf->address + buf->base.width0 - 1);
--      PUSH_DATA (push, buf->address + buf->base.width0 - 1);
--      PUSH_DATA (push, info->index_size >> 1);
-+      if (nvc0->screen->eng3d->oclass < TU102_3D_CLASS) {
-+         BEGIN_NVC0(push, NVC0_3D(INDEX_ARRAY_START_HIGH), 5);
-+         PUSH_DATAh(push, buf->address);
-+         PUSH_DATA (push, buf->address);
-+         PUSH_DATAh(push, buf->address + buf->base.width0 - 1);
-+         PUSH_DATA (push, buf->address + buf->base.width0 - 1);
-+         PUSH_DATA (push, info->index_size >> 1);
-+      } else {
-+         BEGIN_NVC0(push, NVC0_3D(INDEX_ARRAY_START_HIGH), 2);
-+         PUSH_DATAh(push, buf->address);
-+         PUSH_DATA (push, buf->address);
-+         BEGIN_NVC0(push, SUBC_3D(TU102_3D_INDEX_ARRAY_LIMIT_HIGH), 2);
-+         PUSH_DATAh(push, buf->address + buf->base.width0 - 1);
-+         PUSH_DATA (push, buf->address + buf->base.width0 - 1);
-+         BEGIN_NVC0(push, NVC0_3D(INDEX_FORMAT), 1);
-+         PUSH_DATA (push, info->index_size >> 1);
-+      }
- 
-       BCTX_REFN(nvc0->bufctx_3d, 3D_IDX, buf, RD);
-    }
-diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
-index 8aa7088dfec..d49a5dfd2cf 100644
---- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
-+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
-@@ -228,7 +228,11 @@ nvc0_push_setup_vertex_array(struct nvc0_context *nvc0, const unsigned count)
-    BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_START_HIGH(0)), 2);
-    PUSH_DATAh(push, va);
-    PUSH_DATA (push, va);
--   BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(0)), 2);
-+
-+   if (nvc0->screen->eng3d->oclass < TU102_3D_CLASS)
-+      BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(0)), 2);
-+   else
-+      BEGIN_NVC0(push, SUBC_3D(TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(0)), 2);
-    PUSH_DATAh(push, va + size - 1);
-    PUSH_DATA (push, va + size - 1);
- 
-@@ -771,7 +775,11 @@ nvc0_push_upload_vertex_ids(struct push_context *ctx,
-    PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | index_size);
-    PUSH_DATAh(push, va);
-    PUSH_DATA (push, va);
--   BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(1)), 2);
-+
-+   if (nvc0->screen->eng3d->oclass < TU102_3D_CLASS)
-+      BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(1)), 2);
-+   else
-+      BEGIN_NVC0(push, SUBC_3D(TU102_3D_VERTEX_ARRAY_LIMIT_HIGH(1)), 2);
-    PUSH_DATAh(push, va + info->count * index_size - 1);
-    PUSH_DATA (push, va + info->count * index_size - 1);
- 
-diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
-index 146eeb35f85..ebbc410184b 100644
---- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
-+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
-@@ -27,11 +27,18 @@
- 
- #include "codegen/nv50_ir_driver.h"
- 
--#ifndef NDEBUG
--static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *);
--static void gp100_compute_dump_launch_desc(const struct gp100_cp_launch_desc *);
--#endif
--
-+#include "drf.h"
-+#include "qmd.h"
-+#include "cla0c0qmd.h"
-+#include "clc0c0qmd.h"
-+#include "clc3c0qmd.h"
-+
-+#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a)
-+#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a)
-+#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a)
-+#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a)
-+#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
-+#define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)
- 
- int
- nve4_screen_compute_setup(struct nvc0_screen *screen,
-@@ -45,6 +52,12 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
-    uint64_t address;
- 
-    switch (dev->chipset & ~0xf) {
-+   case 0x160:
-+      obj_class = TU102_COMPUTE_CLASS;
-+      break;
-+   case 0x140:
-+      obj_class = GV100_COMPUTE_CLASS;
-+      break;
-    case 0x100:
-    case 0xf0:
-       obj_class = NVF0_COMPUTE_CLASS; /* GK110 */
-@@ -88,24 +101,35 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
-    PUSH_DATAh(push, screen->tls->size / screen->mp_count);
-    PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
-    PUSH_DATA (push, 0xff);
--   BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3);
--   PUSH_DATAh(push, screen->tls->size / screen->mp_count);
--   PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
--   PUSH_DATA (push, 0xff);
-+   if (obj_class < GV100_COMPUTE_CLASS) {
-+      BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3);
-+      PUSH_DATAh(push, screen->tls->size / screen->mp_count);
-+      PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
-+      PUSH_DATA (push, 0xff);
-+   }
- 
-    /* Unified address space ? Who needs that ? Certainly not OpenCL.
-     *
-     * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be
-     *  accessible. We cannot prevent that at the moment, so expect failure.
-     */
--   BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
--   PUSH_DATA (push, 0xff << 24);
--   BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
--   PUSH_DATA (push, 0xfe << 24);
--
--   BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
--   PUSH_DATAh(push, screen->text->offset);
--   PUSH_DATA (push, screen->text->offset);
-+   if (obj_class < GV100_COMPUTE_CLASS) {
-+      BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
-+      PUSH_DATA (push, 0xff << 24);
-+      BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
-+      PUSH_DATA (push, 0xfe << 24);
-+
-+      BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
-+      PUSH_DATAh(push, screen->text->offset);
-+      PUSH_DATA (push, screen->text->offset);
-+   } else {
-+      BEGIN_NVC0(push, SUBC_CP(0x2a0), 2);
-+      PUSH_DATAh(push, 0xfeULL << 24);
-+      PUSH_DATA (push, 0xfeULL << 24);
-+      BEGIN_NVC0(push, SUBC_CP(0x7b0), 2);
-+      PUSH_DATAh(push, 0xffULL << 24);
-+      PUSH_DATA (push, 0xffULL << 24);
-+   }
- 
-    BEGIN_NVC0(push, SUBC_CP(0x0310), 1);
-    PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300);
-@@ -542,14 +566,35 @@ nve4_compute_upload_input(struct nvc0_context *nvc0,
-    PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
- }
- 
--static inline uint8_t
--nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
-+static inline void
-+gp100_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
-+                            struct nouveau_bo *bo, uint32_t base, uint32_t size)
-+{
-+   uint64_t address = bo->offset + base;
-+
-+   assert(index < 8);
-+   assert(!(base & 0xff));
-+
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index,
-+                                 DIV_ROUND_UP(size, 16));
-+   NVC0C0_QMDV02_01_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
-+}
-+
-+static inline void
-+nve4_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index, struct nouveau_bo *bo,
-+                           uint32_t base, uint32_t size)
- {
--   if (shared_size > (32 << 10))
--      return NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1;
--   if (shared_size > (16 << 10))
--      return NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1;
--   return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
-+   uint64_t address = bo->offset + base;
-+
-+   assert(index < 8);
-+   assert(!(base & 0xff));
-+
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_SIZE, index, size);
-+   NVA0C0_QMDV00_06_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
- }
- 
- static void
-@@ -577,92 +622,186 @@ nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
- }
- 
- static void
--nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
--                               struct nve4_cp_launch_desc *desc,
-+nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
-                                const struct pipe_grid_info *info)
- {
-    const struct nvc0_screen *screen = nvc0->screen;
-    const struct nvc0_program *cp = nvc0->compprog;
- 
--   nve4_cp_launch_desc_init_default(desc);
--
--   desc->entry = nvc0_program_symbol_offset(cp, info->pc);
--
--   desc->griddim_x = info->grid[0];
--   desc->griddim_y = info->grid[1];
--   desc->griddim_z = info->grid[2];
--   desc->blockdim_x = info->block[0];
--   desc->blockdim_y = info->block[1];
--   desc->blockdim_z = info->block[2];
--
--   desc->shared_size = align(cp->cp.smem_size, 0x100);
--   desc->local_size_p = (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10);
--   desc->local_size_n = 0;
--   desc->cstack_size = 0x800;
--   desc->cache_split = nve4_compute_derive_cache_split(nvc0, cp->cp.smem_size);
-+   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, TRUE);
-+   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, TRUE);
-+   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_DATA_CACHE, TRUE);
-+   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_DATA_CACHE, TRUE);
-+   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, TRUE);
-+   NVA0C0_QMDV00_06_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR);
-+   NVA0C0_QMDV00_06_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR);
-+   NVA0C0_QMDV00_06_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, SASS_VERSION, 0x30);
-+
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, PROGRAM_OFFSET,
-+                                 nvc0_program_symbol_offset(cp, info->pc));
-+
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);
-+
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, SHARED_MEMORY_SIZE,
-+                                 align(cp->cp.smem_size, 0x100));
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
-+                                 (cp->hdr[1] & 0xfffff0) +
-+                                 align(cp->cp.lmem_size, 0x10));
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);
-+
-+   if (cp->cp.smem_size > (32 << 10))
-+      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
-+                                    DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
-+   else
-+   if (cp->cp.smem_size > (16 << 10))
-+      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
-+                                    DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB);
-+   else
-+      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
-+                                    DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB);
- 
--   desc->gpr_alloc = cp->num_gprs;
--   desc->bar_alloc = cp->num_barriers;
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs);
-+   NVA0C0_QMDV00_06_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);
- 
-    // Only bind user uniforms and the driver constant buffer through the
-    // launch descriptor because UBOs are sticked to the driver cb to avoid the
-    // limitation of 8 CBs.
-    if (nvc0->constbuf[5][0].user || cp->parm_size) {
--      nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
-+      nve4_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
-                                  NVC0_CB_USR_INFO(5), 1 << 16);
- 
-       // Later logic will attempt to bind a real buffer at position 0. That
-       // should not happen if we've bound a user buffer.
-       assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
-    }
--   nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
-+   nve4_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
-                               NVC0_CB_AUX_INFO(5), 1 << 11);
- 
--   nve4_compute_setup_buf_cb(nvc0, false, desc);
-+   nve4_compute_setup_buf_cb(nvc0, false, qmd);
- }
- 
- static void
--gp100_compute_setup_launch_desc(struct nvc0_context *nvc0,
--                                struct gp100_cp_launch_desc *desc,
-+gp100_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
-                                 const struct pipe_grid_info *info)
- {
-    const struct nvc0_screen *screen = nvc0->screen;
-    const struct nvc0_program *cp = nvc0->compprog;
- 
--   gp100_cp_launch_desc_init_default(desc);
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
-+   NVC0C0_QMDV02_01_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR);
-+   NVC0C0_QMDV02_01_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR);
-+   NVC0C0_QMDV02_01_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
-+
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, PROGRAM_OFFSET,
-+                                 nvc0_program_symbol_offset(cp, info->pc));
-+
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);
-+
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, SHARED_MEMORY_SIZE,
-+                                 align(cp->cp.smem_size, 0x100));
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
-+                                 (cp->hdr[1] & 0xfffff0) +
-+                                 align(cp->cp.lmem_size, 0x10));
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);
- 
--   desc->entry = nvc0_program_symbol_offset(cp, info->pc);
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs);
-+   NVC0C0_QMDV02_01_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);
- 
--   desc->griddim_x = info->grid[0];
--   desc->griddim_y = info->grid[1];
--   desc->griddim_z = info->grid[2];
--   desc->blockdim_x = info->block[0];
--   desc->blockdim_y = info->block[1];
--   desc->blockdim_z = info->block[2];
-+   // Only bind user uniforms and the driver constant buffer through the
-+   // launch descriptor because UBOs are sticked to the driver cb to avoid the
-+   // limitation of 8 CBs.
-+   if (nvc0->constbuf[5][0].user || cp->parm_size) {
-+      gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
-+                                  NVC0_CB_USR_INFO(5), 1 << 16);
- 
--   desc->shared_size = align(cp->cp.smem_size, 0x100);
--   desc->local_size_p = (cp->hdr[1] & 0xfffff0) + align(cp->cp.lmem_size, 0x10);
--   desc->local_size_n = 0;
--   desc->cstack_size = 0x800;
-+      // Later logic will attempt to bind a real buffer at position 0. That
-+      // should not happen if we've bound a user buffer.
-+      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
-+   }
-+   gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
-+                               NVC0_CB_AUX_INFO(5), 1 << 11);
-+
-+   nve4_compute_setup_buf_cb(nvc0, true, qmd);
-+}
-+
-+static int
-+gv100_sm_config_smem_size(u32 size)
-+{
-+   if      (size > 64 * 1024) size = 96 * 1024;
-+   else if (size > 32 * 1024) size = 64 * 1024;
-+   else if (size > 16 * 1024) size = 32 * 1024;
-+   else if (size >  8 * 1024) size = 16 * 1024;
-+   else                       size =  8 * 1024;
-+   return (size / 4096) + 1;
-+}
- 
--   desc->gpr_alloc = cp->num_gprs;
--   desc->bar_alloc = cp->num_barriers;
-+static void
-+gv100_compute_setup_launch_desc(struct nvc0_context *nvc0, u32 *qmd,
-+                                const struct pipe_grid_info *info)
-+{
-+   struct nvc0_program *cp = nvc0->compprog;
-+   struct nvc0_screen *screen = nvc0->screen;
-+   uint64_t entry =
-+      screen->text->offset + nvc0_program_symbol_offset(cp, info->pc);
-+
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
-+   NVC3C0_QMDV02_02_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
-+   NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, INDEPENDENTLY);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE,
-+                                  align(cp->cp.smem_size, 0x100));
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
-+                                 (cp->hdr[1] & 0xfffff0) +
-+                                 align(cp->cp.lmem_size, 0x10));
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
-+                                  gv100_sm_config_smem_size(8 * 1024));
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE,
-+                                  gv100_sm_config_smem_size(96 * 1024));
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_VERSION, 2);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_MAJOR_VERSION, 2);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE,
-+                                  gv100_sm_config_smem_size(cp->cp.smem_size));
-+
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, REGISTER_COUNT_V, cp->num_gprs);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);
- 
-    // Only bind user uniforms and the driver constant buffer through the
-    // launch descriptor because UBOs are sticked to the driver cb to avoid the
-    // limitation of 8 CBs.
-    if (nvc0->constbuf[5][0].user || cp->parm_size) {
--      gp100_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
-+      gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
-                                   NVC0_CB_USR_INFO(5), 1 << 16);
- 
-       // Later logic will attempt to bind a real buffer at position 0. That
-       // should not happen if we've bound a user buffer.
-       assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
-    }
--   gp100_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
-+   gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
-                                NVC0_CB_AUX_INFO(5), 1 << 11);
- 
--   nve4_compute_setup_buf_cb(nvc0, true, desc);
-+   nve4_compute_setup_buf_cb(nvc0, true, qmd);
-+
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, entry & 0xffffffff);
-+   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, entry >> 32);
- }
- 
- static inline void *
-@@ -677,6 +816,7 @@ nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
-       ptr += adj;
-       *pgpuaddr += adj;
-    }
-+   memset(ptr, 0x00, 256);
-    return ptr;
- }
- 
-@@ -734,6 +874,9 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
-    if (ret)
-       goto out;
- 
-+   if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS)
-+      gv100_compute_setup_launch_desc(nvc0, desc, info);
-+   else
-    if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
-       gp100_compute_setup_launch_desc(nvc0, desc, info);
-    else
-@@ -743,10 +886,14 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
- 
- #ifndef NDEBUG
-    if (debug_get_num_option("NV50_PROG_DEBUG", 0)) {
-+      debug_printf("Queue Meta Data:\n");
-+      if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS)
-+         NVC3C0QmdDump_V02_02(desc);
-+      else
-       if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
--         gp100_compute_dump_launch_desc(desc);
-+         NVC0C0QmdDump_V02_01(desc);
-       else
--         nve4_compute_dump_launch_desc(desc);
-+         NVA0C0QmdDump_V00_06(desc);
-    }
- #endif
- 
-@@ -877,115 +1024,6 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
-    nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES;
- }
- 
--
--#ifndef NDEBUG
--static const char *nve4_cache_split_name(unsigned value)
--{
--   switch (value) {
--   case NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1: return "16K_SHARED_48K_L1";
--   case NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1: return "32K_SHARED_32K_L1";
--   case NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1: return "48K_SHARED_16K_L1";
--   default:
--      return "(invalid)";
--   }
--}
--
--static void
--nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
--{
--   const uint32_t *data = (const uint32_t *)desc;
--   unsigned i;
--   bool zero = false;
--
--   debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n");
--
--   for (i = 0; i < sizeof(*desc); i += 4) {
--      if (data[i / 4]) {
--         debug_printf("[%x]: 0x%08x\n", i, data[i / 4]);
--         zero = false;
--      } else
--      if (!zero) {
--         debug_printf("...\n");
--         zero = true;
--      }
--   }
--
--   debug_printf("entry = 0x%x\n", desc->entry);
--   debug_printf("grid dimensions = %ux%ux%u\n",
--                desc->griddim_x, desc->griddim_y, desc->griddim_z);
--   debug_printf("block dimensions = %ux%ux%u\n",
--                desc->blockdim_x, desc->blockdim_y, desc->blockdim_z);
--   debug_printf("s[] size: 0x%x\n", desc->shared_size);
--   debug_printf("l[] size: -0x%x / +0x%x\n",
--                desc->local_size_n, desc->local_size_p);
--   debug_printf("stack size: 0x%x\n", desc->cstack_size);
--   debug_printf("barrier count: %u\n", desc->bar_alloc);
--   debug_printf("$r count: %u\n", desc->gpr_alloc);
--   debug_printf("cache split: %s\n", nve4_cache_split_name(desc->cache_split));
--   debug_printf("linked tsc: %d\n", desc->linked_tsc);
--
--   for (i = 0; i < 8; ++i) {
--      uint64_t address;
--      uint32_t size = desc->cb[i].size;
--      bool valid = !!(desc->cb_mask & (1 << i));
--
--      address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l;
--
--      if (!valid && !address && !size)
--         continue;
--      debug_printf("CB[%u]: address = 0x%"PRIx64", size 0x%x%s\n",
--                   i, address, size, valid ? "" : "  (invalid)");
--   }
--}
--
--static void
--gp100_compute_dump_launch_desc(const struct gp100_cp_launch_desc *desc)
--{
--   const uint32_t *data = (const uint32_t *)desc;
--   unsigned i;
--   bool zero = false;
--
--   debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n");
--
--   for (i = 0; i < sizeof(*desc); i += 4) {
--      if (data[i / 4]) {
--         debug_printf("[%x]: 0x%08x\n", i, data[i / 4]);
--         zero = false;
--      } else
--      if (!zero) {
--         debug_printf("...\n");
--         zero = true;
--      }
--   }
--
--   debug_printf("entry = 0x%x\n", desc->entry);
--   debug_printf("grid dimensions = %ux%ux%u\n",
--                desc->griddim_x, desc->griddim_y, desc->griddim_z);
--   debug_printf("block dimensions = %ux%ux%u\n",
--                desc->blockdim_x, desc->blockdim_y, desc->blockdim_z);
--   debug_printf("s[] size: 0x%x\n", desc->shared_size);
--   debug_printf("l[] size: -0x%x / +0x%x\n",
--                desc->local_size_n, desc->local_size_p);
--   debug_printf("stack size: 0x%x\n", desc->cstack_size);
--   debug_printf("barrier count: %u\n", desc->bar_alloc);
--   debug_printf("$r count: %u\n", desc->gpr_alloc);
--   debug_printf("linked tsc: %d\n", desc->linked_tsc);
--
--   for (i = 0; i < 8; ++i) {
--      uint64_t address;
--      uint32_t size = desc->cb[i].size_sh4 << 4;
--      bool valid = !!(desc->cb_mask & (1 << i));
--
--      address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l;
--
--      if (!valid && !address && !size)
--         continue;
--      debug_printf("CB[%u]: address = 0x%"PRIx64", size 0x%x%s\n",
--                   i, address, size, valid ? "" : "  (invalid)");
--   }
--}
--#endif
--
- #ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
- static void
- nve4_compute_trap_info(struct nvc0_context *nvc0)
-diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
-index 7ff6935cc3d..d2599f7a71d 100644
---- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
-+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
-@@ -4,142 +4,6 @@
- 
- #include "nvc0/nve4_compute.xml.h"
- 
--struct nve4_cp_launch_desc
--{
--   u32 unk0[8];
--   u32 entry;
--   u32 unk9[2];
--   u32 unk11_0      : 30;
--   u32 linked_tsc   : 1;
--   u32 unk11_31     : 1;
--   u32 griddim_x    : 31;
--   u32 unk12        : 1;
--   u16 griddim_y;
--   u16 griddim_z;
--   u32 unk14[3];
--   u16 shared_size; /* must be aligned to 0x100 */
--   u16 unk17;
--   u16 unk18;
--   u16 blockdim_x;
--   u16 blockdim_y;
--   u16 blockdim_z;
--   u32 cb_mask      : 8;
--   u32 unk20_8      : 21;
--   u32 cache_split  : 2;
--   u32 unk20_31     : 1;
--   u32 unk21[8];
--   struct {
--      u32 address_l;
--      u32 address_h : 8;
--      u32 reserved  : 7;
--      u32 size      : 17;
--   } cb[8];
--   u32 local_size_p : 20;
--   u32 unk45_20     : 7;
--   u32 bar_alloc    : 5;
--   u32 local_size_n : 20;
--   u32 unk46_20     : 4;
--   u32 gpr_alloc    : 8;
--   u32 cstack_size  : 20;
--   u32 unk47_20     : 12;
--   u32 unk48[16];
--};
--
--struct gp100_cp_launch_desc
--{
--   u32 unk0[8];
--   u32 entry;
--   u32 unk9[2];
--   u32 unk11_0      : 30;
--   u32 linked_tsc   : 1;
--   u32 unk11_31     : 1;
--   u32 griddim_x    : 31;
--   u32 unk12        : 1;
--   u16 griddim_y;
--   u16 unk13;
--   u16 griddim_z;
--   u16 unk14;
--   u32 unk15[2];
--   u32 shared_size  : 18;
--   u32 unk17        : 14;
--   u16 unk18;
--   u16 blockdim_x;
--   u16 blockdim_y;
--   u16 blockdim_z;
--   u32 cb_mask      : 8;
--   u32 unk20        : 24;
--   u32 unk21[8];
--   u32 local_size_p : 24;
--   u32 unk29        : 3;
--   u32 bar_alloc    : 5;
--   u32 local_size_n : 24;
--   u32 gpr_alloc    : 8;
--   u32 cstack_size  : 24;
--   u32 unk31        : 8;
--   struct {
--      u32 address_l;
--      u32 address_h : 17;
--      u32 reserved  : 2;
--      u32 size_sh4  : 13;
--   } cb[8];
--   u32 unk48[16];
--};
--
--static inline void
--nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc)
--{
--   memset(desc, 0, sizeof(*desc));
--
--   desc->unk0[7]  = 0xbc000000;
--   desc->unk11_0  = 0x04014000;
--   desc->unk47_20 = 0x300;
--}
--
--static inline void
--nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
--                           unsigned index,
--                           struct nouveau_bo *bo,
--                           uint32_t base, uint32_t size)
--{
--   uint64_t address = bo->offset + base;
--
--   assert(index < 8);
--   assert(!(base & 0xff));
--
--   desc->cb[index].address_l = address;
--   desc->cb[index].address_h = address >> 32;
--   desc->cb[index].size = size;
--
--   desc->cb_mask |= 1 << index;
--}
--
--static inline void
--gp100_cp_launch_desc_init_default(struct gp100_cp_launch_desc *desc)
--{
--   memset(desc, 0, sizeof(*desc));
--
--   desc->unk0[4]  = 0x40;
--   desc->unk11_0  = 0x04014000;
--}
--
--static inline void
--gp100_cp_launch_desc_set_cb(struct gp100_cp_launch_desc *desc,
--                            unsigned index,
--                            struct nouveau_bo *bo,
--                            uint32_t base, uint32_t size)
--{
--   uint64_t address = bo->offset + base;
--
--   assert(index < 8);
--   assert(!(base & 0xff));
--
--   desc->cb[index].address_l = address;
--   desc->cb[index].address_h = address >> 32;
--   desc->cb[index].size_sh4 = DIV_ROUND_UP(size, 16);
--
--   desc->cb_mask |= 1 << index;
--}
--
- struct nve4_mp_trap_info {
-    u32 lock;
-    u32 pc;
-diff --git a/src/gallium/drivers/nouveau/nvc0/qmd.h b/src/gallium/drivers/nouveau/nvc0/qmd.h
-new file mode 100644
-index 00000000000..86c290fe836
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/nvc0/qmd.h
-@@ -0,0 +1,68 @@
-+#ifndef __NVHW_QMD_H__
-+#define __NVHW_QMD_H__
-+#include <stdio.h>
-+#include <stdint.h>
-+#include "util/u_debug.h"
-+#include "drf.h"
-+
-+#define NVQMD_ENUM_1(X,drf,v0)                                                 \
-+   [drf##_##v0] = #v0
-+#define NVQMD_ENUM_2(X,drf,v0,v1)                                              \
-+   [drf##_##v0] = #v0,                                                         \
-+   [drf##_##v1] = #v1
-+#define NVQMD_ENUM_3(X,drf,v0,v1,v2)                                           \
-+   [drf##_##v0] = #v0,                                                         \
-+   [drf##_##v1] = #v1,                                                         \
-+   [drf##_##v2] = #v2
-+#define NVQMD_ENUM_8(X,drf,v0,v1,v2,v3,v4,v5,v6,v7)                            \
-+   [drf##_##v0] = #v0,                                                         \
-+   [drf##_##v1] = #v1,                                                         \
-+   [drf##_##v2] = #v2,                                                         \
-+   [drf##_##v3] = #v3,                                                         \
-+   [drf##_##v4] = #v4,                                                         \
-+   [drf##_##v5] = #v5,                                                         \
-+   [drf##_##v6] = #v6,                                                         \
-+   [drf##_##v7] = #v7
-+
-+#define NVQMD_ENUM_(X,_1,_2,_3,_4,_5,_6,_7,_8,_9,IMPL,...) IMPL
-+#define NVQMD_ENUM(A...) NVQMD_ENUM_(X, ##A, NVQMD_ENUM_8, NVQMD_ENUM_7,       \
-+                                             NVQMD_ENUM_6, NVQMD_ENUM_5,       \
-+                                             NVQMD_ENUM_4, NVQMD_ENUM_3,       \
-+                                             NVQMD_ENUM_2, NVQMD_ENUM_1)(X, ##A)
-+
-+#define NVQMD_VAL_N(X,d,r,p,f,o) do {                                          \
-+   uint32_t val = NVVAL_MW_GET_X((p), d##_##r##_##f);                          \
-+   debug_printf("   %-36s: "o"\n", #f, val);                                   \
-+} while(0)
-+#define NVQMD_VAL_I(X,d,r,p,f,i,o) do {                                        \
-+   uint32_t val = NVVAL_MW_GET_X((p), d##_##r##_##f(i));                       \
-+   char name[80];                                                              \
-+   snprintf(name, sizeof(name), "%s(%d)", #f, i);                              \
-+   debug_printf("   %-36s: "o"\n", name, val);                                 \
-+} while(0)
-+#define NVQMD_VAL_(X,_1,_2,_3,_4,_5,_6,IMPL,...) IMPL
-+#define NVQMD_VAL(A...) NVQMD_VAL_(X, ##A, NVQMD_VAL_I, NVQMD_VAL_N)(X, ##A)
-+
-+#define NVQMD_DEF(d,r,p,f,e...) do {                                           \
-+   static const char *ev[] = { NVQMD_ENUM(d##_##r##_##f,##e) };                \
-+   uint32_t val = NVVAL_MW_GET((p), d, r, f);                                  \
-+   if (val < ARRAY_SIZE(ev) && ev[val])                                        \
-+      debug_printf("   %-36s: %s\n", #f, ev[val]);                             \
-+   else                                                                        \
-+      debug_printf("   %-36s: UNKNOWN 0x%x\n", #f, val);                       \
-+} while(0)
-+#define NVQMD_IDX(d,r,p,f,i,e...) do {                                         \
-+   static const char *ev[] = { NVQMD_ENUM(d##_##r##_##f,##e) };                \
-+   char name[80];                                                              \
-+   snprintf(name, sizeof(name), "%s(%d)", #f, i);                              \
-+   uint32_t val = NVVAL_MW_GET((p), d, r, f, i);                               \
-+   if (val < ARRAY_SIZE(ev) && ev[val])                                        \
-+      debug_printf("   %-36s: %s\n", name, ev[val]);                           \
-+   else                                                                        \
-+      debug_printf("   %-36s: UNKNOWN 0x%x\n", name, val);                     \
-+} while(0)
-+
-+void NVA0C0QmdDump_V00_06(uint32_t *);
-+void NVC0C0QmdDump_V02_01(uint32_t *);
-+void NVC3C0QmdDump_V02_02(uint32_t *);
-+#endif
-diff --git a/src/gallium/drivers/nouveau/nvc0/qmda0c0.c b/src/gallium/drivers/nouveau/nvc0/qmda0c0.c
-new file mode 100644
-index 00000000000..7103a893af5
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/nvc0/qmda0c0.c
-@@ -0,0 +1,166 @@
-+/*
-+ * Copyright 2020 Red Hat Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+#include "qmd.h"
-+#include "cla0c0qmd.h"
-+
-+#define NVA0C0_QMDV00_06_VAL(a...) NVQMD_VAL(NVA0C0, QMDV00_06, ##a)
-+#define NVA0C0_QMDV00_06_DEF(a...) NVQMD_DEF(NVA0C0, QMDV00_06, ##a)
-+#define NVA0C0_QMDV00_06_IDX(a...) NVQMD_IDX(NVA0C0, QMDV00_06, ##a)
-+
-+void
-+NVA0C0QmdDump_V00_06(uint32_t *qmd)
-+{
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_A, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_B, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_C, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_D, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_E, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_F, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_G, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_H, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_A_A, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_I, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_J, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_A, "0x%x");
-+   NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_K, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_L, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, SEMAPHORE_RELEASE_ENABLE0, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, SEMAPHORE_RELEASE_ENABLE1, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_B, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_M, "0x%x");
-+   NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_N, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_O, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_C, "0x%x");
-+   NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_TEXTURE_DATA_CACHE, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_SHADER_DATA_CACHE, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_INSTRUCTION_CACHE, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_VAL(qmd, PROGRAM_OFFSET, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_P, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_Q, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_D, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_R, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_S, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_E, "0x%x");
-+   NVA0C0_QMDV00_06_DEF(qmd, RELEASE_MEMBAR_TYPE, FE_NONE, FE_SYSMEMBAR);
-+   NVA0C0_QMDV00_06_DEF(qmd, CWD_MEMBAR_TYPE, L1_NONE, L1_SYSMEMBAR, L1_MEMBAR);
-+   NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_T, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_U, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, THROTTLED, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_E2_A, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_E2_B, "0x%x");
-+   NVA0C0_QMDV00_06_DEF(qmd, API_VISIBLE_CALL_LIMIT, _32, NO_CHECK);
-+   NVA0C0_QMDV00_06_DEF(qmd, SHARED_MEMORY_BANK_MAPPING, FOUR_BYTES_PER_BANK,
-+                                                         EIGHT_BYTES_PER_BANK);
-+   NVA0C0_QMDV00_06_DEF(qmd, SAMPLER_INDEX, INDEPENDENTLY, VIA_HEADER_INDEX);
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_E3_A, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_WIDTH, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_HEIGHT, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_DEPTH, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_WIDTH_RESUME, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_HEIGHT_RESUME, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, CTA_RASTER_DEPTH_RESUME, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_V, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_F, "0x%x");
-+   NVA0C0_QMDV00_06_DEF(qmd, QMD_RESERVED_V1_W, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_VAL(qmd, SHARED_MEMORY_SIZE, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_G, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_VERSION, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_MAJOR_VERSION, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_H, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, CTA_THREAD_DIMENSION0, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, CTA_THREAD_DIMENSION1, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, CTA_THREAD_DIMENSION2, "0x%x");
-+   for (int i = 0; i < 8; i++)
-+      NVA0C0_QMDV00_06_IDX(qmd, CONSTANT_BUFFER_VALID, i, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_I, "0x%x");
-+   NVA0C0_QMDV00_06_DEF(qmd, L1_CONFIGURATION,
-+                             DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB,
-+                             DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB,
-+                             DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_X, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_V1_Y, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, RELEASE0_ADDRESS_LOWER, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, RELEASE0_ADDRESS_UPPER, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_J, "0x%x");
-+   NVA0C0_QMDV00_06_DEF(qmd, RELEASE0_REDUCTION_OP, RED_ADD,
-+                                                    RED_MIN,
-+                                                    RED_MAX,
-+                                                    RED_INC,
-+                                                    RED_DEC,
-+                                                    RED_AND,
-+                                                    RED_OR,
-+                                                    RED_XOR);
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_K, "0x%x");
-+   NVA0C0_QMDV00_06_DEF(qmd, RELEASE0_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32);
-+   NVA0C0_QMDV00_06_DEF(qmd, RELEASE0_REDUCTION_ENABLE, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, RELEASE0_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD);
-+   NVA0C0_QMDV00_06_VAL(qmd, RELEASE0_PAYLOAD, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, RELEASE1_ADDRESS_LOWER, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, RELEASE1_ADDRESS_UPPER, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_L, "0x%x");
-+   NVA0C0_QMDV00_06_DEF(qmd, RELEASE1_REDUCTION_OP, RED_ADD,
-+                                                    RED_MIN,
-+                                                    RED_MAX,
-+                                                    RED_INC,
-+                                                    RED_DEC,
-+                                                    RED_AND,
-+                                                    RED_OR,
-+                                                    RED_XOR);
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_M, "0x%x");
-+   NVA0C0_QMDV00_06_DEF(qmd, RELEASE1_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32);
-+   NVA0C0_QMDV00_06_DEF(qmd, RELEASE1_REDUCTION_ENABLE, FALSE, TRUE);
-+   NVA0C0_QMDV00_06_DEF(qmd, RELEASE1_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD);
-+   NVA0C0_QMDV00_06_VAL(qmd, RELEASE1_PAYLOAD, "0x%x");
-+   for (int i = 0; i < 8; i++) {
-+      NVA0C0_QMDV00_06_VAL(qmd, CONSTANT_BUFFER_ADDR_LOWER, i, "0x%x");
-+      NVA0C0_QMDV00_06_VAL(qmd, CONSTANT_BUFFER_ADDR_UPPER, i, "0x%x");
-+      NVA0C0_QMDV00_06_VAL(qmd, CONSTANT_BUFFER_RESERVED_ADDR, i, "0x%x");
-+      NVA0C0_QMDV00_06_IDX(qmd, CONSTANT_BUFFER_INVALIDATE, i, FALSE, TRUE);
-+      NVA0C0_QMDV00_06_VAL(qmd, CONSTANT_BUFFER_SIZE, i, "0x%x");
-+   }
-+   NVA0C0_QMDV00_06_VAL(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_RESERVED_N, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, BARRIER_COUNT, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, REGISTER_COUNT, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, SASS_VERSION, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_A, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_B, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_C, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_D, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_E, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_F, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_G, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_H, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_I, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_J, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_K, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_L, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_M, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, QMD_SPARE_N, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, DEBUG_ID_UPPER, "0x%x");
-+   NVA0C0_QMDV00_06_VAL(qmd, DEBUG_ID_LOWER, "0x%x");
-+}
-diff --git a/src/gallium/drivers/nouveau/nvc0/qmdc0c0.c b/src/gallium/drivers/nouveau/nvc0/qmdc0c0.c
-new file mode 100644
-index 00000000000..945439ee0c8
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/nvc0/qmdc0c0.c
-@@ -0,0 +1,165 @@
-+/*
-+ * Copyright 2020 Red Hat Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+#include "qmd.h"
-+#include "clc0c0qmd.h"
-+
-+#define NVC0C0_QMDV02_01_VAL(a...) NVQMD_VAL(NVC0C0, QMDV02_01, ##a)
-+#define NVC0C0_QMDV02_01_DEF(a...) NVQMD_DEF(NVC0C0, QMDV02_01, ##a)
-+#define NVC0C0_QMDV02_01_IDX(a...) NVQMD_IDX(NVC0C0, QMDV02_01, ##a)
-+
-+void
-+NVC0C0QmdDump_V02_01(uint32_t *qmd)
-+{
-+   NVC0C0_QMDV02_01_VAL(qmd, OUTER_PUT, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, OUTER_OVERFLOW, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, OUTER_GET, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, OUTER_STICKY_OVERFLOW, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, INNER_GET, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, INNER_OVERFLOW, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, INNER_PUT, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, INNER_STICKY_OVERFLOW, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_GROUP_ID, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, SM_GLOBAL_CACHING_ENABLE, "0x%x");
-+   NVC0C0_QMDV02_01_DEF(qmd, RUN_CTA_IN_ONE_SM_PARTITION, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, IS_QUEUE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, SEMAPHORE_RELEASE_ENABLE0, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, SEMAPHORE_RELEASE_ENABLE1, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, REQUIRE_SCHEDULING_PCAS, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, DEPENDENT_QMD_SCHEDULE_ENABLE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, DEPENDENT_QMD_TYPE, QUEUE, GRID);
-+   NVC0C0_QMDV02_01_DEF(qmd, DEPENDENT_QMD_FIELD_COPY, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_B, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CIRCULAR_QUEUE_SIZE, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_C, "0x%x");
-+   NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_TEXTURE_DATA_CACHE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_SHADER_DATA_CACHE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_INSTRUCTION_CACHE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_WIDTH_RESUME, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_HEIGHT_RESUME, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_DEPTH_RESUME, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, PROGRAM_OFFSET, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CIRCULAR_QUEUE_ADDR_LOWER, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CIRCULAR_QUEUE_ADDR_UPPER, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_D, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CIRCULAR_QUEUE_ENTRY_SIZE, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CWD_REFERENCE_COUNT_ID, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CWD_REFERENCE_COUNT_DELTA_MINUS_ONE, "0x%x");
-+   NVC0C0_QMDV02_01_DEF(qmd, RELEASE_MEMBAR_TYPE, FE_NONE, FE_SYSMEMBAR);
-+   NVC0C0_QMDV02_01_DEF(qmd, CWD_REFERENCE_COUNT_INCR_ENABLE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, CWD_MEMBAR_TYPE, L1_NONE, L1_SYSMEMBAR, L1_MEMBAR);
-+   NVC0C0_QMDV02_01_DEF(qmd, SEQUENTIALLY_RUN_CTAS, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, CWD_REFERENCE_COUNT_DECR_ENABLE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, THROTTLED, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, API_VISIBLE_CALL_LIMIT, _32, NO_CHECK);
-+   NVC0C0_QMDV02_01_DEF(qmd, SAMPLER_INDEX, INDEPENDENTLY, VIA_HEADER_INDEX);
-+   NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_WIDTH, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_HEIGHT, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED13A, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CTA_RASTER_DEPTH, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED14A, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, DEPENDENT_QMD_POINTER, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QUEUE_ENTRIES_PER_CTA_MINUS_ONE, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, COALESCE_WAITING_PERIOD, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, SHARED_MEMORY_SIZE, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_G, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_VERSION, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_MAJOR_VERSION, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_H, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CTA_THREAD_DIMENSION0, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CTA_THREAD_DIMENSION1, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, CTA_THREAD_DIMENSION2, "0x%x");
-+   for (int i = 0; i < 8; i++)
-+      NVC0C0_QMDV02_01_IDX(qmd, CONSTANT_BUFFER_VALID, i, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_I, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, SM_DISABLE_MASK_LOWER, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, SM_DISABLE_MASK_UPPER, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, RELEASE0_ADDRESS_LOWER, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, RELEASE0_ADDRESS_UPPER, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_J, "0x%x");
-+   NVC0C0_QMDV02_01_DEF(qmd, RELEASE0_REDUCTION_OP, RED_ADD,
-+                                                    RED_MIN,
-+                                                    RED_MAX,
-+                                                    RED_INC,
-+                                                    RED_DEC,
-+                                                    RED_AND,
-+                                                    RED_OR,
-+                                                    RED_XOR);
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_K, "0x%x");
-+   NVC0C0_QMDV02_01_DEF(qmd, RELEASE0_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32);
-+   NVC0C0_QMDV02_01_DEF(qmd, RELEASE0_REDUCTION_ENABLE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, RELEASE0_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD);
-+   NVC0C0_QMDV02_01_VAL(qmd, RELEASE0_PAYLOAD, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, RELEASE1_ADDRESS_LOWER, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, RELEASE1_ADDRESS_UPPER, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_L, "0x%x");
-+   NVC0C0_QMDV02_01_DEF(qmd, RELEASE1_REDUCTION_OP, RED_ADD,
-+                                                    RED_MIN,
-+                                                    RED_MAX,
-+                                                    RED_INC,
-+                                                    RED_DEC,
-+                                                    RED_AND,
-+                                                    RED_OR,
-+                                                    RED_XOR);
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_M, "0x%x");
-+   NVC0C0_QMDV02_01_DEF(qmd, RELEASE1_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32);
-+   NVC0C0_QMDV02_01_DEF(qmd, RELEASE1_REDUCTION_ENABLE, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_DEF(qmd, RELEASE1_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD);
-+   NVC0C0_QMDV02_01_VAL(qmd, RELEASE1_PAYLOAD, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_N, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, BARRIER_COUNT, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, REGISTER_COUNT, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, SASS_VERSION, "0x%x");
-+   for (int i = 0; i < 8; i++) {
-+      NVC0C0_QMDV02_01_VAL(qmd, CONSTANT_BUFFER_ADDR_LOWER, i, "0x%x");
-+      NVC0C0_QMDV02_01_VAL(qmd, CONSTANT_BUFFER_ADDR_UPPER, i, "0x%x");
-+      NVC0C0_QMDV02_01_VAL(qmd, CONSTANT_BUFFER_RESERVED_ADDR, i, "0x%x");
-+      NVC0C0_QMDV02_01_IDX(qmd, CONSTANT_BUFFER_INVALIDATE, i, FALSE, TRUE);
-+      NVC0C0_QMDV02_01_VAL(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, i, "0x%x");
-+   }
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_R, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_S, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_INNER_GET, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_REQUIRE_SCHEDULING_PCAS, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_INNER_PUT, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_SCG_TYPE, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_SPAN_LIST_HEAD_INDEX, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_RESERVED_Q, "0x%x");
-+   NVC0C0_QMDV02_01_DEF(qmd, HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID, FALSE, TRUE);
-+   NVC0C0_QMDV02_01_VAL(qmd, HW_ONLY_SKED_NEXT_QMD_POINTER, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_G, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_H, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_I, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_J, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_K, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_L, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_M, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, QMD_SPARE_N, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, DEBUG_ID_UPPER, "0x%x");
-+   NVC0C0_QMDV02_01_VAL(qmd, DEBUG_ID_LOWER, "0x%x");
-+}
-diff --git a/src/gallium/drivers/nouveau/nvc0/qmdc3c0.c b/src/gallium/drivers/nouveau/nvc0/qmdc3c0.c
-new file mode 100644
-index 00000000000..c9bd8966114
---- /dev/null
-+++ b/src/gallium/drivers/nouveau/nvc0/qmdc3c0.c
-@@ -0,0 +1,168 @@
-+/*
-+ * Copyright 2020 Red Hat Inc.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the "Software"),
-+ * to deal in the Software without restriction, including without limitation
-+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
-+ * and/or sell copies of the Software, and to permit persons to whom the
-+ * Software is furnished to do so, subject to the following conditions:
-+ *
-+ * The above copyright notice and this permission notice shall be included in
-+ * all copies or substantial portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
-+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-+ * OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+#include "qmd.h"
-+#include "clc3c0qmd.h"
-+
-+#define NVC3C0_QMDV02_02_VAL(a...) NVQMD_VAL(NVC3C0, QMDV02_02, ##a)
-+#define NVC3C0_QMDV02_02_DEF(a...) NVQMD_DEF(NVC3C0, QMDV02_02, ##a)
-+#define NVC3C0_QMDV02_02_IDX(a...) NVQMD_IDX(NVC3C0, QMDV02_02, ##a)
-+
-+void
-+NVC3C0QmdDump_V02_02(uint32_t *qmd)
-+{
-+   NVC3C0_QMDV02_02_VAL(qmd, OUTER_PUT, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, OUTER_OVERFLOW, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, OUTER_GET, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, OUTER_STICKY_OVERFLOW, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, INNER_GET, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, INNER_OVERFLOW, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, INNER_PUT, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, INNER_STICKY_OVERFLOW, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_GROUP_ID, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, SM_GLOBAL_CACHING_ENABLE, "0x%x");
-+   NVC3C0_QMDV02_02_DEF(qmd, RUN_CTA_IN_ONE_SM_PARTITION, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, IS_QUEUE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, ADD_TO_HEAD_OF_QMD_GROUP_LINKED_LIST, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, SEMAPHORE_RELEASE_ENABLE0, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, SEMAPHORE_RELEASE_ENABLE1, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, REQUIRE_SCHEDULING_PCAS, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, DEPENDENT_QMD_SCHEDULE_ENABLE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, DEPENDENT_QMD_TYPE, QUEUE, GRID);
-+   NVC3C0_QMDV02_02_DEF(qmd, DEPENDENT_QMD_FIELD_COPY, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_B, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CIRCULAR_QUEUE_SIZE, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_C, "0x%x");
-+   NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_TEXTURE_DATA_CACHE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_SHADER_DATA_CACHE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_INSTRUCTION_CACHE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_WIDTH_RESUME, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_HEIGHT_RESUME, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_DEPTH_RESUME, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, PROGRAM_OFFSET, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CIRCULAR_QUEUE_ADDR_LOWER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CIRCULAR_QUEUE_ADDR_UPPER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_D, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CIRCULAR_QUEUE_ENTRY_SIZE, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CWD_REFERENCE_COUNT_ID, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CWD_REFERENCE_COUNT_DELTA_MINUS_ONE, "0x%x");
-+   NVC3C0_QMDV02_02_DEF(qmd, RELEASE_MEMBAR_TYPE, FE_NONE, FE_SYSMEMBAR);
-+   NVC3C0_QMDV02_02_DEF(qmd, CWD_REFERENCE_COUNT_INCR_ENABLE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, CWD_MEMBAR_TYPE, L1_NONE, L1_SYSMEMBAR, L1_MEMBAR);
-+   NVC3C0_QMDV02_02_DEF(qmd, SEQUENTIALLY_RUN_CTAS, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, CWD_REFERENCE_COUNT_DECR_ENABLE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, API_VISIBLE_CALL_LIMIT, _32, NO_CHECK);
-+   NVC3C0_QMDV02_02_DEF(qmd, SAMPLER_INDEX, INDEPENDENTLY, VIA_HEADER_INDEX);
-+   NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_WIDTH, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_HEIGHT, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED13A, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CTA_RASTER_DEPTH, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED14A, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, DEPENDENT_QMD_POINTER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QUEUE_ENTRIES_PER_CTA_MINUS_ONE, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, COALESCE_WAITING_PERIOD, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, SHARED_MEMORY_SIZE, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_VERSION, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_MAJOR_VERSION, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_H, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CTA_THREAD_DIMENSION0, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CTA_THREAD_DIMENSION1, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, CTA_THREAD_DIMENSION2, "0x%x");
-+   for (int i = 0; i < 8; i++)
-+      NVC3C0_QMDV02_02_IDX(qmd, CONSTANT_BUFFER_VALID, i, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_VAL(qmd, REGISTER_COUNT_V, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, FREE_CTA_SLOTS_EMPTY_SM, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, SM_DISABLE_MASK_LOWER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, SM_DISABLE_MASK_UPPER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, RELEASE0_ADDRESS_LOWER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, RELEASE0_ADDRESS_UPPER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_J, "0x%x");
-+   NVC3C0_QMDV02_02_DEF(qmd, RELEASE0_REDUCTION_OP, RED_ADD,
-+                                                    RED_MIN,
-+                                                    RED_MAX,
-+                                                    RED_INC,
-+                                                    RED_DEC,
-+                                                    RED_AND,
-+                                                    RED_OR,
-+                                                    RED_XOR);
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_K, "0x%x");
-+   NVC3C0_QMDV02_02_DEF(qmd, RELEASE0_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32);
-+   NVC3C0_QMDV02_02_DEF(qmd, RELEASE0_REDUCTION_ENABLE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, RELEASE0_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD);
-+   NVC3C0_QMDV02_02_VAL(qmd, RELEASE0_PAYLOAD, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, RELEASE1_ADDRESS_LOWER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, RELEASE1_ADDRESS_UPPER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_L, "0x%x");
-+   NVC3C0_QMDV02_02_DEF(qmd, RELEASE1_REDUCTION_OP, RED_ADD,
-+                                                    RED_MIN,
-+                                                    RED_MAX,
-+                                                    RED_INC,
-+                                                    RED_DEC,
-+                                                    RED_AND,
-+                                                    RED_OR,
-+                                                    RED_XOR);
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_M, "0x%x");
-+   NVC3C0_QMDV02_02_DEF(qmd, RELEASE1_REDUCTION_FORMAT, UNSIGNED_32, SIGNED_32);
-+   NVC3C0_QMDV02_02_DEF(qmd, RELEASE1_REDUCTION_ENABLE, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_DEF(qmd, RELEASE1_STRUCTURE_SIZE, FOUR_WORDS, ONE_WORD);
-+   NVC3C0_QMDV02_02_VAL(qmd, RELEASE1_PAYLOAD, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_N, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, BARRIER_COUNT, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, REGISTER_COUNT, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, SASS_VERSION, "0x%x");
-+   for (int i = 0; i < 8; i++) {
-+      NVC3C0_QMDV02_02_VAL(qmd, CONSTANT_BUFFER_ADDR_LOWER, i, "0x%x");
-+      NVC3C0_QMDV02_02_VAL(qmd, CONSTANT_BUFFER_ADDR_UPPER, i, "0x%x");
-+      NVC3C0_QMDV02_02_VAL(qmd, CONSTANT_BUFFER_RESERVED_ADDR, i, "0x%x");
-+      NVC3C0_QMDV02_02_IDX(qmd, CONSTANT_BUFFER_INVALIDATE, i, FALSE, TRUE);
-+      NVC3C0_QMDV02_02_VAL(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, i, "0x%x");
-+   }
-+   NVC3C0_QMDV02_02_VAL(qmd, PROGRAM_ADDRESS_LOWER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, PROGRAM_ADDRESS_UPPER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_S, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_INNER_GET, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_REQUIRE_SCHEDULING_PCAS, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_INNER_PUT, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_SCG_TYPE, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_SPAN_LIST_HEAD_INDEX, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_RESERVED_Q, "0x%x");
-+   NVC3C0_QMDV02_02_DEF(qmd, HW_ONLY_SPAN_LIST_HEAD_INDEX_VALID, FALSE, TRUE);
-+   NVC3C0_QMDV02_02_VAL(qmd, HW_ONLY_SKED_NEXT_QMD_POINTER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_G, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_H, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_I, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_J, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_K, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_L, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_M, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, QMD_SPARE_N, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, DEBUG_ID_UPPER, "0x%x");
-+   NVC3C0_QMDV02_02_VAL(qmd, DEBUG_ID_LOWER, "0x%x");
-+}
-diff --git a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
-index 5c43518afcb..d123c8a1c17 100644
---- a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
-+++ b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
-@@ -104,6 +104,8 @@ nouveau_drm_screen_create(int fd)
- 	case 0x110:
- 	case 0x120:
- 	case 0x130:
-+	case 0x140:
-+	case 0x160:
- 		init = nvc0_screen_create;
- 		break;
- 	default:
diff --git a/SPECS/mesa.spec b/SPECS/mesa.spec
index 177ac45..4c1ccd7 100644
--- a/SPECS/mesa.spec
+++ b/SPECS/mesa.spec
@@ -9,16 +9,16 @@
 %endif
 
 %ifarch %{ix86} x86_64
-%define platform_drivers ,i965
+%define platform_drivers i965
 %define with_vmware 1
 %define with_xa     1
 %define with_iris   1
 %endif
 
 %ifarch %{ix86} x86_64
-%define with_vulkan 1
+%define with_vulkan_hw 1
 %else
-%define with_vulkan 0
+%define with_vulkan_hw 0
 %endif
 
 %ifarch %{arm} aarch64
@@ -31,18 +31,20 @@
 
 %global dri_drivers %{?platform_drivers}
 
-%if 0%{?with_vulkan}
-%define vulkan_drivers intel,amd
+%if 0%{?with_vulkan_hw}
+%define vulkan_drivers swrast,intel,amd
+%else
+%define vulkan_drivers swrast
 %endif
 
 %global sanitize 0
 
-#global rctag rc4
+#global rctag rc2
 
 Name:           mesa
 Summary:        Mesa graphics libraries
-Version:        20.1.4
-Release:        1%{?rctag:.%{rctag}}%{?dist}
+Version:        20.3.3
+Release:        2%{?rctag:.%{rctag}}%{?dist}
 
 License:        MIT
 URL:            http://www.mesa3d.org
@@ -56,9 +58,11 @@ Source3:        Makefile
 # Fedora opts to ignore the optional part of clause 2 and treat that code as 2 clause BSD.
 Source4:        Mesa-MLAA-License-Clarification-Email.txt
 
-# Add support for TU11x nvidia
-Patch10: 0001-nir-use-bitfield_insert-instead-of-bfi-in-nir_lower_.patch
-Patch11: nouveau-tu1xx-support.patch
+Patch0:	lavapipe-disable-env-var.patch
+Patch1: mesa-20.3.3-stable-fixes.patch
+Patch2: anv-remove-warning.patch
+
+Patch10: cpu-affinity-fixes-20.3.3.patch
 
 BuildRequires:  gcc
 BuildRequires:  gcc-c++
@@ -67,7 +71,7 @@ BuildRequires:  meson >= 0.45
 %if %{with_hardware}
 BuildRequires:  kernel-headers
 %endif
-BuildRequires:  libdrm-devel >= 2.4.42
+BuildRequires:  libdrm-devel >= 2.4.103
 BuildRequires:  libXxf86vm-devel
 BuildRequires:  expat-devel
 BuildRequires:  xorg-x11-proto-devel
@@ -166,6 +170,7 @@ Provides:       libEGL-devel%{?_isa}
 %package dri-drivers
 Summary:        Mesa-based DRI drivers
 Requires:       %{name}-filesystem%{?_isa} = %{?epoch:%{epoch}:}%{version}-%{release}
+Requires:	libdrm >= 2.4.103
 
 %description dri-drivers
 %{summary}.
@@ -282,7 +287,6 @@ Requires:       %{name}-libd3d%{?_isa} = %{?epoch:%{epoch}:}%{version}-%{release
 %{summary}.
 %endif
 
-%if 0%{?with_vulkan}
 %package vulkan-drivers
 Summary:        Mesa Vulkan drivers
 Requires:       vulkan%{_isa}
@@ -290,6 +294,7 @@ Requires:       vulkan%{_isa}
 %description vulkan-drivers
 The drivers with support for the Vulkan API.
 
+%if 0%{?with_vulkan_hw}
 %package vulkan-devel
 Summary:        Mesa Vulkan development files
 Requires:       %{name}-vulkan-drivers%{?_isa} = %{?epoch:%{epoch}:}%{version}-%{release}
@@ -323,7 +328,7 @@ pathfix.py -i %{__python3} -pn bin/*.py src/egl/generate/*.py \
 export ASFLAGS="--generate-missing-build-notes=yes"
 %meson -Dcpp_std=gnu++14 \
   -Db_ndebug=true \
-  -Dplatforms=x11,wayland,drm,surfaceless \
+  -Dplatforms=x11,wayland \
   -Ddri3=true \
   -Ddri-drivers=%{?dri_drivers} \
 %if 0%{?with_hardware}
@@ -527,8 +532,8 @@ done
 %endif
 %endif
 
-%if 0%{?with_vulkan}
 %files vulkan-drivers
+%if 0%{?with_vulkan_hw}
 %{_libdir}/libvulkan_intel.so
 %{_libdir}/libvulkan_radeon.so
 %ifarch x86_64
@@ -538,14 +543,40 @@ done
 %{_datadir}/vulkan/icd.d/intel_icd.i686.json
 %{_datadir}/vulkan/icd.d/radeon_icd.i686.json
 %endif
+%endif
+%{_libdir}/libvulkan_lvp.so
+%{_datadir}/vulkan/icd.d/lvp_icd.*.json
 %{_libdir}/libVkLayer_MESA_device_select.so
 %{_datadir}/vulkan/implicit_layer.d/VkLayer_MESA_device_select.json
 
+%if 0%{?with_vulkan_hw}
 %files vulkan-devel
 %{_includedir}/vulkan/
 %endif
 
 %changelog
+* Fri Mar 26 2021 Dave Airlie <airlied@redhat.com> - 20.3.3-2
+- Fix CPU affinity memory corruption crash (#1938788)
+
+* Tue Feb 16 2021 Dave Airlie <airlied@redhat.com> - 20.3.3-1
+- Update to 20.3.3 + upstream fixes for qemu regression
+
+* Mon Jan 11 2021 Dave Airlie <airlied@redhat.com> - 20.3.2-1
+- Update to 20.3.2 for upstream fixes
+
+* Mon Dec 21 2020 Dave Airlie <airlied@redhat.com> - 20.3.1-1
+- Update to 20.3.1 for radeon fix
+
+* Mon Dec 07 2020 Dave Airlie <airlied@redhat.com> - 20.3.0-2
+- Fix regression with radeon si/cik cards
+
+* Fri Dec 04 2020 Dave Airlie <airlied@redhat.com> - 20.3.0-1
+- Update to 20.3.0 release
+
+* Thu Nov 19 2020 Dave Airlie <airlied@redhat.com> - 20.3.0-0.1.rc2
+- Update 20.3.0-rc2
+- enable lavapipe behind env var so it can be used for testing
+
 * Wed Aug 05 2020 Dave Airlie <airlied@redhat.com> - 20.1.4-1
 - Update to 20.1.4
 - Update nouveau tu1xx support patch (Karol)