From 003cedd10f98b0ff2abe2773f7a08da10d60f22f Mon Sep 17 00:00:00 2001 From: CentOS Sources Date: Nov 19 2015 15:45:07 +0000 Subject: import mesa-private-llvm-3.6.2-2.el7 --- diff --git a/.gitignore b/.gitignore index 5af8eef..7c6c75c 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -SOURCES/llvm-3.5.0.src.tar.xz +SOURCES/llvm-3.6.2.src.tar.xz diff --git a/.mesa-private-llvm.metadata b/.mesa-private-llvm.metadata index 5cfad28..fb549fa 100644 --- a/.mesa-private-llvm.metadata +++ b/.mesa-private-llvm.metadata @@ -1 +1 @@ -58d817ac2ff573386941e7735d30702fe71267d5 SOURCES/llvm-3.5.0.src.tar.xz +7a00257eb2bc9431e4c77c3a36b033072c54bc7e SOURCES/llvm-3.6.2.src.tar.xz diff --git a/SOURCES/0001-AArch64-Fix-invalid-use-of-references-to-BuildMI.patch b/SOURCES/0001-AArch64-Fix-invalid-use-of-references-to-BuildMI.patch new file mode 100644 index 0000000..e4ea42c --- /dev/null +++ b/SOURCES/0001-AArch64-Fix-invalid-use-of-references-to-BuildMI.patch @@ -0,0 +1,48 @@ +From 5717e28019e7348a04f63dcf965121171da15c62 Mon Sep 17 00:00:00 2001 +From: James Molloy +Date: Thu, 16 Apr 2015 11:37:40 +0000 +Subject: [PATCH] [AArch64] Fix invalid use of references to BuildMI. + +This was found in GCC PR65773 (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65773). + +We shouldn't be taking a reference to the temporary that BuildMI returns, we must copy it. + +git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@235088 91177308-0d34-0410-b5e6-96231b3b80d8 +--- + lib/Target/AArch64/AArch64InstrInfo.cpp | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp +index 8e0af2d..db231c4 100644 +--- a/lib/Target/AArch64/AArch64InstrInfo.cpp ++++ b/lib/Target/AArch64/AArch64InstrInfo.cpp +@@ -1526,7 +1526,7 @@ void AArch64InstrInfo::copyPhysRegTuple( + } + + for (; SubReg != End; SubReg += Incr) { +- const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode)); ++ const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); + AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); +@@ -1904,7 +1904,7 @@ void AArch64InstrInfo::storeRegToStackSlot( + } + assert(Opc && "Unknown register class"); + +- const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) ++ const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI); + +@@ -2002,7 +2002,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( + } + assert(Opc && "Unknown register class"); + +- const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) ++ const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(DestReg, getDefRegState(true)) + .addFrameIndex(FI); + if (Offset) +-- +2.4.3 + diff --git a/SOURCES/llvm-3.5.0-build-fix.patch b/SOURCES/llvm-3.5.0-build-fix.patch deleted file mode 100644 index da1eaec..0000000 --- a/SOURCES/llvm-3.5.0-build-fix.patch +++ /dev/null @@ -1,43 +0,0 @@ -Error.cpp:28:44: error: declaration of ‘virtual const char* {anonymous}::_object_error_category::name() const’ has a different exception specifier - const char *_object_error_category::name() const { - ^ -Error.cpp:23:15: error: from previous declaration ‘virtual const char* {anonymous}::_object_error_category::name() const noexcept (true)’ - const char* name() const LLVM_NOEXCEPT override; - ^ - -diff -up llvm.src/lib/Object/Error.cpp.jx 
llvm.src/lib/Object/Error.cpp ---- llvm.src/lib/Object/Error.cpp.jx 2014-06-13 11:36:17.000000000 -0400 -+++ llvm.src/lib/Object/Error.cpp 2014-08-11 13:11:46.135014527 -0400 -@@ -25,7 +25,7 @@ public: - }; - } - --const char *_object_error_category::name() const { -+const char *_object_error_category::name() const LLVM_NOEXCEPT { - return "llvm.object"; - } - -diff -up llvm.src/tools/llvm-readobj/Error.cpp.jx llvm.src/tools/llvm-readobj/Error.cpp ---- llvm.src/tools/llvm-readobj/Error.cpp.jx 2014-06-13 11:36:17.000000000 -0400 -+++ llvm.src/tools/llvm-readobj/Error.cpp 2014-08-11 13:49:16.624287424 -0400 -@@ -24,7 +24,7 @@ public: - }; - } // namespace - --const char *_readobj_error_category::name() const { -+const char *_readobj_error_category::name() const LLVM_NOEXCEPT { - return "llvm.readobj"; - } - -diff -up llvm.src/tools/obj2yaml/Error.cpp.jx llvm.src/tools/obj2yaml/Error.cpp ---- llvm.src/tools/obj2yaml/Error.cpp.jx 2014-06-13 11:36:17.000000000 -0400 -+++ llvm.src/tools/obj2yaml/Error.cpp 2014-08-11 14:04:05.841996088 -0400 -@@ -20,7 +20,7 @@ public: - }; - } // namespace - --const char *_obj2yaml_error_category::name() const { return "obj2yaml"; } -+const char *_obj2yaml_error_category::name() const LLVM_NOEXCEPT { return "obj2yaml"; } - - std::string _obj2yaml_error_category::message(int ev) const { - switch (static_cast(ev)) { diff --git a/SOURCES/llvm-3.6-large-struct-return.patch b/SOURCES/llvm-3.6-large-struct-return.patch new file mode 100644 index 0000000..d387539 --- /dev/null +++ b/SOURCES/llvm-3.6-large-struct-return.patch @@ -0,0 +1,368 @@ +------------------------------------------------------------------------ +r244889 | uweigand | 2015-08-13 15:37:06 +0200 (Thu, 13 Aug 2015) | 22 lines + +[SystemZ] Support large LLVM IR struct return values + +Recent mesa/llvmpipe crashes on SystemZ due to a failed assertion when +attempting to compile a routine with a return type of + { <4 x float>, <4 x float>, <4 x float>, <4 x float> } +on a system without vector instruction support. + +This is because after legalizing the vector type, we get a return value +consisting of 16 floats, which cannot all be returned in registers. + +Usually, what should happen in this case is that the target's CanLowerReturn +routine rejects the return type, in which case SelectionDAG falls back to +implementing a structure return in memory via implicit reference. + +However, the SystemZ target never actually implemented any CanLowerReturn +routine, and thus would accept any struct return type. + +This patch fixes the crash by implementing CanLowerReturn. As a side effect, +this also handles fp128 return values, fixing a todo that was noted in +SystemZCallingConv.td. + +Index: llvm-36/lib/Target/SystemZ/SystemZCallingConv.td +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZCallingConv.td ++++ llvm-36/lib/Target/SystemZ/SystemZCallingConv.td +@@ -53,10 +53,6 @@ def RetCC_SystemZ : CallingConv<[ + CCIfSubtarget<"hasVector()", + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>> +- +- // ABI-compliant code returns long double by reference, but that conversion +- // is left to higher-level code. Perhaps we could add an f128 definition +- // here for code that doesn't care about the ABI? 
+ ]>; + + //===----------------------------------------------------------------------===// +Index: llvm-36/lib/Target/SystemZ/SystemZISelLowering.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZISelLowering.cpp ++++ llvm-36/lib/Target/SystemZ/SystemZISelLowering.cpp +@@ -1169,6 +1169,20 @@ SystemZTargetLowering::LowerCall(CallLow + return Chain; + } + ++bool SystemZTargetLowering:: ++CanLowerReturn(CallingConv::ID CallConv, ++ MachineFunction &MF, bool isVarArg, ++ const SmallVectorImpl &Outs, ++ LLVMContext &Context) const { ++ // Detect unsupported vector return types. ++ if (Subtarget.hasVector()) ++ VerifyVectorTypes(Outs); ++ ++ SmallVector RetLocs; ++ CCState RetCCInfo(CallConv, isVarArg, MF, RetLocs, Context); ++ return RetCCInfo.CheckReturn(Outs, RetCC_SystemZ); ++} ++ + SDValue + SystemZTargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool IsVarArg, +Index: llvm-36/lib/Target/SystemZ/SystemZISelLowering.h +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZISelLowering.h ++++ llvm-36/lib/Target/SystemZ/SystemZISelLowering.h +@@ -401,6 +401,10 @@ public: + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const override; + ++ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, ++ bool isVarArg, ++ const SmallVectorImpl &Outs, ++ LLVMContext &Context) const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, +Index: llvm-36/test/CodeGen/SystemZ/args-04.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/args-04.ll ++++ llvm-36/test/CodeGen/SystemZ/args-04.ll +@@ -124,3 +124,17 @@ define void @f13(fp128 *%r2, i16 %r3, i3 + store fp128 %y, fp128 *%r2 + ret void + } ++ ++; Explicit fp128 return values are likewise passed indirectly. ++define fp128 @f14(fp128 %r3) { ++; CHECK-LABEL: f14: ++; CHECK: ld %f0, 0(%r3) ++; CHECK: ld %f2, 8(%r3) ++; CHECK: axbr %f0, %f0 ++; CHECK: std %f0, 0(%r2) ++; CHECK: std %f2, 8(%r2) ++; CHECK: br %r14 ++ %y = fadd fp128 %r3, %r3 ++ ret fp128 %y ++} ++ +Index: llvm-36/test/CodeGen/SystemZ/args-07.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/args-07.ll +@@ -0,0 +1,60 @@ ++; Test multiple return values (LLVM ABI extension) ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++ ++; Up to four integer return values fit into GPRs. ++define { i64, i64, i64, i64 } @f1() { ++; CHECK-LABEL: f1: ++; CHECK: lghi %r2, 0 ++; CHECK: lghi %r3, 1 ++; CHECK: lghi %r4, 2 ++; CHECK: lghi %r5, 3 ++; CHECK: br %r14 ++ ret { i64, i64, i64, i64 } { i64 0, i64 1, i64 2, i64 3 } ++} ++ ++; More than four integer return values use sret. ++define { i64, i64, i64, i64, i64 } @f2() { ++; CHECK-LABEL: f2: ++; CHECK: mvghi 32(%r2), 4 ++; CHECK: mvghi 24(%r2), 3 ++; CHECK: mvghi 16(%r2), 2 ++; CHECK: mvghi 8(%r2), 1 ++; CHECK: mvghi 0(%r2), 0 ++; CHECK: br %r14 ++ ret { i64, i64, i64, i64, i64 } { i64 0, i64 1, i64 2, i64 3, i64 4 } ++} ++ ++; Up to four floating-point return values fit into FPRs. 
++define { double, double, double, double } @f3() { ++; CHECK-LABEL: f3: ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: ldeb %f0, 0([[TMP]]) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: ldeb %f2, 0([[TMP]]) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: ldeb %f4, 0([[TMP]]) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: ldeb %f6, 0([[TMP]]) ++; CHECK: br %r14 ++ ret { double, double, double, double } ++ { double 1.0, double 2.0, double 3.0, double 4.0 } ++} ++ ++; More than four floating-point return values use sret. ++define { double, double, double, double, double } @f4() { ++; CHECK-LABEL: f4: ++; CHECK: llihh [[TMP:%r[0-5]]], 16404 ++; CHECK: stg [[TMP]], 32(%r2) ++; CHECK: llihh [[TMP:%r[0-5]]], 16400 ++; CHECK: stg [[TMP]], 24(%r2) ++; CHECK: llihh [[TMP:%r[0-5]]], 16392 ++; CHECK: stg [[TMP]], 16(%r2) ++; CHECK: llihh [[TMP:%r[0-5]]], 16384 ++; CHECK: stg [[TMP]], 8(%r2) ++; CHECK: llihh [[TMP:%r[0-5]]], 16368 ++; CHECK: stg [[TMP]], 0(%r2) ++; CHECK: br %r14 ++ ret { double, double, double, double, double } ++ { double 1.0, double 2.0, double 3.0, double 4.0, double 5.0 } ++} +Index: llvm-36/test/CodeGen/SystemZ/args-08.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/args-08.ll +@@ -0,0 +1,57 @@ ++; Test calling functions with multiple return values (LLVM ABI extension) ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++ ++; Up to four integer return values fit into GPRs. ++declare { i64, i64, i64, i64 } @bar1() ++ ++define i64 @f1() { ++; CHECK-LABEL: f1: ++; CHECK: brasl %r14, bar1 ++; CHECK: lgr %r2, %r5 ++; CHECK: br %r14 ++ %mret = call { i64, i64, i64, i64 } @bar1() ++ %ret = extractvalue { i64, i64, i64, i64 } %mret, 3 ++ ret i64 %ret ++} ++ ++; More than four integer return values use sret. ++declare { i64, i64, i64, i64, i64 } @bar2() ++ ++define i64 @f2() { ++; CHECK-LABEL: f2: ++; CHECK: la %r2, 160(%r15) ++; CHECK: brasl %r14, bar2 ++; CHECK: lg %r2, 192(%r15) ++; CHECK: br %r14 ++ %mret = call { i64, i64, i64, i64, i64 } @bar2() ++ %ret = extractvalue { i64, i64, i64, i64, i64 } %mret, 4 ++ ret i64 %ret ++} ++ ++; Up to four floating-point return values fit into GPRs. ++declare { double, double, double, double } @bar3() ++ ++define double @f3() { ++; CHECK-LABEL: f3: ++; CHECK: brasl %r14, bar3 ++; CHECK: ldr %f0, %f6 ++; CHECK: br %r14 ++ %mret = call { double, double, double, double } @bar3() ++ %ret = extractvalue { double, double, double, double } %mret, 3 ++ ret double %ret ++} ++ ++; More than four integer return values use sret. ++declare { double, double, double, double, double } @bar4() ++ ++define double @f4() { ++; CHECK-LABEL: f4: ++; CHECK: la %r2, 160(%r15) ++; CHECK: brasl %r14, bar4 ++; CHECK: ld %f0, 192(%r15) ++; CHECK: br %r14 ++ %mret = call { double, double, double, double, double } @bar4() ++ %ret = extractvalue { double, double, double, double, double } %mret, 4 ++ ret double %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-args-06.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-args-06.ll +@@ -0,0 +1,83 @@ ++; Test multiple return values (LLVM ABI extension) ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Up to eight vector return values fit into VRs. 
++define { <2 x double>, <2 x double>, <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, <2 x double>, <2 x double> } @f1() { ++; CHECK-LABEL: f1: ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl %v24, 0([[TMP]]) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl %v26, 0([[TMP]]) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl %v28, 0([[TMP]]) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl %v30, 0([[TMP]]) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl %v25, 0([[TMP]]) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl %v27, 0([[TMP]]) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl %v29, 0([[TMP]]) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl %v31, 0([[TMP]]) ++; CHECK: br %r14 ++ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, <2 x double>, <2 x double> } ++ { <2 x double> , ++ <2 x double> , ++ <2 x double> , ++ <2 x double> , ++ <2 x double> , ++ <2 x double> , ++ <2 x double> , ++ <2 x double> } ++} ++ ++; More than eight vector return values use sret. ++define { <2 x double>, <2 x double>, <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, <2 x double>, <2 x double>, ++ <2 x double> } @f2() { ++; CHECK-LABEL: f2: ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) ++; CHECK: vst [[VTMP]], 128(%r2) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) ++; CHECK: vst [[VTMP]], 112(%r2) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) ++; CHECK: vst [[VTMP]], 96(%r2) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) ++; CHECK: vst [[VTMP]], 80(%r2) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) ++; CHECK: vst [[VTMP]], 64(%r2) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) ++; CHECK: vst [[VTMP]], 48(%r2) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) ++; CHECK: vst [[VTMP]], 32(%r2) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) ++; CHECK: vst [[VTMP]], 16(%r2) ++; CHECK: larl [[TMP:%r[0-5]]], .LCPI ++; CHECK: vl [[VTMP:%v[0-9]+]], 0([[TMP]]) ++; CHECK: vst [[VTMP]], 0(%r2) ++; CHECK: br %r14 ++ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, <2 x double>, <2 x double>, ++ <2 x double> } ++ { <2 x double> , ++ <2 x double> , ++ <2 x double> , ++ <2 x double> , ++ <2 x double> , ++ <2 x double> , ++ <2 x double> , ++ <2 x double> , ++ <2 x double> } ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-args-07.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-args-07.ll +@@ -0,0 +1,47 @@ ++; Test calling functions with multiple return values (LLVM ABI extension) ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Up to eight vector return values fit into VRs. 
++declare { <2 x double>, <2 x double>, <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, <2 x double>, <2 x double> } @bar1() ++ ++define <2 x double> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: brasl %r14, bar1 ++; CHECK: vlr %v24, %v31 ++; CHECK: br %r14 ++ %mret = call { <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, ++ <2 x double>, <2 x double> } @bar1() ++ %ret = extractvalue { <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, ++ <2 x double>, <2 x double> } %mret, 7 ++ ret <2 x double> %ret ++} ++ ++; More than eight vector return values use sret. ++declare { <2 x double>, <2 x double>, <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, <2 x double>, <2 x double>, ++ <2 x double> } @bar2() ++ ++define <2 x double> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: la %r2, 160(%r15) ++; CHECK: brasl %r14, bar2 ++; CHECK: vl %v24, 288(%r15) ++; CHECK: br %r14 ++ %mret = call { <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, ++ <2 x double> } @bar2() ++ %ret = extractvalue { <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, ++ <2 x double>, <2 x double>, ++ <2 x double> } %mret, 8 ++ ret <2 x double> %ret ++} diff --git a/SOURCES/llvm-3.6.2-nerf-skylake.patch b/SOURCES/llvm-3.6.2-nerf-skylake.patch new file mode 100644 index 0000000..5f8c3c4 --- /dev/null +++ b/SOURCES/llvm-3.6.2-nerf-skylake.patch @@ -0,0 +1,28 @@ +Skylake Pentium has the charming property of not supporting AVX, and +getHostCPUName will return 'x86-64' since it doesn't know about skl at +all in 3.6.x. This confuses llvmpipe quite badly, as we'll emit SSE4.1 +intrinsics but llvm will think they're not valid, and we'll cough and +die with a "Cannot select" message. + +Fix this by treating Skylake (and Broadwell, which also isn't present +in 3.6) as if they were Haswell. This isn't quite what upstream does, +but upstream has changed this API a bit and introduced a getHostCPUFeatures +to complement it, and while it looks like a much better approach it's +quite a bit more invasive. + +diff -up llvm-3.6.2.src/lib/Support/Host.cpp.jx llvm-3.6.2.src/lib/Support/Host.cpp +--- llvm-3.6.2.src/lib/Support/Host.cpp.jx 2015-10-01 12:08:39.000000000 -0400 ++++ llvm-3.6.2.src/lib/Support/Host.cpp 2015-10-13 10:51:03.736425351 -0400 +@@ -362,6 +362,12 @@ StringRef sys::getHostCPUName() { + case 63: + case 69: + case 70: ++ // Broadwell: ++ case 61: ++ case 71: ++ // Skylake: ++ case 78: ++ case 94: + // Not all Haswell processors support AVX too (such as the Pentium + // versions instead of the i7 versions). + return HasAVX2 ? "core-avx2" : "corei7"; diff --git a/SOURCES/llvm-z13-backports.patch b/SOURCES/llvm-z13-backports.patch new file mode 100644 index 0000000..c6aebb4 --- /dev/null +++ b/SOURCES/llvm-z13-backports.patch @@ -0,0 +1,39981 @@ +This patch backports z13 support and a number of other SystemZ +enhancements to the LLVM 3.6 release branch. 
+ +The patch consists of backports of the following mainline revisions: +229652, 229654, 229658, 233540, 233541, 233688, 233689, 233690, 233700, +233736, 233803, 236430, 236432, 236433, 236520, 236521, 236522, 236523, +236524, 236525, 236526, 236527, 236528, 236529, 236530 + +Index: llvm-36/include/llvm/IR/Intrinsics.td +=================================================================== +--- llvm-36.orig/include/llvm/IR/Intrinsics.td ++++ llvm-36/include/llvm/IR/Intrinsics.td +@@ -594,3 +594,4 @@ include "llvm/IR/IntrinsicsHexagon.td" + include "llvm/IR/IntrinsicsNVVM.td" + include "llvm/IR/IntrinsicsMips.td" + include "llvm/IR/IntrinsicsR600.td" ++include "llvm/IR/IntrinsicsSystemZ.td" +Index: llvm-36/include/llvm/IR/IntrinsicsSystemZ.td +=================================================================== +--- /dev/null ++++ llvm-36/include/llvm/IR/IntrinsicsSystemZ.td +@@ -0,0 +1,378 @@ ++//===- IntrinsicsSystemZ.td - Defines SystemZ intrinsics ---*- tablegen -*-===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file defines all of the SystemZ-specific intrinsics. ++// ++//===----------------------------------------------------------------------===// ++ ++class SystemZUnaryConv ++ : GCCBuiltin<"__builtin_s390_" ## name>, ++ Intrinsic<[result], [arg], [IntrNoMem]>; ++ ++class SystemZUnary ++ : SystemZUnaryConv; ++ ++class SystemZUnaryConvCC ++ : Intrinsic<[result, llvm_i32_ty], [arg], [IntrNoMem]>; ++ ++class SystemZUnaryCC ++ : SystemZUnaryConvCC; ++ ++class SystemZBinaryConv ++ : GCCBuiltin<"__builtin_s390_" ## name>, ++ Intrinsic<[result], [arg, arg], [IntrNoMem]>; ++ ++class SystemZBinary ++ : SystemZBinaryConv; ++ ++class SystemZBinaryInt ++ : GCCBuiltin<"__builtin_s390_" ## name>, ++ Intrinsic<[type], [type, llvm_i32_ty], [IntrNoMem]>; ++ ++class SystemZBinaryConvCC ++ : Intrinsic<[result, llvm_i32_ty], [arg, arg], [IntrNoMem]>; ++ ++class SystemZBinaryConvIntCC ++ : Intrinsic<[result, llvm_i32_ty], [arg, llvm_i32_ty], [IntrNoMem]>; ++ ++class SystemZBinaryCC ++ : SystemZBinaryConvCC; ++ ++class SystemZTernaryConv ++ : GCCBuiltin<"__builtin_s390_" ## name>, ++ Intrinsic<[result], [arg, arg, result], [IntrNoMem]>; ++ ++class SystemZTernary ++ : SystemZTernaryConv; ++ ++class SystemZTernaryInt ++ : GCCBuiltin<"__builtin_s390_" ## name>, ++ Intrinsic<[type], [type, type, llvm_i32_ty], [IntrNoMem]>; ++ ++class SystemZTernaryIntCC ++ : Intrinsic<[type, llvm_i32_ty], [type, type, llvm_i32_ty], [IntrNoMem]>; ++ ++class SystemZQuaternaryInt ++ : GCCBuiltin<"__builtin_s390_" ## name>, ++ Intrinsic<[type], [type, type, type, llvm_i32_ty], [IntrNoMem]>; ++ ++class SystemZQuaternaryIntCC ++ : Intrinsic<[type, llvm_i32_ty], [type, type, type, llvm_i32_ty], ++ [IntrNoMem]>; ++ ++multiclass SystemZUnaryExtBHF { ++ def b : SystemZUnaryConv; ++ def h : SystemZUnaryConv; ++ def f : SystemZUnaryConv; ++} ++ ++multiclass SystemZUnaryExtBHWF { ++ def b : SystemZUnaryConv; ++ def hw : SystemZUnaryConv; ++ def f : SystemZUnaryConv; ++} ++ ++multiclass SystemZUnaryBHF { ++ def b : SystemZUnary; ++ def h : SystemZUnary; ++ def f : SystemZUnary; ++} ++ ++multiclass SystemZUnaryBHFG : SystemZUnaryBHF { ++ def g : SystemZUnary; ++} ++ ++multiclass SystemZUnaryCCBHF { ++ def bs : SystemZUnaryCC; ++ def hs : SystemZUnaryCC; ++ def fs : SystemZUnaryCC; ++} ++ ++multiclass 
SystemZBinaryTruncHFG { ++ def h : SystemZBinaryConv; ++ def f : SystemZBinaryConv; ++ def g : SystemZBinaryConv; ++} ++ ++multiclass SystemZBinaryTruncCCHFG { ++ def hs : SystemZBinaryConvCC; ++ def fs : SystemZBinaryConvCC; ++ def gs : SystemZBinaryConvCC; ++} ++ ++multiclass SystemZBinaryExtBHF { ++ def b : SystemZBinaryConv; ++ def h : SystemZBinaryConv; ++ def f : SystemZBinaryConv; ++} ++ ++multiclass SystemZBinaryExtBHFG : SystemZBinaryExtBHF { ++ def g : SystemZBinaryConv; ++} ++ ++multiclass SystemZBinaryBHF { ++ def b : SystemZBinary; ++ def h : SystemZBinary; ++ def f : SystemZBinary; ++} ++ ++multiclass SystemZBinaryBHFG : SystemZBinaryBHF { ++ def g : SystemZBinary; ++} ++ ++multiclass SystemZBinaryIntBHFG { ++ def b : SystemZBinaryInt; ++ def h : SystemZBinaryInt; ++ def f : SystemZBinaryInt; ++ def g : SystemZBinaryInt; ++} ++ ++multiclass SystemZBinaryCCBHF { ++ def bs : SystemZBinaryCC; ++ def hs : SystemZBinaryCC; ++ def fs : SystemZBinaryCC; ++} ++ ++multiclass SystemZCompareBHFG { ++ def bs : SystemZBinaryCC; ++ def hs : SystemZBinaryCC; ++ def fs : SystemZBinaryCC; ++ def gs : SystemZBinaryCC; ++} ++ ++multiclass SystemZTernaryExtBHF { ++ def b : SystemZTernaryConv; ++ def h : SystemZTernaryConv; ++ def f : SystemZTernaryConv; ++} ++ ++multiclass SystemZTernaryExtBHFG : SystemZTernaryExtBHF { ++ def g : SystemZTernaryConv; ++} ++ ++multiclass SystemZTernaryBHF { ++ def b : SystemZTernary; ++ def h : SystemZTernary; ++ def f : SystemZTernary; ++} ++ ++multiclass SystemZTernaryIntBHF { ++ def b : SystemZTernaryInt; ++ def h : SystemZTernaryInt; ++ def f : SystemZTernaryInt; ++} ++ ++multiclass SystemZTernaryIntCCBHF { ++ def bs : SystemZTernaryIntCC; ++ def hs : SystemZTernaryIntCC; ++ def fs : SystemZTernaryIntCC; ++} ++ ++multiclass SystemZQuaternaryIntBHF { ++ def b : SystemZQuaternaryInt; ++ def h : SystemZQuaternaryInt; ++ def f : SystemZQuaternaryInt; ++} ++ ++multiclass SystemZQuaternaryIntBHFG : SystemZQuaternaryIntBHF { ++ def g : SystemZQuaternaryInt; ++} ++ ++multiclass SystemZQuaternaryIntCCBHF { ++ def bs : SystemZQuaternaryIntCC; ++ def hs : SystemZQuaternaryIntCC; ++ def fs : SystemZQuaternaryIntCC; ++} ++ ++//===----------------------------------------------------------------------===// ++// ++// Transactional-execution intrinsics ++// ++//===----------------------------------------------------------------------===// ++ ++def llvm_ptr64_ty : LLVMPointerType; ++ ++let TargetPrefix = "s390" in { ++ def int_s390_tbegin : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrNoDuplicate]>; ++ ++ def int_s390_tbegin_nofloat : Intrinsic<[llvm_i32_ty], ++ [llvm_ptr_ty, llvm_i32_ty], ++ [IntrNoDuplicate]>; ++ ++ def int_s390_tbeginc : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrNoDuplicate]>; ++ ++ def int_s390_tabort : Intrinsic<[], [llvm_i64_ty], ++ [IntrNoReturn, Throws]>; ++ ++ def int_s390_tend : GCCBuiltin<"__builtin_tend">, ++ Intrinsic<[llvm_i32_ty], []>; ++ ++ def int_s390_etnd : GCCBuiltin<"__builtin_tx_nesting_depth">, ++ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; ++ ++ def int_s390_ntstg : Intrinsic<[], [llvm_i64_ty, llvm_ptr64_ty], ++ [IntrReadWriteArgMem]>; ++ ++ def int_s390_ppa_txassist : GCCBuiltin<"__builtin_tx_assist">, ++ Intrinsic<[], [llvm_i32_ty]>; ++} ++ ++//===----------------------------------------------------------------------===// ++// ++// Vector intrinsics ++// ++//===----------------------------------------------------------------------===// ++ ++let TargetPrefix = "s390" in { ++ def int_s390_lcbb : 
GCCBuiltin<"__builtin_s390_lcbb">, ++ Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrNoMem]>; ++ ++ def int_s390_vlbb : GCCBuiltin<"__builtin_s390_vlbb">, ++ Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrReadArgMem]>; ++ ++ def int_s390_vll : GCCBuiltin<"__builtin_s390_vll">, ++ Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty, llvm_ptr_ty], ++ [IntrReadArgMem]>; ++ ++ def int_s390_vpdi : GCCBuiltin<"__builtin_s390_vpdi">, ++ Intrinsic<[llvm_v2i64_ty], ++ [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], ++ [IntrNoMem]>; ++ ++ def int_s390_vperm : GCCBuiltin<"__builtin_s390_vperm">, ++ Intrinsic<[llvm_v16i8_ty], ++ [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], ++ [IntrNoMem]>; ++ ++ defm int_s390_vpks : SystemZBinaryTruncHFG<"vpks">; ++ defm int_s390_vpks : SystemZBinaryTruncCCHFG; ++ ++ defm int_s390_vpkls : SystemZBinaryTruncHFG<"vpkls">; ++ defm int_s390_vpkls : SystemZBinaryTruncCCHFG; ++ ++ def int_s390_vstl : GCCBuiltin<"__builtin_s390_vstl">, ++ Intrinsic<[], [llvm_v16i8_ty, llvm_i32_ty, llvm_ptr_ty], ++ // In fact write-only but there's no property ++ // for that. ++ [IntrReadWriteArgMem]>; ++ ++ defm int_s390_vupl : SystemZUnaryExtBHWF<"vupl">; ++ defm int_s390_vupll : SystemZUnaryExtBHF<"vupll">; ++ ++ defm int_s390_vuph : SystemZUnaryExtBHF<"vuph">; ++ defm int_s390_vuplh : SystemZUnaryExtBHF<"vuplh">; ++ ++ defm int_s390_vacc : SystemZBinaryBHFG<"vacc">; ++ ++ def int_s390_vaq : SystemZBinary<"vaq", llvm_v16i8_ty>; ++ def int_s390_vacq : SystemZTernary<"vacq", llvm_v16i8_ty>; ++ def int_s390_vaccq : SystemZBinary<"vaccq", llvm_v16i8_ty>; ++ def int_s390_vacccq : SystemZTernary<"vacccq", llvm_v16i8_ty>; ++ ++ defm int_s390_vavg : SystemZBinaryBHFG<"vavg">; ++ defm int_s390_vavgl : SystemZBinaryBHFG<"vavgl">; ++ ++ def int_s390_vcksm : SystemZBinary<"vcksm", llvm_v4i32_ty>; ++ ++ defm int_s390_vgfm : SystemZBinaryExtBHFG<"vgfm">; ++ defm int_s390_vgfma : SystemZTernaryExtBHFG<"vgfma">; ++ ++ defm int_s390_vmah : SystemZTernaryBHF<"vmah">; ++ defm int_s390_vmalh : SystemZTernaryBHF<"vmalh">; ++ defm int_s390_vmae : SystemZTernaryExtBHF<"vmae">; ++ defm int_s390_vmale : SystemZTernaryExtBHF<"vmale">; ++ defm int_s390_vmao : SystemZTernaryExtBHF<"vmao">; ++ defm int_s390_vmalo : SystemZTernaryExtBHF<"vmalo">; ++ ++ defm int_s390_vmh : SystemZBinaryBHF<"vmh">; ++ defm int_s390_vmlh : SystemZBinaryBHF<"vmlh">; ++ defm int_s390_vme : SystemZBinaryExtBHF<"vme">; ++ defm int_s390_vmle : SystemZBinaryExtBHF<"vmle">; ++ defm int_s390_vmo : SystemZBinaryExtBHF<"vmo">; ++ defm int_s390_vmlo : SystemZBinaryExtBHF<"vmlo">; ++ ++ defm int_s390_verllv : SystemZBinaryBHFG<"verllv">; ++ defm int_s390_verll : SystemZBinaryIntBHFG<"verll">; ++ defm int_s390_verim : SystemZQuaternaryIntBHFG<"verim">; ++ ++ def int_s390_vsl : SystemZBinary<"vsl", llvm_v16i8_ty>; ++ def int_s390_vslb : SystemZBinary<"vslb", llvm_v16i8_ty>; ++ def int_s390_vsra : SystemZBinary<"vsra", llvm_v16i8_ty>; ++ def int_s390_vsrab : SystemZBinary<"vsrab", llvm_v16i8_ty>; ++ def int_s390_vsrl : SystemZBinary<"vsrl", llvm_v16i8_ty>; ++ def int_s390_vsrlb : SystemZBinary<"vsrlb", llvm_v16i8_ty>; ++ ++ def int_s390_vsldb : GCCBuiltin<"__builtin_s390_vsldb">, ++ Intrinsic<[llvm_v16i8_ty], ++ [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], ++ [IntrNoMem]>; ++ ++ defm int_s390_vscbi : SystemZBinaryBHFG<"vscbi">; ++ ++ def int_s390_vsq : SystemZBinary<"vsq", llvm_v16i8_ty>; ++ def int_s390_vsbiq : SystemZTernary<"vsbiq", llvm_v16i8_ty>; ++ def int_s390_vscbiq : SystemZBinary<"vscbiq", llvm_v16i8_ty>; ++ def 
int_s390_vsbcbiq : SystemZTernary<"vsbcbiq", llvm_v16i8_ty>; ++ ++ def int_s390_vsumb : SystemZBinaryConv<"vsumb", llvm_v4i32_ty, llvm_v16i8_ty>; ++ def int_s390_vsumh : SystemZBinaryConv<"vsumh", llvm_v4i32_ty, llvm_v8i16_ty>; ++ ++ def int_s390_vsumgh : SystemZBinaryConv<"vsumgh", llvm_v2i64_ty, ++ llvm_v8i16_ty>; ++ def int_s390_vsumgf : SystemZBinaryConv<"vsumgf", llvm_v2i64_ty, ++ llvm_v4i32_ty>; ++ ++ def int_s390_vsumqf : SystemZBinaryConv<"vsumqf", llvm_v16i8_ty, ++ llvm_v4i32_ty>; ++ def int_s390_vsumqg : SystemZBinaryConv<"vsumqg", llvm_v16i8_ty, ++ llvm_v2i64_ty>; ++ ++ def int_s390_vtm : SystemZBinaryConv<"vtm", llvm_i32_ty, llvm_v16i8_ty>; ++ ++ defm int_s390_vceq : SystemZCompareBHFG<"vceq">; ++ defm int_s390_vch : SystemZCompareBHFG<"vch">; ++ defm int_s390_vchl : SystemZCompareBHFG<"vchl">; ++ ++ defm int_s390_vfae : SystemZTernaryIntBHF<"vfae">; ++ defm int_s390_vfae : SystemZTernaryIntCCBHF; ++ defm int_s390_vfaez : SystemZTernaryIntBHF<"vfaez">; ++ defm int_s390_vfaez : SystemZTernaryIntCCBHF; ++ ++ defm int_s390_vfee : SystemZBinaryBHF<"vfee">; ++ defm int_s390_vfee : SystemZBinaryCCBHF; ++ defm int_s390_vfeez : SystemZBinaryBHF<"vfeez">; ++ defm int_s390_vfeez : SystemZBinaryCCBHF; ++ ++ defm int_s390_vfene : SystemZBinaryBHF<"vfene">; ++ defm int_s390_vfene : SystemZBinaryCCBHF; ++ defm int_s390_vfenez : SystemZBinaryBHF<"vfenez">; ++ defm int_s390_vfenez : SystemZBinaryCCBHF; ++ ++ defm int_s390_vistr : SystemZUnaryBHF<"vistr">; ++ defm int_s390_vistr : SystemZUnaryCCBHF; ++ ++ defm int_s390_vstrc : SystemZQuaternaryIntBHF<"vstrc">; ++ defm int_s390_vstrc : SystemZQuaternaryIntCCBHF; ++ defm int_s390_vstrcz : SystemZQuaternaryIntBHF<"vstrcz">; ++ defm int_s390_vstrcz : SystemZQuaternaryIntCCBHF; ++ ++ def int_s390_vfcedbs : SystemZBinaryConvCC; ++ def int_s390_vfchdbs : SystemZBinaryConvCC; ++ def int_s390_vfchedbs : SystemZBinaryConvCC; ++ ++ def int_s390_vftcidb : SystemZBinaryConvIntCC; ++ ++ def int_s390_vfidb : Intrinsic<[llvm_v2f64_ty], ++ [llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrNoMem]>; ++} +Index: llvm-36/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +=================================================================== +--- llvm-36.orig/lib/CodeGen/SelectionDAG/DAGCombiner.cpp ++++ llvm-36/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +@@ -10496,18 +10496,13 @@ SDValue DAGCombiner::ReplaceExtractVecto + if (auto *ConstEltNo = dyn_cast(EltNo)) { + int Elt = ConstEltNo->getZExtValue(); + unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8; +- if (TLI.isBigEndian()) +- PtrOff = InVecVT.getSizeInBits() / 8 - PtrOff; + Offset = DAG.getConstant(PtrOff, PtrType); + MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff); + } else { ++ Offset = DAG.getZExtOrTrunc(EltNo, SDLoc(EVE), PtrType); + Offset = DAG.getNode( +- ISD::MUL, SDLoc(EVE), EltNo.getValueType(), EltNo, +- DAG.getConstant(VecEltVT.getStoreSize(), EltNo.getValueType())); +- if (TLI.isBigEndian()) +- Offset = DAG.getNode( +- ISD::SUB, SDLoc(EVE), EltNo.getValueType(), +- DAG.getConstant(InVecVT.getStoreSize(), EltNo.getValueType()), Offset); ++ ISD::MUL, SDLoc(EVE), PtrType, Offset, ++ DAG.getConstant(VecEltVT.getStoreSize(), PtrType)); + MPI = OriginalLoad->getPointerInfo(); + } + NewPtr = DAG.getNode(ISD::ADD, SDLoc(EVE), PtrType, NewPtr, Offset); +Index: llvm-36/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +=================================================================== +--- llvm-36.orig/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp ++++ 
llvm-36/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +@@ -2888,7 +2888,10 @@ static EVT FindMemType(SelectionDAG& DAG + unsigned MemVTWidth = MemVT.getSizeInBits(); + if (MemVT.getSizeInBits() <= WidenEltWidth) + break; +- if (TLI.isTypeLegal(MemVT) && (WidenWidth % MemVTWidth) == 0 && ++ auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT); ++ if ((Action == TargetLowering::TypeLegal || ++ Action == TargetLowering::TypePromoteInteger) && ++ (WidenWidth % MemVTWidth) == 0 && + isPowerOf2_32(WidenWidth / MemVTWidth) && + (MemVTWidth <= Width || + (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) { +Index: llvm-36/lib/Support/Host.cpp +=================================================================== +--- llvm-36.orig/lib/Support/Host.cpp ++++ llvm-36/lib/Support/Host.cpp +@@ -655,6 +655,28 @@ StringRef sys::getHostCPUName() { + StringRef Str(buffer, CPUInfoSize); + SmallVector Lines; + Str.split(Lines, "\n"); ++ ++ // Look for the CPU features. ++ SmallVector CPUFeatures; ++ for (unsigned I = 0, E = Lines.size(); I != E; ++I) ++ if (Lines[I].startswith("features")) { ++ size_t Pos = Lines[I].find(":"); ++ if (Pos != StringRef::npos) { ++ Lines[I].drop_front(Pos + 1).split(CPUFeatures, " "); ++ break; ++ } ++ } ++ ++ // We need to check for the presence of vector support independently of ++ // the machine type, since we may only use the vector register set when ++ // supported by the kernel (and hypervisor). ++ bool HaveVectorSupport = false; ++ for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) { ++ if (CPUFeatures[I] == "vx") ++ HaveVectorSupport = true; ++ } ++ ++ // Now check the processor machine type. + for (unsigned I = 0, E = Lines.size(); I != E; ++I) { + if (Lines[I].startswith("processor ")) { + size_t Pos = Lines[I].find("machine = "); +@@ -662,6 +684,8 @@ StringRef sys::getHostCPUName() { + Pos += sizeof("machine = ") - 1; + unsigned int Id; + if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) { ++ if (Id >= 2964 && HaveVectorSupport) ++ return "z13"; + if (Id >= 2827) + return "zEC12"; + if (Id >= 2817) +Index: llvm-36/lib/Support/Triple.cpp +=================================================================== +--- llvm-36.orig/lib/Support/Triple.cpp ++++ llvm-36/lib/Support/Triple.cpp +@@ -89,7 +89,7 @@ const char *Triple::getArchTypePrefix(Ar + case sparcv9: + case sparc: return "sparc"; + +- case systemz: return "systemz"; ++ case systemz: return "s390"; + + case x86: + case x86_64: return "x86"; +Index: llvm-36/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp ++++ llvm-36/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +@@ -39,13 +39,17 @@ enum RegisterKind { + ADDR64Reg, + FP32Reg, + FP64Reg, +- FP128Reg ++ FP128Reg, ++ VR32Reg, ++ VR64Reg, ++ VR128Reg + }; + + enum MemoryKind { + BDMem, + BDXMem, +- BDLMem ++ BDLMem, ++ BDVMem + }; + + class SystemZOperand : public MCParsedAsmOperand { +@@ -57,6 +61,7 @@ private: + KindReg, + KindAccessReg, + KindImm, ++ KindImmTLS, + KindMem + }; + +@@ -84,23 +89,31 @@ private: + }; + + // Base + Disp + Index, where Base and Index are LLVM registers or 0. +- // RegKind says what type the registers have (ADDR32Reg or ADDR64Reg). +- // Length is the operand length for D(L,B)-style operands, otherwise +- // it is null. ++ // MemKind says what type of memory this is and RegKind says what type ++ // the base register has (ADDR32Reg or ADDR64Reg). 
Length is the operand ++ // length for D(L,B)-style operands, otherwise it is null. + struct MemOp { +- unsigned Base : 8; +- unsigned Index : 8; +- unsigned RegKind : 8; +- unsigned Unused : 8; ++ unsigned Base : 12; ++ unsigned Index : 12; ++ unsigned MemKind : 4; ++ unsigned RegKind : 4; + const MCExpr *Disp; + const MCExpr *Length; + }; + ++ // Imm is an immediate operand, and Sym is an optional TLS symbol ++ // for use with a __tls_get_offset marker relocation. ++ struct ImmTLSOp { ++ const MCExpr *Imm; ++ const MCExpr *Sym; ++ }; ++ + union { + TokenOp Token; + RegOp Reg; + unsigned AccessReg; + const MCExpr *Imm; ++ ImmTLSOp ImmTLS; + MemOp Mem; + }; + +@@ -149,10 +162,11 @@ public: + return Op; + } + static std::unique_ptr +- createMem(RegisterKind RegKind, unsigned Base, const MCExpr *Disp, +- unsigned Index, const MCExpr *Length, SMLoc StartLoc, +- SMLoc EndLoc) { ++ createMem(MemoryKind MemKind, RegisterKind RegKind, unsigned Base, ++ const MCExpr *Disp, unsigned Index, const MCExpr *Length, ++ SMLoc StartLoc, SMLoc EndLoc) { + auto Op = make_unique(KindMem, StartLoc, EndLoc); ++ Op->Mem.MemKind = MemKind; + Op->Mem.RegKind = RegKind; + Op->Mem.Base = Base; + Op->Mem.Index = Index; +@@ -160,6 +174,14 @@ public: + Op->Mem.Length = Length; + return Op; + } ++ static std::unique_ptr ++ createImmTLS(const MCExpr *Imm, const MCExpr *Sym, ++ SMLoc StartLoc, SMLoc EndLoc) { ++ auto Op = make_unique(KindImmTLS, StartLoc, EndLoc); ++ Op->ImmTLS.Imm = Imm; ++ Op->ImmTLS.Sym = Sym; ++ return Op; ++ } + + // Token operands + bool isToken() const override { +@@ -200,24 +222,40 @@ public: + return Imm; + } + ++ // Immediate operands with optional TLS symbol. ++ bool isImmTLS() const { ++ return Kind == KindImmTLS; ++ } ++ + // Memory operands. + bool isMem() const override { + return Kind == KindMem; + } +- bool isMem(RegisterKind RegKind, MemoryKind MemKind) const { ++ bool isMem(MemoryKind MemKind) const { + return (Kind == KindMem && +- Mem.RegKind == RegKind && +- (MemKind == BDXMem || !Mem.Index) && +- (MemKind == BDLMem) == (Mem.Length != nullptr)); ++ (Mem.MemKind == MemKind || ++ // A BDMem can be treated as a BDXMem in which the index ++ // register field is 0. ++ (Mem.MemKind == BDMem && MemKind == BDXMem))); ++ } ++ bool isMem(MemoryKind MemKind, RegisterKind RegKind) const { ++ return isMem(MemKind) && Mem.RegKind == RegKind; + } +- bool isMemDisp12(RegisterKind RegKind, MemoryKind MemKind) const { +- return isMem(RegKind, MemKind) && inRange(Mem.Disp, 0, 0xfff); ++ bool isMemDisp12(MemoryKind MemKind, RegisterKind RegKind) const { ++ return isMem(MemKind, RegKind) && inRange(Mem.Disp, 0, 0xfff); + } +- bool isMemDisp20(RegisterKind RegKind, MemoryKind MemKind) const { +- return isMem(RegKind, MemKind) && inRange(Mem.Disp, -524288, 524287); ++ bool isMemDisp20(MemoryKind MemKind, RegisterKind RegKind) const { ++ return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287); + } + bool isMemDisp12Len8(RegisterKind RegKind) const { +- return isMemDisp12(RegKind, BDLMem) && inRange(Mem.Length, 1, 0x100); ++ return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length, 1, 0x100); ++ } ++ void addBDVAddrOperands(MCInst &Inst, unsigned N) const { ++ assert(N == 3 && "Invalid number of operands"); ++ assert(isMem(BDVMem) && "Invalid operand type"); ++ Inst.addOperand(MCOperand::CreateReg(Mem.Base)); ++ addExpr(Inst, Mem.Disp); ++ Inst.addOperand(MCOperand::CreateReg(Mem.Index)); + } + + // Override MCParsedAsmOperand. 
+@@ -242,24 +280,31 @@ public: + } + void addBDAddrOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands"); +- assert(Kind == KindMem && Mem.Index == 0 && "Invalid operand type"); ++ assert(isMem(BDMem) && "Invalid operand type"); + Inst.addOperand(MCOperand::CreateReg(Mem.Base)); + addExpr(Inst, Mem.Disp); + } + void addBDXAddrOperands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands"); +- assert(Kind == KindMem && "Invalid operand type"); ++ assert(isMem(BDXMem) && "Invalid operand type"); + Inst.addOperand(MCOperand::CreateReg(Mem.Base)); + addExpr(Inst, Mem.Disp); + Inst.addOperand(MCOperand::CreateReg(Mem.Index)); + } + void addBDLAddrOperands(MCInst &Inst, unsigned N) const { + assert(N == 3 && "Invalid number of operands"); +- assert(Kind == KindMem && "Invalid operand type"); ++ assert(isMem(BDLMem) && "Invalid operand type"); + Inst.addOperand(MCOperand::CreateReg(Mem.Base)); + addExpr(Inst, Mem.Disp); + addExpr(Inst, Mem.Length); + } ++ void addImmTLSOperands(MCInst &Inst, unsigned N) const { ++ assert(N == 2 && "Invalid number of operands"); ++ assert(Kind == KindImmTLS && "Invalid operand type"); ++ addExpr(Inst, ImmTLS.Imm); ++ if (ImmTLS.Sym) ++ addExpr(Inst, ImmTLS.Sym); ++ } + + // Used by the TableGen code to check for particular operand types. + bool isGR32() const { return isReg(GR32Reg); } +@@ -273,17 +318,26 @@ public: + bool isFP32() const { return isReg(FP32Reg); } + bool isFP64() const { return isReg(FP64Reg); } + bool isFP128() const { return isReg(FP128Reg); } +- bool isBDAddr32Disp12() const { return isMemDisp12(ADDR32Reg, BDMem); } +- bool isBDAddr32Disp20() const { return isMemDisp20(ADDR32Reg, BDMem); } +- bool isBDAddr64Disp12() const { return isMemDisp12(ADDR64Reg, BDMem); } +- bool isBDAddr64Disp20() const { return isMemDisp20(ADDR64Reg, BDMem); } +- bool isBDXAddr64Disp12() const { return isMemDisp12(ADDR64Reg, BDXMem); } +- bool isBDXAddr64Disp20() const { return isMemDisp20(ADDR64Reg, BDXMem); } ++ bool isVR32() const { return isReg(VR32Reg); } ++ bool isVR64() const { return isReg(VR64Reg); } ++ bool isVF128() const { return false; } ++ bool isVR128() const { return isReg(VR128Reg); } ++ bool isBDAddr32Disp12() const { return isMemDisp12(BDMem, ADDR32Reg); } ++ bool isBDAddr32Disp20() const { return isMemDisp20(BDMem, ADDR32Reg); } ++ bool isBDAddr64Disp12() const { return isMemDisp12(BDMem, ADDR64Reg); } ++ bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, ADDR64Reg); } ++ bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, ADDR64Reg); } ++ bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, ADDR64Reg); } + bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); } ++ bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, ADDR64Reg); } ++ bool isU1Imm() const { return isImm(0, 1); } ++ bool isU2Imm() const { return isImm(0, 3); } ++ bool isU3Imm() const { return isImm(0, 7); } + bool isU4Imm() const { return isImm(0, 15); } + bool isU6Imm() const { return isImm(0, 63); } + bool isU8Imm() const { return isImm(0, 255); } + bool isS8Imm() const { return isImm(-128, 127); } ++ bool isU12Imm() const { return isImm(0, 4095); } + bool isU16Imm() const { return isImm(0, 65535); } + bool isS16Imm() const { return isImm(-32768, 32767); } + bool isU32Imm() const { return isImm(0, (1LL << 32) - 1); } +@@ -300,6 +354,7 @@ private: + enum RegisterGroup { + RegGR, + RegFP, ++ RegV, + RegAccess + }; + struct Register { +@@ -318,12 +373,15 @@ private: + 
RegisterKind Kind); + + bool parseAddress(unsigned &Base, const MCExpr *&Disp, +- unsigned &Index, const MCExpr *&Length, ++ unsigned &Index, bool &IsVector, const MCExpr *&Length, + const unsigned *Regs, RegisterKind RegKind); + + OperandMatchResultTy parseAddress(OperandVector &Operands, +- const unsigned *Regs, RegisterKind RegKind, +- MemoryKind MemKind); ++ MemoryKind MemKind, const unsigned *Regs, ++ RegisterKind RegKind); ++ ++ OperandMatchResultTy parsePCRel(OperandVector &Operands, int64_t MinVal, ++ int64_t MaxVal, bool AllowTLS); + + bool parseOperand(OperandVector &Operands, StringRef Mnemonic); + +@@ -382,26 +440,45 @@ public: + OperandMatchResultTy parseFP128(OperandVector &Operands) { + return parseRegister(Operands, RegFP, SystemZMC::FP128Regs, FP128Reg); + } ++ OperandMatchResultTy parseVR32(OperandVector &Operands) { ++ return parseRegister(Operands, RegV, SystemZMC::VR32Regs, VR32Reg); ++ } ++ OperandMatchResultTy parseVR64(OperandVector &Operands) { ++ return parseRegister(Operands, RegV, SystemZMC::VR64Regs, VR64Reg); ++ } ++ OperandMatchResultTy parseVF128(OperandVector &Operands) { ++ llvm_unreachable("Shouldn't be used as an operand"); ++ } ++ OperandMatchResultTy parseVR128(OperandVector &Operands) { ++ return parseRegister(Operands, RegV, SystemZMC::VR128Regs, VR128Reg); ++ } + OperandMatchResultTy parseBDAddr32(OperandVector &Operands) { +- return parseAddress(Operands, SystemZMC::GR32Regs, ADDR32Reg, BDMem); ++ return parseAddress(Operands, BDMem, SystemZMC::GR32Regs, ADDR32Reg); + } + OperandMatchResultTy parseBDAddr64(OperandVector &Operands) { +- return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDMem); ++ return parseAddress(Operands, BDMem, SystemZMC::GR64Regs, ADDR64Reg); + } + OperandMatchResultTy parseBDXAddr64(OperandVector &Operands) { +- return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDXMem); ++ return parseAddress(Operands, BDXMem, SystemZMC::GR64Regs, ADDR64Reg); + } + OperandMatchResultTy parseBDLAddr64(OperandVector &Operands) { +- return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDLMem); ++ return parseAddress(Operands, BDLMem, SystemZMC::GR64Regs, ADDR64Reg); ++ } ++ OperandMatchResultTy parseBDVAddr64(OperandVector &Operands) { ++ return parseAddress(Operands, BDVMem, SystemZMC::GR64Regs, ADDR64Reg); + } + OperandMatchResultTy parseAccessReg(OperandVector &Operands); +- OperandMatchResultTy parsePCRel(OperandVector &Operands, int64_t MinVal, +- int64_t MaxVal); + OperandMatchResultTy parsePCRel16(OperandVector &Operands) { +- return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1); ++ return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1, false); + } + OperandMatchResultTy parsePCRel32(OperandVector &Operands) { +- return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1); ++ return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, false); ++ } ++ OperandMatchResultTy parsePCRelTLS16(OperandVector &Operands) { ++ return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1, true); ++ } ++ OperandMatchResultTy parsePCRelTLS32(OperandVector &Operands) { ++ return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, true); + } + }; + } // end anonymous namespace +@@ -443,6 +520,8 @@ bool SystemZAsmParser::parseRegister(Reg + Reg.Group = RegGR; + else if (Prefix == 'f' && Reg.Num < 16) + Reg.Group = RegFP; ++ else if (Prefix == 'v' && Reg.Num < 32) ++ Reg.Group = RegV; + else if (Prefix == 'a' && Reg.Num < 16) + Reg.Group = RegAccess; + else +@@ -493,8 +572,8 @@ SystemZAsmParser::parseRegister(OperandV 
+ // Regs maps asm register numbers to LLVM register numbers and RegKind + // says what kind of address register we're using (ADDR32Reg or ADDR64Reg). + bool SystemZAsmParser::parseAddress(unsigned &Base, const MCExpr *&Disp, +- unsigned &Index, const MCExpr *&Length, +- const unsigned *Regs, ++ unsigned &Index, bool &IsVector, ++ const MCExpr *&Length, const unsigned *Regs, + RegisterKind RegKind) { + // Parse the displacement, which must always be present. + if (getParser().parseExpression(Disp)) +@@ -503,6 +582,7 @@ bool SystemZAsmParser::parseAddress(unsi + // Parse the optional base and index. + Index = 0; + Base = 0; ++ IsVector = false; + Length = nullptr; + if (getLexer().is(AsmToken::LParen)) { + Parser.Lex(); +@@ -510,12 +590,23 @@ bool SystemZAsmParser::parseAddress(unsi + if (getLexer().is(AsmToken::Percent)) { + // Parse the first register and decide whether it's a base or an index. + Register Reg; +- if (parseRegister(Reg, RegGR, Regs, RegKind)) ++ if (parseRegister(Reg)) + return true; +- if (getLexer().is(AsmToken::Comma)) +- Index = Reg.Num; +- else +- Base = Reg.Num; ++ if (Reg.Group == RegV) { ++ // A vector index register. The base register is optional. ++ IsVector = true; ++ Index = SystemZMC::VR128Regs[Reg.Num]; ++ } else if (Reg.Group == RegGR) { ++ if (Reg.Num == 0) ++ return Error(Reg.StartLoc, "%r0 used in an address"); ++ // If the are two registers, the first one is the index and the ++ // second is the base. ++ if (getLexer().is(AsmToken::Comma)) ++ Index = Regs[Reg.Num]; ++ else ++ Base = Regs[Reg.Num]; ++ } else ++ return Error(Reg.StartLoc, "invalid address register"); + } else { + // Parse the length. + if (getParser().parseExpression(Length)) +@@ -542,37 +633,46 @@ bool SystemZAsmParser::parseAddress(unsi + // Parse a memory operand and add it to Operands. The other arguments + // are as above. 
+ SystemZAsmParser::OperandMatchResultTy +-SystemZAsmParser::parseAddress(OperandVector &Operands, const unsigned *Regs, +- RegisterKind RegKind, MemoryKind MemKind) { ++SystemZAsmParser::parseAddress(OperandVector &Operands, MemoryKind MemKind, ++ const unsigned *Regs, RegisterKind RegKind) { + SMLoc StartLoc = Parser.getTok().getLoc(); + unsigned Base, Index; ++ bool IsVector; + const MCExpr *Disp; + const MCExpr *Length; +- if (parseAddress(Base, Disp, Index, Length, Regs, RegKind)) ++ if (parseAddress(Base, Disp, Index, IsVector, Length, Regs, RegKind)) + return MatchOperand_ParseFail; + +- if (Index && MemKind != BDXMem) +- { +- Error(StartLoc, "invalid use of indexed addressing"); +- return MatchOperand_ParseFail; +- } ++ if (IsVector && MemKind != BDVMem) { ++ Error(StartLoc, "invalid use of vector addressing"); ++ return MatchOperand_ParseFail; ++ } + +- if (Length && MemKind != BDLMem) +- { +- Error(StartLoc, "invalid use of length addressing"); +- return MatchOperand_ParseFail; +- } ++ if (!IsVector && MemKind == BDVMem) { ++ Error(StartLoc, "vector index required in address"); ++ return MatchOperand_ParseFail; ++ } + +- if (!Length && MemKind == BDLMem) +- { +- Error(StartLoc, "missing length in address"); +- return MatchOperand_ParseFail; +- } ++ if (Index && MemKind != BDXMem && MemKind != BDVMem) { ++ Error(StartLoc, "invalid use of indexed addressing"); ++ return MatchOperand_ParseFail; ++ } ++ ++ if (Length && MemKind != BDLMem) { ++ Error(StartLoc, "invalid use of length addressing"); ++ return MatchOperand_ParseFail; ++ } ++ ++ if (!Length && MemKind == BDLMem) { ++ Error(StartLoc, "missing length in address"); ++ return MatchOperand_ParseFail; ++ } + + SMLoc EndLoc = + SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); +- Operands.push_back(SystemZOperand::createMem(RegKind, Base, Disp, Index, +- Length, StartLoc, EndLoc)); ++ Operands.push_back(SystemZOperand::createMem(MemKind, RegKind, Base, Disp, ++ Index, Length, StartLoc, ++ EndLoc)); + return MatchOperand_Success; + } + +@@ -589,6 +689,8 @@ bool SystemZAsmParser::ParseRegister(uns + RegNo = SystemZMC::GR64Regs[Reg.Num]; + else if (Reg.Group == RegFP) + RegNo = SystemZMC::FP64Regs[Reg.Num]; ++ else if (Reg.Group == RegV) ++ RegNo = SystemZMC::VR128Regs[Reg.Num]; + else + // FIXME: Access registers aren't modelled as LLVM registers yet. + return Error(Reg.StartLoc, "invalid operand for instruction"); +@@ -661,8 +763,10 @@ bool SystemZAsmParser::parseOperand(Oper + // so we treat any plain expression as an immediate. + SMLoc StartLoc = Parser.getTok().getLoc(); + unsigned Base, Index; ++ bool IsVector; + const MCExpr *Expr, *Length; +- if (parseAddress(Base, Expr, Index, Length, SystemZMC::GR64Regs, ADDR64Reg)) ++ if (parseAddress(Base, Expr, Index, IsVector, Length, SystemZMC::GR64Regs, ++ ADDR64Reg)) + return true; + + SMLoc EndLoc = +@@ -743,7 +847,7 @@ SystemZAsmParser::parseAccessReg(Operand + + SystemZAsmParser::OperandMatchResultTy + SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal, +- int64_t MaxVal) { ++ int64_t MaxVal, bool AllowTLS) { + MCContext &Ctx = getContext(); + MCStreamer &Out = getStreamer(); + const MCExpr *Expr; +@@ -766,9 +870,54 @@ SystemZAsmParser::parsePCRel(OperandVect + Expr = Value == 0 ? Base : MCBinaryExpr::CreateAdd(Base, Expr, Ctx); + } + ++ // Optionally match :tls_gdcall: or :tls_ldcall: followed by a TLS symbol. 
++ const MCExpr *Sym = nullptr; ++ if (AllowTLS && getLexer().is(AsmToken::Colon)) { ++ Parser.Lex(); ++ ++ if (Parser.getTok().isNot(AsmToken::Identifier)) { ++ Error(Parser.getTok().getLoc(), "unexpected token"); ++ return MatchOperand_ParseFail; ++ } ++ ++ MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None; ++ StringRef Name = Parser.getTok().getString(); ++ if (Name == "tls_gdcall") ++ Kind = MCSymbolRefExpr::VK_TLSGD; ++ else if (Name == "tls_ldcall") ++ Kind = MCSymbolRefExpr::VK_TLSLDM; ++ else { ++ Error(Parser.getTok().getLoc(), "unknown TLS tag"); ++ return MatchOperand_ParseFail; ++ } ++ Parser.Lex(); ++ ++ if (Parser.getTok().isNot(AsmToken::Colon)) { ++ Error(Parser.getTok().getLoc(), "unexpected token"); ++ return MatchOperand_ParseFail; ++ } ++ Parser.Lex(); ++ ++ if (Parser.getTok().isNot(AsmToken::Identifier)) { ++ Error(Parser.getTok().getLoc(), "unexpected token"); ++ return MatchOperand_ParseFail; ++ } ++ ++ StringRef Identifier = Parser.getTok().getString(); ++ Sym = MCSymbolRefExpr::Create(Ctx.GetOrCreateSymbol(Identifier), ++ Kind, Ctx); ++ Parser.Lex(); ++ } ++ + SMLoc EndLoc = + SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); +- Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc)); ++ ++ if (AllowTLS) ++ Operands.push_back(SystemZOperand::createImmTLS(Expr, Sym, ++ StartLoc, EndLoc)); ++ else ++ Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc)); ++ + return MatchOperand_Success; + } + +Index: llvm-36/lib/Target/SystemZ/CMakeLists.txt +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/CMakeLists.txt ++++ llvm-36/lib/Target/SystemZ/CMakeLists.txt +@@ -20,6 +20,7 @@ add_llvm_target(SystemZCodeGen + SystemZISelDAGToDAG.cpp + SystemZISelLowering.cpp + SystemZInstrInfo.cpp ++ SystemZLDCleanup.cpp + SystemZLongBranch.cpp + SystemZMachineFunctionInfo.cpp + SystemZMCInstLower.cpp +@@ -28,6 +29,7 @@ add_llvm_target(SystemZCodeGen + SystemZShortenInst.cpp + SystemZSubtarget.cpp + SystemZTargetMachine.cpp ++ SystemZTargetTransformInfo.cpp + ) + + add_subdirectory(AsmParser) +Index: llvm-36/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp ++++ llvm-36/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +@@ -47,8 +47,8 @@ extern "C" void LLVMInitializeSystemZDis + } + + static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo, +- const unsigned *Regs) { +- assert(RegNo < 16 && "Invalid register"); ++ const unsigned *Regs, unsigned Size) { ++ assert(RegNo < Size && "Invalid register"); + RegNo = Regs[RegNo]; + if (RegNo == 0) + return MCDisassembler::Fail; +@@ -59,61 +59,81 @@ static DecodeStatus decodeRegisterClass( + static DecodeStatus DecodeGR32BitRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { +- return decodeRegisterClass(Inst, RegNo, SystemZMC::GR32Regs); ++ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR32Regs, 16); + } + + static DecodeStatus DecodeGRH32BitRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { +- return decodeRegisterClass(Inst, RegNo, SystemZMC::GRH32Regs); ++ return decodeRegisterClass(Inst, RegNo, SystemZMC::GRH32Regs, 16); + } + + static DecodeStatus DecodeGR64BitRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { +- return decodeRegisterClass(Inst, 
RegNo, SystemZMC::GR64Regs); ++ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs, 16); + } + + static DecodeStatus DecodeGR128BitRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { +- return decodeRegisterClass(Inst, RegNo, SystemZMC::GR128Regs); ++ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR128Regs, 16); + } + + static DecodeStatus DecodeADDR64BitRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { +- return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs); ++ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs, 16); + } + + static DecodeStatus DecodeFP32BitRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { +- return decodeRegisterClass(Inst, RegNo, SystemZMC::FP32Regs); ++ return decodeRegisterClass(Inst, RegNo, SystemZMC::FP32Regs, 16); + } + + static DecodeStatus DecodeFP64BitRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { +- return decodeRegisterClass(Inst, RegNo, SystemZMC::FP64Regs); ++ return decodeRegisterClass(Inst, RegNo, SystemZMC::FP64Regs, 16); + } + + static DecodeStatus DecodeFP128BitRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { +- return decodeRegisterClass(Inst, RegNo, SystemZMC::FP128Regs); ++ return decodeRegisterClass(Inst, RegNo, SystemZMC::FP128Regs, 16); ++} ++ ++static DecodeStatus DecodeVR32BitRegisterClass(MCInst &Inst, uint64_t RegNo, ++ uint64_t Address, ++ const void *Decoder) { ++ return decodeRegisterClass(Inst, RegNo, SystemZMC::VR32Regs, 32); ++} ++ ++static DecodeStatus DecodeVR64BitRegisterClass(MCInst &Inst, uint64_t RegNo, ++ uint64_t Address, ++ const void *Decoder) { ++ return decodeRegisterClass(Inst, RegNo, SystemZMC::VR64Regs, 32); ++} ++ ++static DecodeStatus DecodeVR128BitRegisterClass(MCInst &Inst, uint64_t RegNo, ++ uint64_t Address, ++ const void *Decoder) { ++ return decodeRegisterClass(Inst, RegNo, SystemZMC::VR128Regs, 32); + } + + template + static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm) { +- assert(isUInt(Imm) && "Invalid immediate"); ++ if (!isUInt(Imm)) ++ return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(Imm)); + return MCDisassembler::Success; + } + + template + static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm) { +- assert(isUInt(Imm) && "Invalid immediate"); ++ if (!isUInt(Imm)) ++ return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(SignExtend64(Imm))); + return MCDisassembler::Success; + } +@@ -124,6 +144,21 @@ static DecodeStatus decodeAccessRegOpera + return decodeUImmOperand<4>(Inst, Imm); + } + ++static DecodeStatus decodeU1ImmOperand(MCInst &Inst, uint64_t Imm, ++ uint64_t Address, const void *Decoder) { ++ return decodeUImmOperand<1>(Inst, Imm); ++} ++ ++static DecodeStatus decodeU2ImmOperand(MCInst &Inst, uint64_t Imm, ++ uint64_t Address, const void *Decoder) { ++ return decodeUImmOperand<2>(Inst, Imm); ++} ++ ++static DecodeStatus decodeU3ImmOperand(MCInst &Inst, uint64_t Imm, ++ uint64_t Address, const void *Decoder) { ++ return decodeUImmOperand<3>(Inst, Imm); ++} ++ + static DecodeStatus decodeU4ImmOperand(MCInst &Inst, uint64_t Imm, + uint64_t Address, const void *Decoder) { + return decodeUImmOperand<4>(Inst, Imm); +@@ -139,6 +174,11 @@ static DecodeStatus decodeU8ImmOperand(M + return decodeUImmOperand<8>(Inst, Imm); + } + ++static DecodeStatus decodeU12ImmOperand(MCInst &Inst, uint64_t Imm, ++ uint64_t Address, const void 
*Decoder) { ++ return decodeUImmOperand<12>(Inst, Imm); ++} ++ + static DecodeStatus decodeU16ImmOperand(MCInst &Inst, uint64_t Imm, + uint64_t Address, const void *Decoder) { + return decodeUImmOperand<16>(Inst, Imm); +@@ -240,6 +280,18 @@ static DecodeStatus decodeBDLAddr12Len8O + return MCDisassembler::Success; + } + ++static DecodeStatus decodeBDVAddr12Operand(MCInst &Inst, uint64_t Field, ++ const unsigned *Regs) { ++ uint64_t Index = Field >> 16; ++ uint64_t Base = (Field >> 12) & 0xf; ++ uint64_t Disp = Field & 0xfff; ++ assert(Index < 32 && "Invalid BDVAddr12"); ++ Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base])); ++ Inst.addOperand(MCOperand::CreateImm(Disp)); ++ Inst.addOperand(MCOperand::CreateReg(SystemZMC::VR128Regs[Index])); ++ return MCDisassembler::Success; ++} ++ + static DecodeStatus decodeBDAddr32Disp12Operand(MCInst &Inst, uint64_t Field, + uint64_t Address, + const void *Decoder) { +@@ -283,6 +335,12 @@ static DecodeStatus decodeBDLAddr64Disp1 + return decodeBDLAddr12Len8Operand(Inst, Field, SystemZMC::GR64Regs); + } + ++static DecodeStatus decodeBDVAddr64Disp12Operand(MCInst &Inst, uint64_t Field, ++ uint64_t Address, ++ const void *Decoder) { ++ return decodeBDVAddr12Operand(Inst, Field, SystemZMC::GR64Regs); ++} ++ + #include "SystemZGenDisassemblerTables.inc" + + DecodeStatus SystemZDisassembler::getInstruction(MCInst &MI, uint64_t &Size, +Index: llvm-36/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp ++++ llvm-36/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp +@@ -10,6 +10,7 @@ + #include "SystemZInstPrinter.h" + #include "llvm/MC/MCExpr.h" + #include "llvm/MC/MCInstrInfo.h" ++#include "llvm/MC/MCSymbol.h" + #include "llvm/Support/raw_ostream.h" + + using namespace llvm; +@@ -21,13 +22,17 @@ using namespace llvm; + void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp, + unsigned Index, raw_ostream &O) { + O << Disp; +- if (Base) { ++ if (Base || Index) { + O << '('; +- if (Index) +- O << '%' << getRegisterName(Index) << ','; +- O << '%' << getRegisterName(Base) << ')'; +- } else +- assert(!Index && "Shouldn't have an index without a base"); ++ if (Index) { ++ O << '%' << getRegisterName(Index); ++ if (Base) ++ O << ','; ++ } ++ if (Base) ++ O << '%' << getRegisterName(Base); ++ O << ')'; ++ } + } + + void SystemZInstPrinter::printOperand(const MCOperand &MO, raw_ostream &O) { +@@ -51,60 +56,78 @@ void SystemZInstPrinter::printRegName(ra + O << '%' << getRegisterName(RegNo); + } + +-void SystemZInstPrinter::printU4ImmOperand(const MCInst *MI, int OpNum, +- raw_ostream &O) { ++template ++void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { + int64_t Value = MI->getOperand(OpNum).getImm(); +- assert(isUInt<4>(Value) && "Invalid u4imm argument"); ++ assert(isUInt(Value) && "Invalid uimm argument"); + O << Value; + } + +-void SystemZInstPrinter::printU6ImmOperand(const MCInst *MI, int OpNum, +- raw_ostream &O) { ++template ++void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { + int64_t Value = MI->getOperand(OpNum).getImm(); +- assert(isUInt<6>(Value) && "Invalid u6imm argument"); ++ assert(isInt(Value) && "Invalid simm argument"); + O << Value; + } + ++void SystemZInstPrinter::printU1ImmOperand(const MCInst *MI, int OpNum, ++ raw_ostream &O) { ++ printUImmOperand<1>(MI, OpNum, O); ++} ++ ++void SystemZInstPrinter::printU2ImmOperand(const MCInst *MI, int 
OpNum, ++ raw_ostream &O) { ++ printUImmOperand<2>(MI, OpNum, O); ++} ++ ++void SystemZInstPrinter::printU3ImmOperand(const MCInst *MI, int OpNum, ++ raw_ostream &O) { ++ printUImmOperand<3>(MI, OpNum, O); ++} ++ ++void SystemZInstPrinter::printU4ImmOperand(const MCInst *MI, int OpNum, ++ raw_ostream &O) { ++ printUImmOperand<4>(MI, OpNum, O); ++} ++ ++void SystemZInstPrinter::printU6ImmOperand(const MCInst *MI, int OpNum, ++ raw_ostream &O) { ++ printUImmOperand<6>(MI, OpNum, O); ++} ++ + void SystemZInstPrinter::printS8ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { +- int64_t Value = MI->getOperand(OpNum).getImm(); +- assert(isInt<8>(Value) && "Invalid s8imm argument"); +- O << Value; ++ printSImmOperand<8>(MI, OpNum, O); + } + + void SystemZInstPrinter::printU8ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { +- int64_t Value = MI->getOperand(OpNum).getImm(); +- assert(isUInt<8>(Value) && "Invalid u8imm argument"); +- O << Value; ++ printUImmOperand<8>(MI, OpNum, O); ++} ++ ++void SystemZInstPrinter::printU12ImmOperand(const MCInst *MI, int OpNum, ++ raw_ostream &O) { ++ printUImmOperand<12>(MI, OpNum, O); + } + + void SystemZInstPrinter::printS16ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { +- int64_t Value = MI->getOperand(OpNum).getImm(); +- assert(isInt<16>(Value) && "Invalid s16imm argument"); +- O << Value; ++ printSImmOperand<16>(MI, OpNum, O); + } + + void SystemZInstPrinter::printU16ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { +- int64_t Value = MI->getOperand(OpNum).getImm(); +- assert(isUInt<16>(Value) && "Invalid u16imm argument"); +- O << Value; ++ printUImmOperand<16>(MI, OpNum, O); + } + + void SystemZInstPrinter::printS32ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { +- int64_t Value = MI->getOperand(OpNum).getImm(); +- assert(isInt<32>(Value) && "Invalid s32imm argument"); +- O << Value; ++ printSImmOperand<32>(MI, OpNum, O); + } + + void SystemZInstPrinter::printU32ImmOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { +- int64_t Value = MI->getOperand(OpNum).getImm(); +- assert(isUInt<32>(Value) && "Invalid u32imm argument"); +- O << Value; ++ printUImmOperand<32>(MI, OpNum, O); + } + + void SystemZInstPrinter::printAccessRegOperand(const MCInst *MI, int OpNum, +@@ -124,6 +147,29 @@ void SystemZInstPrinter::printPCRelOpera + O << *MO.getExpr(); + } + ++void SystemZInstPrinter::printPCRelTLSOperand(const MCInst *MI, int OpNum, ++ raw_ostream &O) { ++ // Output the PC-relative operand. ++ printPCRelOperand(MI, OpNum, O); ++ ++ // Output the TLS marker if present. 
++ if ((unsigned)OpNum + 1 < MI->getNumOperands()) { ++ const MCOperand &MO = MI->getOperand(OpNum + 1); ++ const MCSymbolRefExpr &refExp = cast(*MO.getExpr()); ++ switch (refExp.getKind()) { ++ case MCSymbolRefExpr::VK_TLSGD: ++ O << ":tls_gdcall:"; ++ break; ++ case MCSymbolRefExpr::VK_TLSLDM: ++ O << ":tls_ldcall:"; ++ break; ++ default: ++ llvm_unreachable("Unexpected symbol kind"); ++ } ++ O << refExp.getSymbol().getName(); ++ } ++} ++ + void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum, + raw_ostream &O) { + printOperand(MI->getOperand(OpNum), O); +@@ -153,6 +199,13 @@ void SystemZInstPrinter::printBDLAddrOpe + O << ')'; + } + ++void SystemZInstPrinter::printBDVAddrOperand(const MCInst *MI, int OpNum, ++ raw_ostream &O) { ++ printAddress(MI->getOperand(OpNum).getReg(), ++ MI->getOperand(OpNum + 1).getImm(), ++ MI->getOperand(OpNum + 2).getReg(), O); ++} ++ + void SystemZInstPrinter::printCond4Operand(const MCInst *MI, int OpNum, + raw_ostream &O) { + static const char *const CondNames[] = { +Index: llvm-36/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h ++++ llvm-36/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h +@@ -47,15 +47,21 @@ private: + void printBDAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printBDXAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); ++ void printBDVAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O); ++ void printU1ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); ++ void printU2ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); ++ void printU3ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU4ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU6ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printS8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); ++ void printU12ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printS16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printS32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O); ++ void printPCRelTLSOperand(const MCInst *MI, int OpNum, raw_ostream &O); + void printAccessRegOperand(const MCInst *MI, int OpNum, raw_ostream &O); + + // Print the mnemonic for a condition-code mask ("ne", "lh", etc.) 
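For reference, printPCRelTLSOperand above implements the s390 assembler convention of tagging a TLS call with the model it was generated for: the marker follows the call target, as in "brasl %r14, __tls_get_offset@PLT:tls_gdcall:x". A minimal stand-alone sketch of that marker selection (illustrative only, not part of the patch; TLSModel and printTLSCall are made-up names):

  #include <iostream>
  #include <string>

  enum class TLSModel { GeneralDynamic, LocalDynamic };

  // Mirrors printPCRelTLSOperand: print the PC-relative target first,
  // then the marker naming the TLS model, then the symbol it applies to.
  void printTLSCall(std::ostream &OS, const std::string &Target,
                    TLSModel M, const std::string &Sym) {
    OS << "brasl %r14, " << Target
       << (M == TLSModel::GeneralDynamic ? ":tls_gdcall:" : ":tls_ldcall:")
       << Sym << '\n';
  }

  int main() {
    // Prints: brasl %r14, __tls_get_offset@PLT:tls_gdcall:x
    printTLSCall(std::cout, "__tls_get_offset@PLT",
                 TLSModel::GeneralDynamic, "x");
  }

The marker is what lets the linker later relax a general-dynamic or local-dynamic TLS sequence; the FK_390_TLS_CALL fixup introduced further below carries the same information at the object-file level.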
+Index: llvm-36/lib/Target/SystemZ/LLVMBuild.txt +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/LLVMBuild.txt ++++ llvm-36/lib/Target/SystemZ/LLVMBuild.txt +@@ -31,5 +31,5 @@ has_jit = 1 + type = Library + name = SystemZCodeGen + parent = SystemZ +-required_libraries = AsmPrinter CodeGen Core MC SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target ++required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target + add_to_library_groups = SystemZ +Index: llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp ++++ llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +@@ -27,9 +27,10 @@ static uint64_t extractBitsForFixup(MCFi + switch (unsigned(Kind)) { + case SystemZ::FK_390_PC16DBL: + case SystemZ::FK_390_PC32DBL: +- case SystemZ::FK_390_PLT16DBL: +- case SystemZ::FK_390_PLT32DBL: + return (int64_t)Value / 2; ++ ++ case SystemZ::FK_390_TLS_CALL: ++ return 0; + } + + llvm_unreachable("Unknown fixup kind!"); +@@ -72,8 +73,7 @@ SystemZMCAsmBackend::getFixupKindInfo(MC + const static MCFixupKindInfo Infos[SystemZ::NumTargetFixupKinds] = { + { "FK_390_PC16DBL", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, + { "FK_390_PC32DBL", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +- { "FK_390_PLT16DBL", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, +- { "FK_390_PLT32DBL", 0, 32, MCFixupKindInfo::FKF_IsPCRel } ++ { "FK_390_TLS_CALL", 0, 0, 0 } + }; + + if (Kind < FirstTargetFixupKind) +Index: llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp ++++ llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +@@ -70,24 +70,43 @@ private: + uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; ++ uint64_t getBDVAddr12Encoding(const MCInst &MI, unsigned OpNum, ++ SmallVectorImpl &Fixups, ++ const MCSubtargetInfo &STI) const; + + // Operand OpNum of MI needs a PC-relative fixup of kind Kind at + // Offset bytes from the start of MI. Add the fixup to Fixups + // and return the in-place addend, which since we're a RELA target +- // is always 0. ++ // is always 0. If AllowTLS is true and optional operand OpNum + 1 ++ // is present, also emit a TLS call fixup for it. 
+ uint64_t getPCRelEncoding(const MCInst &MI, unsigned OpNum, + SmallVectorImpl &Fixups, +- unsigned Kind, int64_t Offset) const; ++ unsigned Kind, int64_t Offset, ++ bool AllowTLS) const; + + uint64_t getPC16DBLEncoding(const MCInst &MI, unsigned OpNum, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { +- return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC16DBL, 2); ++ return getPCRelEncoding(MI, OpNum, Fixups, ++ SystemZ::FK_390_PC16DBL, 2, false); + } + uint64_t getPC32DBLEncoding(const MCInst &MI, unsigned OpNum, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { +- return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC32DBL, 2); ++ return getPCRelEncoding(MI, OpNum, Fixups, ++ SystemZ::FK_390_PC32DBL, 2, false); ++ } ++ uint64_t getPC16DBLTLSEncoding(const MCInst &MI, unsigned OpNum, ++ SmallVectorImpl &Fixups, ++ const MCSubtargetInfo &STI) const { ++ return getPCRelEncoding(MI, OpNum, Fixups, ++ SystemZ::FK_390_PC16DBL, 2, true); ++ } ++ uint64_t getPC32DBLTLSEncoding(const MCInst &MI, unsigned OpNum, ++ SmallVectorImpl &Fixups, ++ const MCSubtargetInfo &STI) const { ++ return getPCRelEncoding(MI, OpNum, Fixups, ++ SystemZ::FK_390_PC32DBL, 2, true); + } + }; + } // end anonymous namespace +@@ -178,10 +197,22 @@ getBDLAddr12Len8Encoding(const MCInst &M + return (Len << 16) | (Base << 12) | Disp; + } + ++uint64_t SystemZMCCodeEmitter:: ++getBDVAddr12Encoding(const MCInst &MI, unsigned OpNum, ++ SmallVectorImpl &Fixups, ++ const MCSubtargetInfo &STI) const { ++ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI); ++ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI); ++ uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI); ++ assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<5>(Index)); ++ return (Index << 16) | (Base << 12) | Disp; ++} ++ + uint64_t + SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum, + SmallVectorImpl &Fixups, +- unsigned Kind, int64_t Offset) const { ++ unsigned Kind, int64_t Offset, ++ bool AllowTLS) const { + const MCOperand &MO = MI.getOperand(OpNum); + const MCExpr *Expr; + if (MO.isImm()) +@@ -198,6 +229,13 @@ SystemZMCCodeEmitter::getPCRelEncoding(c + } + } + Fixups.push_back(MCFixup::Create(Offset, Expr, (MCFixupKind)Kind)); ++ ++ // Output the fixup for the TLS marker if present. ++ if (AllowTLS && OpNum + 1 < MI.getNumOperands()) { ++ const MCOperand &MOTLS = MI.getOperand(OpNum + 1); ++ Fixups.push_back(MCFixup::Create(0, MOTLS.getExpr(), ++ (MCFixupKind)SystemZ::FK_390_TLS_CALL)); ++ } + return 0; + } + +Index: llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h ++++ llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h +@@ -18,8 +18,7 @@ enum FixupKind { + // These correspond directly to R_390_* relocations. 
+ FK_390_PC16DBL = FirstTargetFixupKind, + FK_390_PC32DBL, +- FK_390_PLT16DBL, +- FK_390_PLT32DBL, ++ FK_390_TLS_CALL, + + // Marker + LastTargetFixupKind, +Index: llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp ++++ llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp +@@ -55,8 +55,6 @@ static unsigned getPCRelReloc(unsigned K + case FK_Data_8: return ELF::R_390_PC64; + case SystemZ::FK_390_PC16DBL: return ELF::R_390_PC16DBL; + case SystemZ::FK_390_PC32DBL: return ELF::R_390_PC32DBL; +- case SystemZ::FK_390_PLT16DBL: return ELF::R_390_PLT16DBL; +- case SystemZ::FK_390_PLT32DBL: return ELF::R_390_PLT32DBL; + } + llvm_unreachable("Unsupported PC-relative address"); + } +@@ -70,6 +68,35 @@ static unsigned getTLSLEReloc(unsigned K + llvm_unreachable("Unsupported absolute address"); + } + ++// Return the R_390_TLS_LDO* relocation type for MCFixupKind Kind. ++static unsigned getTLSLDOReloc(unsigned Kind) { ++ switch (Kind) { ++ case FK_Data_4: return ELF::R_390_TLS_LDO32; ++ case FK_Data_8: return ELF::R_390_TLS_LDO64; ++ } ++ llvm_unreachable("Unsupported absolute address"); ++} ++ ++// Return the R_390_TLS_LDM* relocation type for MCFixupKind Kind. ++static unsigned getTLSLDMReloc(unsigned Kind) { ++ switch (Kind) { ++ case FK_Data_4: return ELF::R_390_TLS_LDM32; ++ case FK_Data_8: return ELF::R_390_TLS_LDM64; ++ case SystemZ::FK_390_TLS_CALL: return ELF::R_390_TLS_LDCALL; ++ } ++ llvm_unreachable("Unsupported absolute address"); ++} ++ ++// Return the R_390_TLS_GD* relocation type for MCFixupKind Kind. ++static unsigned getTLSGDReloc(unsigned Kind) { ++ switch (Kind) { ++ case FK_Data_4: return ELF::R_390_TLS_GD32; ++ case FK_Data_8: return ELF::R_390_TLS_GD64; ++ case SystemZ::FK_390_TLS_CALL: return ELF::R_390_TLS_GDCALL; ++ } ++ llvm_unreachable("Unsupported absolute address"); ++} ++ + // Return the PLT relocation counterpart of MCFixupKind Kind. 
+ static unsigned getPLTReloc(unsigned Kind) { + switch (Kind) { +@@ -94,6 +121,23 @@ unsigned SystemZObjectWriter::GetRelocTy + assert(!IsPCRel && "NTPOFF shouldn't be PC-relative"); + return getTLSLEReloc(Kind); + ++ case MCSymbolRefExpr::VK_INDNTPOFF: ++ if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL) ++ return ELF::R_390_TLS_IEENT; ++ llvm_unreachable("Only PC-relative INDNTPOFF accesses are supported for now"); ++ ++ case MCSymbolRefExpr::VK_DTPOFF: ++ assert(!IsPCRel && "DTPOFF shouldn't be PC-relative"); ++ return getTLSLDOReloc(Kind); ++ ++ case MCSymbolRefExpr::VK_TLSLDM: ++ assert(!IsPCRel && "TLSLDM shouldn't be PC-relative"); ++ return getTLSLDMReloc(Kind); ++ ++ case MCSymbolRefExpr::VK_TLSGD: ++ assert(!IsPCRel && "TLSGD shouldn't be PC-relative"); ++ return getTLSGDReloc(Kind); ++ + case MCSymbolRefExpr::VK_GOT: + if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL) + return ELF::R_390_GOTENT; +Index: llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp ++++ llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +@@ -76,6 +76,39 @@ const unsigned SystemZMC::FP128Regs[16] + SystemZ::F12Q, SystemZ::F13Q, 0, 0 + }; + ++const unsigned SystemZMC::VR32Regs[32] = { ++ SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S, ++ SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S, ++ SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S, ++ SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S, ++ SystemZ::F16S, SystemZ::F17S, SystemZ::F18S, SystemZ::F19S, ++ SystemZ::F20S, SystemZ::F21S, SystemZ::F22S, SystemZ::F23S, ++ SystemZ::F24S, SystemZ::F25S, SystemZ::F26S, SystemZ::F27S, ++ SystemZ::F28S, SystemZ::F29S, SystemZ::F30S, SystemZ::F31S ++}; ++ ++const unsigned SystemZMC::VR64Regs[32] = { ++ SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D, ++ SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D, ++ SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D, ++ SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D, ++ SystemZ::F16D, SystemZ::F17D, SystemZ::F18D, SystemZ::F19D, ++ SystemZ::F20D, SystemZ::F21D, SystemZ::F22D, SystemZ::F23D, ++ SystemZ::F24D, SystemZ::F25D, SystemZ::F26D, SystemZ::F27D, ++ SystemZ::F28D, SystemZ::F29D, SystemZ::F30D, SystemZ::F31D ++}; ++ ++const unsigned SystemZMC::VR128Regs[32] = { ++ SystemZ::V0, SystemZ::V1, SystemZ::V2, SystemZ::V3, ++ SystemZ::V4, SystemZ::V5, SystemZ::V6, SystemZ::V7, ++ SystemZ::V8, SystemZ::V9, SystemZ::V10, SystemZ::V11, ++ SystemZ::V12, SystemZ::V13, SystemZ::V14, SystemZ::V15, ++ SystemZ::V16, SystemZ::V17, SystemZ::V18, SystemZ::V19, ++ SystemZ::V20, SystemZ::V21, SystemZ::V22, SystemZ::V23, ++ SystemZ::V24, SystemZ::V25, SystemZ::V26, SystemZ::V27, ++ SystemZ::V28, SystemZ::V29, SystemZ::V30, SystemZ::V31 ++}; ++ + unsigned SystemZMC::getFirstReg(unsigned Reg) { + static unsigned Map[SystemZ::NUM_TARGET_REGS]; + static bool Initialized = false; +@@ -85,10 +118,13 @@ unsigned SystemZMC::getFirstReg(unsigned + Map[GRH32Regs[I]] = I; + Map[GR64Regs[I]] = I; + Map[GR128Regs[I]] = I; +- Map[FP32Regs[I]] = I; +- Map[FP64Regs[I]] = I; + Map[FP128Regs[I]] = I; + } ++ for (unsigned I = 0; I < 32; ++I) { ++ Map[VR32Regs[I]] = I; ++ Map[VR64Regs[I]] = I; ++ Map[VR128Regs[I]] = I; ++ } + } + assert(Reg < SystemZ::NUM_TARGET_REGS); + return Map[Reg]; +Index: llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h 
+=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h ++++ llvm-36/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h +@@ -48,6 +48,9 @@ extern const unsigned GR128Regs[16]; + extern const unsigned FP32Regs[16]; + extern const unsigned FP64Regs[16]; + extern const unsigned FP128Regs[16]; ++extern const unsigned VR32Regs[32]; ++extern const unsigned VR64Regs[32]; ++extern const unsigned VR128Regs[32]; + + // Return the 0-based number of the first architectural register that + // contains the given LLVM register. E.g. R1D -> 1. +@@ -67,6 +70,11 @@ inline unsigned getRegAsGR32(unsigned Re + inline unsigned getRegAsGRH32(unsigned Reg) { + return GRH32Regs[getFirstReg(Reg)]; + } ++ ++// Return the given register as a VR128. ++inline unsigned getRegAsVR128(unsigned Reg) { ++ return VR128Regs[getFirstReg(Reg)]; ++} + } // end namespace SystemZMC + + MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII, +Index: llvm-36/lib/Target/SystemZ/SystemZ.h +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZ.h ++++ llvm-36/lib/Target/SystemZ/SystemZ.h +@@ -21,6 +21,7 @@ + namespace llvm { + class SystemZTargetMachine; + class FunctionPass; ++class ImmutablePass; + + namespace SystemZ { + // Condition-code mask values. +@@ -68,6 +69,25 @@ const unsigned CCMASK_TM_MSB_0 = C + const unsigned CCMASK_TM_MSB_1 = CCMASK_2 | CCMASK_3; + const unsigned CCMASK_TM = CCMASK_ANY; + ++// Condition-code mask assignments for TRANSACTION_BEGIN. ++const unsigned CCMASK_TBEGIN_STARTED = CCMASK_0; ++const unsigned CCMASK_TBEGIN_INDETERMINATE = CCMASK_1; ++const unsigned CCMASK_TBEGIN_TRANSIENT = CCMASK_2; ++const unsigned CCMASK_TBEGIN_PERSISTENT = CCMASK_3; ++const unsigned CCMASK_TBEGIN = CCMASK_ANY; ++ ++// Condition-code mask assignments for TRANSACTION_END. ++const unsigned CCMASK_TEND_TX = CCMASK_0; ++const unsigned CCMASK_TEND_NOTX = CCMASK_2; ++const unsigned CCMASK_TEND = CCMASK_TEND_TX | CCMASK_TEND_NOTX; ++ ++// Condition-code mask assignments for vector comparisons (and similar ++// operations). ++const unsigned CCMASK_VCMP_ALL = CCMASK_0; ++const unsigned CCMASK_VCMP_MIXED = CCMASK_1; ++const unsigned CCMASK_VCMP_NONE = CCMASK_3; ++const unsigned CCMASK_VCMP = CCMASK_0 | CCMASK_1 | CCMASK_3; ++ + // The position of the low CC bit in an IPM result. + const unsigned IPM_CC = 28; + +@@ -75,6 +95,13 @@ const unsigned IPM_CC = 28; + const unsigned PFD_READ = 1; + const unsigned PFD_WRITE = 2; + ++// Number of bits in a vector register. ++const unsigned VectorBits = 128; ++ ++// Number of bytes in a vector register (and consequently the number of ++// bytes in a general permute vector). ++const unsigned VectorBytes = VectorBits / 8; ++ + // Return true if Val fits an LLILL operand. 
+ static inline bool isImmLL(uint64_t Val) { + return (Val & ~0x000000000000ffffULL) == 0; +@@ -111,6 +138,9 @@ FunctionPass *createSystemZISelDag(Syste + FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM); + FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM); + FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM); ++FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM); ++ImmutablePass *createSystemZTargetTransformInfoPass( ++ const SystemZTargetMachine *TM); + } // end namespace llvm + + #endif +Index: llvm-36/lib/Target/SystemZ/SystemZ.td +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZ.td ++++ llvm-36/lib/Target/SystemZ/SystemZ.td +@@ -40,6 +40,7 @@ include "SystemZOperands.td" + include "SystemZPatterns.td" + include "SystemZInstrFormats.td" + include "SystemZInstrInfo.td" ++include "SystemZInstrVector.td" + include "SystemZInstrFP.td" + + def SystemZInstrInfo : InstrInfo {} +Index: llvm-36/lib/Target/SystemZ/SystemZAsmPrinter.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZAsmPrinter.cpp ++++ llvm-36/lib/Target/SystemZ/SystemZAsmPrinter.cpp +@@ -66,6 +66,41 @@ static MCInst lowerRIEfLow(const Machine + .addImm(MI->getOperand(5).getImm()); + } + ++static const MCSymbolRefExpr *getTLSGetOffset(MCContext &Context) { ++ StringRef Name = "__tls_get_offset"; ++ return MCSymbolRefExpr::Create(Context.GetOrCreateSymbol(Name), ++ MCSymbolRefExpr::VK_PLT, ++ Context); ++} ++ ++static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) { ++ StringRef Name = "_GLOBAL_OFFSET_TABLE_"; ++ return MCSymbolRefExpr::Create(Context.GetOrCreateSymbol(Name), ++ MCSymbolRefExpr::VK_None, ++ Context); ++} ++ ++// MI loads the high part of a vector from memory. Return an instruction ++// that uses replicating vector load Opcode to do the same thing. ++static MCInst lowerSubvectorLoad(const MachineInstr *MI, unsigned Opcode) { ++ return MCInstBuilder(Opcode) ++ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) ++ .addReg(MI->getOperand(1).getReg()) ++ .addImm(MI->getOperand(2).getImm()) ++ .addReg(MI->getOperand(3).getReg()); ++} ++ ++// MI stores the high part of a vector to memory. Return an instruction ++// that uses elemental vector store Opcode to do the same thing. 
++static MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) { ++ return MCInstBuilder(Opcode) ++ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) ++ .addReg(MI->getOperand(1).getReg()) ++ .addImm(MI->getOperand(2).getImm()) ++ .addReg(MI->getOperand(3).getReg()) ++ .addImm(0); ++} ++ + void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) { + SystemZMCInstLower Lower(MF->getContext(), *this); + MCInst LoweredMI; +@@ -95,6 +130,26 @@ void SystemZAsmPrinter::EmitInstruction( + LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R1D); + break; + ++ case SystemZ::TLS_GDCALL: ++ LoweredMI = MCInstBuilder(SystemZ::BRASL) ++ .addReg(SystemZ::R14D) ++ .addExpr(getTLSGetOffset(MF->getContext())) ++ .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_TLSGD)); ++ break; ++ ++ case SystemZ::TLS_LDCALL: ++ LoweredMI = MCInstBuilder(SystemZ::BRASL) ++ .addReg(SystemZ::R14D) ++ .addExpr(getTLSGetOffset(MF->getContext())) ++ .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_TLSLDM)); ++ break; ++ ++ case SystemZ::GOT: ++ LoweredMI = MCInstBuilder(SystemZ::LARL) ++ .addReg(MI->getOperand(0).getReg()) ++ .addExpr(getGlobalOffsetTable(MF->getContext())); ++ break; ++ + case SystemZ::IILF64: + LoweredMI = MCInstBuilder(SystemZ::IILF) + .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg())) +@@ -117,6 +172,51 @@ void SystemZAsmPrinter::EmitInstruction( + LoweredMI = lowerRIEfLow(MI, SystemZ::RISBLG); + break; + ++ case SystemZ::VLVGP32: ++ LoweredMI = MCInstBuilder(SystemZ::VLVGP) ++ .addReg(MI->getOperand(0).getReg()) ++ .addReg(SystemZMC::getRegAsGR64(MI->getOperand(1).getReg())) ++ .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg())); ++ break; ++ ++ case SystemZ::VLR32: ++ case SystemZ::VLR64: ++ LoweredMI = MCInstBuilder(SystemZ::VLR) ++ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) ++ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg())); ++ break; ++ ++ case SystemZ::VL32: ++ LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPF); ++ break; ++ ++ case SystemZ::VL64: ++ LoweredMI = lowerSubvectorLoad(MI, SystemZ::VLREPG); ++ break; ++ ++ case SystemZ::VST32: ++ LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEF); ++ break; ++ ++ case SystemZ::VST64: ++ LoweredMI = lowerSubvectorStore(MI, SystemZ::VSTEG); ++ break; ++ ++ case SystemZ::LFER: ++ LoweredMI = MCInstBuilder(SystemZ::VLGVF) ++ .addReg(SystemZMC::getRegAsGR64(MI->getOperand(0).getReg())) ++ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(1).getReg())) ++ .addReg(0).addImm(0); ++ break; ++ ++ case SystemZ::LEFR: ++ LoweredMI = MCInstBuilder(SystemZ::VLVGF) ++ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) ++ .addReg(SystemZMC::getRegAsVR128(MI->getOperand(0).getReg())) ++ .addReg(MI->getOperand(1).getReg()) ++ .addReg(0).addImm(0); ++ break; ++ + #define LOWER_LOW(NAME) \ + case SystemZ::NAME##64: LoweredMI = lowerRILow(MI, SystemZ::NAME); break + +@@ -172,6 +272,9 @@ void SystemZAsmPrinter::EmitInstruction( + static MCSymbolRefExpr::VariantKind + getModifierVariantKind(SystemZCP::SystemZCPModifier Modifier) { + switch (Modifier) { ++ case SystemZCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD; ++ case SystemZCP::TLSLDM: return MCSymbolRefExpr::VK_TLSLDM; ++ case SystemZCP::DTPOFF: return MCSymbolRefExpr::VK_DTPOFF; + case SystemZCP::NTPOFF: return MCSymbolRefExpr::VK_NTPOFF; + } + llvm_unreachable("Invalid SystemCPModifier!"); +Index: llvm-36/lib/Target/SystemZ/SystemZCallingConv.h 
+===================================================================
+--- llvm-36.orig/lib/Target/SystemZ/SystemZCallingConv.h
++++ llvm-36/lib/Target/SystemZ/SystemZCallingConv.h
+@@ -10,6 +10,9 @@
+ #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
+ #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
+ 
++#include "llvm/ADT/SmallVector.h"
++#include "llvm/CodeGen/CallingConvLower.h"
++
+ namespace llvm {
+ namespace SystemZ {
+   const unsigned NumArgGPRs = 5;
+@@ -18,6 +21,64 @@ namespace SystemZ {
+   const unsigned NumArgFPRs = 4;
+   extern const unsigned ArgFPRs[NumArgFPRs];
+ } // end namespace SystemZ
++
++class SystemZCCState : public CCState {
++private:
++  /// Records whether the value was a fixed argument.
++  /// See ISD::OutputArg::IsFixed.
++  SmallVector<bool, 4> ArgIsFixed;
++
++  /// Records whether the value was widened from a short vector type.
++  SmallVector<bool, 4> ArgIsShortVector;
++
++  // Check whether ArgVT is a short vector type.
++  bool IsShortVectorType(EVT ArgVT) {
++    return ArgVT.isVector() && ArgVT.getStoreSize() <= 8;
++  }
++
++public:
++  SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
++                 SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
++      : CCState(CC, isVarArg, MF, locs, C) {}
++
++  void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
++                              CCAssignFn Fn) {
++    // Formal arguments are always fixed.
++    ArgIsFixed.clear();
++    for (unsigned i = 0; i < Ins.size(); ++i)
++      ArgIsFixed.push_back(true);
++    // Record whether the call operand was a short vector.
++    ArgIsShortVector.clear();
++    for (unsigned i = 0; i < Ins.size(); ++i)
++      ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT));
++
++    CCState::AnalyzeFormalArguments(Ins, Fn);
++  }
++
++  void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
++                           CCAssignFn Fn) {
++    // Record whether the call operand was a fixed argument.
++    ArgIsFixed.clear();
++    for (unsigned i = 0; i < Outs.size(); ++i)
++      ArgIsFixed.push_back(Outs[i].IsFixed);
++    // Record whether the call operand was a short vector.
++    ArgIsShortVector.clear();
++    for (unsigned i = 0; i < Outs.size(); ++i)
++      ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT));
++
++    CCState::AnalyzeCallOperands(Outs, Fn);
++  }
++
++  // This version of AnalyzeCallOperands in the base class is not usable
++  // since we must provide a means of accessing ISD::OutputArg::IsFixed.
++  void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
++                           SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
++                           CCAssignFn Fn) = delete;
++
++  bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; }
++  bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; }
++};
++
+ } // end namespace llvm
+ 
+ #endif
+Index: llvm-36/lib/Target/SystemZ/SystemZCallingConv.td
+===================================================================
+--- llvm-36.orig/lib/Target/SystemZ/SystemZCallingConv.td
++++ llvm-36/lib/Target/SystemZ/SystemZCallingConv.td
+@@ -12,6 +12,20 @@
+ class CCIfExtend<CCAction A>
+   : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
+ 
++class CCIfSubtarget<string F, CCAction A>
++  : CCIf<!strconcat("static_cast<const SystemZSubtarget&>"
++                    "(State.getMachineFunction().getSubtarget()).", F),
++         A>;
++
++// Match if this specific argument is a fixed (i.e. named) argument.
++class CCIfFixed<CCAction A>
++  : CCIf<"static_cast<SystemZCCState *>(&State)->IsFixed(ValNo)", A>;
++
++// Match if this specific argument was widened from a short vector type.
++class CCIfShortVector<CCAction A>
++  : CCIf<"static_cast<SystemZCCState *>(&State)->IsShortVector(ValNo)", A>;
++
++
+ //===----------------------------------------------------------------------===//
+ // z/Linux return value calling convention
+ //===----------------------------------------------------------------------===//
+@@ -31,7 +45,14 @@ def RetCC_SystemZ : CallingConv<[
+   // doesn't care about the ABI. All floating-point argument registers
+   // are call-clobbered, so we can use all of them here.
+   CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+-  CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>
++  CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
++
++  // Similarly for vectors, with V24 being the ABI-compliant choice.
++  // Sub-128 vectors are returned in the same way, but they're widened
++  // to one of these types during type legalization.
++  CCIfSubtarget<"hasVector()",
++    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
++             CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
+ 
+   // ABI-compliant code returns long double by reference, but that conversion
+   // is left to higher-level code. Perhaps we could add an f128 definition
+@@ -60,6 +81,25 @@ def CC_SystemZ : CallingConv<[
+   CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+   CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>,
+ 
++  // The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
++  // are passed in the same way, but they're widened to one of these types
++  // during type legalization.
++  CCIfSubtarget<"hasVector()",
++    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
++             CCIfFixed<CCAssignToReg<[V24, V26, V28, V30,
++                                      V25, V27, V29, V31]>>>>,
++
++  // However, sub-128 vectors which need to go on the stack occupy just a
++  // single 8-byte-aligned 8-byte stack slot. Pass as i64.
++  CCIfSubtarget<"hasVector()",
++    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
++             CCIfShortVector<CCBitConvertToType<i64>>>>,
++
++  // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
++  CCIfSubtarget<"hasVector()",
++    CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
++             CCAssignToStack<16, 8>>>,
++
+   // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
+   CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+ ]>;
+Index: llvm-36/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+===================================================================
+--- llvm-36.orig/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
++++ llvm-36/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+@@ -28,6 +28,11 @@ SystemZConstantPoolValue::Create(const G
+ 
+ unsigned SystemZConstantPoolValue::getRelocationInfo() const {
+   switch (Modifier) {
++  case SystemZCP::TLSGD:
++  case SystemZCP::TLSLDM:
++  case SystemZCP::DTPOFF:
++    // May require a dynamic relocation.
++    return 2;
+   case SystemZCP::NTPOFF:
+     // May require a relocation, but the relocations are always resolved
+     // by the static linker.
+Index: llvm-36/lib/Target/SystemZ/SystemZConstantPoolValue.h
+===================================================================
+--- llvm-36.orig/lib/Target/SystemZ/SystemZConstantPoolValue.h
++++ llvm-36/lib/Target/SystemZ/SystemZConstantPoolValue.h
+@@ -19,13 +19,17 @@ class GlobalValue;
+ 
+ namespace SystemZCP {
+ enum SystemZCPModifier {
++  TLSGD,
++  TLSLDM,
++  DTPOFF,
+   NTPOFF
+ };
+ } // end namespace SystemZCP
+ 
+ /// A SystemZ-specific constant pool value. At present, the only
+-/// defined constant pool values are offsets of thread-local variables
+-/// (written x@NTPOFF).
++/// defined constant pool values are module IDs or offsets of ++/// thread-local variables (written x@TLSGD, x@TLSLDM, x@DTPOFF, ++/// or x@NTPOFF). + class SystemZConstantPoolValue : public MachineConstantPoolValue { + const GlobalValue *GV; + SystemZCP::SystemZCPModifier Modifier; +Index: llvm-36/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp ++++ llvm-36/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +@@ -256,6 +256,13 @@ class SystemZDAGToDAGISel : public Selec + Addr, Base, Disp, Index); + } + ++ // Try to match Addr as an address with a base, 12-bit displacement ++ // and index, where the index is element Elem of a vector. ++ // Return true on success, storing the base, displacement and vector ++ // in Base, Disp and Index respectively. ++ bool selectBDVAddr12Only(SDValue Addr, SDValue Elem, SDValue &Base, ++ SDValue &Disp, SDValue &Index) const; ++ + // Check whether (or Op (and X InsertMask)) is effectively an insertion + // of X into bits InsertMask of some Y != Op. Return true if so and + // set Op to that Y. +@@ -293,6 +300,12 @@ class SystemZDAGToDAGISel : public Selec + SDNode *splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0, + uint64_t UpperVal, uint64_t LowerVal); + ++ // Try to use gather instruction Opcode to implement vector insertion N. ++ SDNode *tryGather(SDNode *N, unsigned Opcode); ++ ++ // Try to use scatter instruction Opcode to implement store Store. ++ SDNode *tryScatter(StoreSDNode *Store, unsigned Opcode); ++ + // Return true if Load and Store are loads and stores of the same size + // and are guaranteed not to overlap. Such operations can be implemented + // using block (SS-format) instructions. +@@ -643,6 +656,30 @@ bool SystemZDAGToDAGISel::selectBDXAddr( + return true; + } + ++bool SystemZDAGToDAGISel::selectBDVAddr12Only(SDValue Addr, SDValue Elem, ++ SDValue &Base, ++ SDValue &Disp, ++ SDValue &Index) const { ++ SDValue Regs[2]; ++ if (selectBDXAddr12Only(Addr, Regs[0], Disp, Regs[1]) && ++ Regs[0].getNode() && Regs[1].getNode()) { ++ for (unsigned int I = 0; I < 2; ++I) { ++ Base = Regs[I]; ++ Index = Regs[1 - I]; ++ // We can't tell here whether the index vector has the right type ++ // for the access; the caller needs to do that instead. ++ if (Index.getOpcode() == ISD::ZERO_EXTEND) ++ Index = Index.getOperand(0); ++ if (Index.getOpcode() == ISD::EXTRACT_VECTOR_ELT && ++ Index.getOperand(1) == Elem) { ++ Index = Index.getOperand(0); ++ return true; ++ } ++ } ++ } ++ return false; ++} ++ + bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op, + uint64_t InsertMask) const { + // We're only interested in cases where the insertion is into some operand +@@ -896,6 +933,9 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZer + } + + unsigned Opcode = SystemZ::RISBG; ++ // Prefer RISBGN if available, since it does not clobber CC. ++ if (Subtarget.hasMiscellaneousExtensions()) ++ Opcode = SystemZ::RISBGN; + EVT OpcodeVT = MVT::i64; + if (VT == MVT::i32 && Subtarget.hasHighWord()) { + Opcode = SystemZ::RISBMux; +@@ -943,9 +983,13 @@ SDNode *SystemZDAGToDAGISel::tryRxSBG(SD + + // See whether we can avoid an AND in the first operand by converting + // ROSBG to RISBG. +- if (Opcode == SystemZ::ROSBG && detectOrAndInsertion(Op0, RxSBG[I].Mask)) ++ if (Opcode == SystemZ::ROSBG && detectOrAndInsertion(Op0, RxSBG[I].Mask)) { + Opcode = SystemZ::RISBG; +- ++ // Prefer RISBGN if available, since it does not clobber CC. 
++ if (Subtarget.hasMiscellaneousExtensions()) ++ Opcode = SystemZ::RISBGN; ++ } ++ + EVT VT = N->getValueType(0); + SDValue Ops[5] = { + convertTo(SDLoc(N), MVT::i64, Op0), +@@ -973,6 +1017,71 @@ SDNode *SystemZDAGToDAGISel::splitLargeI + return Or.getNode(); + } + ++SDNode *SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) { ++ SDValue ElemV = N->getOperand(2); ++ auto *ElemN = dyn_cast(ElemV); ++ if (!ElemN) ++ return 0; ++ ++ unsigned Elem = ElemN->getZExtValue(); ++ EVT VT = N->getValueType(0); ++ if (Elem >= VT.getVectorNumElements()) ++ return 0; ++ ++ auto *Load = dyn_cast(N->getOperand(1)); ++ if (!Load || !Load->hasOneUse()) ++ return 0; ++ if (Load->getMemoryVT().getSizeInBits() != ++ Load->getValueType(0).getSizeInBits()) ++ return 0; ++ ++ SDValue Base, Disp, Index; ++ if (!selectBDVAddr12Only(Load->getBasePtr(), ElemV, Base, Disp, Index) || ++ Index.getValueType() != VT.changeVectorElementTypeToInteger()) ++ return 0; ++ ++ SDLoc DL(Load); ++ SDValue Ops[] = { ++ N->getOperand(0), Base, Disp, Index, ++ CurDAG->getTargetConstant(Elem, MVT::i32), Load->getChain() ++ }; ++ SDNode *Res = CurDAG->getMachineNode(Opcode, DL, VT, MVT::Other, Ops); ++ ReplaceUses(SDValue(Load, 1), SDValue(Res, 1)); ++ return Res; ++} ++ ++SDNode *SystemZDAGToDAGISel::tryScatter(StoreSDNode *Store, unsigned Opcode) { ++ SDValue Value = Store->getValue(); ++ if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT) ++ return 0; ++ if (Store->getMemoryVT().getSizeInBits() != ++ Value.getValueType().getSizeInBits()) ++ return 0; ++ ++ SDValue ElemV = Value.getOperand(1); ++ auto *ElemN = dyn_cast(ElemV); ++ if (!ElemN) ++ return 0; ++ ++ SDValue Vec = Value.getOperand(0); ++ EVT VT = Vec.getValueType(); ++ unsigned Elem = ElemN->getZExtValue(); ++ if (Elem >= VT.getVectorNumElements()) ++ return 0; ++ ++ SDValue Base, Disp, Index; ++ if (!selectBDVAddr12Only(Store->getBasePtr(), ElemV, Base, Disp, Index) || ++ Index.getValueType() != VT.changeVectorElementTypeToInteger()) ++ return 0; ++ ++ SDLoc DL(Store); ++ SDValue Ops[] = { ++ Vec, Base, Disp, Index, CurDAG->getTargetConstant(Elem, MVT::i32), ++ Store->getChain() ++ }; ++ return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops); ++} ++ + bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store, + LoadSDNode *Load) const { + // Check that the two memory operands have the same size. 
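To make the new selection paths concrete: tryGather fires when a vector element is inserted from a load whose address is indexed by the same element of another vector, and tryScatter handles the mirrored store pattern. A hypothetical source-level example follows (illustrative only; the function names are made up, the typedef uses the GCC/Clang vector extension, and whether the pattern actually matches still depends on the 12-bit displacement and element-type checks above):

  typedef unsigned int v4u32 __attribute__((vector_size(16)));

  // insert_vector_elt(load): element 1 of Off supplies the byte offset,
  // so this is a candidate for VGEF via tryGather.
  v4u32 gather_elem1(v4u32 Val, v4u32 Off, const char *Base) {
    Val[1] = *(const unsigned int *)(Base + Off[1]);
    return Val;
  }

  // store(extract_vector_elt): element 2 of Off supplies the byte offset,
  // so this is a candidate for VSCEF via tryScatter.
  void scatter_elem2(v4u32 Val, v4u32 Off, char *Base) {
    *(unsigned int *)(Base + Off[2]) = Val[2];
  }

Note how selectBDVAddr12Only peels a ZERO_EXTEND off the index operand: the 32-bit offset element is zero-extended for 64-bit addressing, which is exactly what the vector gather/scatter element instructions expect.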
+@@ -1109,6 +1218,26 @@ SDNode *SystemZDAGToDAGISel::Select(SDNo + } + break; + } ++ ++ case ISD::INSERT_VECTOR_ELT: { ++ EVT VT = Node->getValueType(0); ++ unsigned ElemBitSize = VT.getVectorElementType().getSizeInBits(); ++ if (ElemBitSize == 32) ++ ResNode = tryGather(Node, SystemZ::VGEF); ++ else if (ElemBitSize == 64) ++ ResNode = tryGather(Node, SystemZ::VGEG); ++ break; ++ } ++ ++ case ISD::STORE: { ++ auto *Store = cast(Node); ++ unsigned ElemBitSize = Store->getValue().getValueType().getSizeInBits(); ++ if (ElemBitSize == 32) ++ ResNode = tryScatter(Store, SystemZ::VSCEF); ++ else if (ElemBitSize == 64) ++ ResNode = tryScatter(Store, SystemZ::VSCEG); ++ break; ++ } + } + + // Select the default instruction +Index: llvm-36/lib/Target/SystemZ/SystemZISelLowering.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZISelLowering.cpp ++++ llvm-36/lib/Target/SystemZ/SystemZISelLowering.cpp +@@ -20,6 +20,7 @@ + #include "llvm/CodeGen/MachineInstrBuilder.h" + #include "llvm/CodeGen/MachineRegisterInfo.h" + #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" ++#include "llvm/IR/Intrinsics.h" + #include + + using namespace llvm; +@@ -90,11 +91,25 @@ SystemZTargetLowering::SystemZTargetLowe + addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass); + else + addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass); +- addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass); +- addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); +- addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); ++ addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass); ++ if (Subtarget.hasVector()) { ++ addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass); ++ addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass); ++ } else { ++ addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); ++ addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); ++ } + addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); + ++ if (Subtarget.hasVector()) { ++ addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass); ++ addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass); ++ addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass); ++ addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass); ++ addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass); ++ addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass); ++ } ++ + // Compute derived properties from the register classes + computeRegisterProperties(); + +@@ -110,7 +125,7 @@ SystemZTargetLowering::SystemZTargetLowe + setSchedulingPreference(Sched::RegPressure); + + setBooleanContents(ZeroOrOneBooleanContent); +- setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? ++ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + // Instructions are strings of 2-byte aligned 2-byte values. + setMinFunctionAlignment(2); +@@ -163,8 +178,13 @@ SystemZTargetLowering::SystemZTargetLowe + // available, or if the operand is constant. + setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); + ++ // Use POPCNT on z196 and above. ++ if (Subtarget.hasPopulationCount()) ++ setOperationAction(ISD::CTPOP, VT, Custom); ++ else ++ setOperationAction(ISD::CTPOP, VT, Expand); ++ + // No special instructions for these. 
+- setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); +@@ -244,6 +264,90 @@ SystemZTargetLowering::SystemZTargetLowe + // Handle prefetches with PFD or PFDRL. + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + ++ for (MVT VT : MVT::vector_valuetypes()) { ++ // Assume by default that all vector operations need to be expanded. ++ for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode) ++ if (getOperationAction(Opcode, VT) == Legal) ++ setOperationAction(Opcode, VT, Expand); ++ ++ // Likewise all truncating stores and extending loads. ++ for (MVT InnerVT : MVT::vector_valuetypes()) { ++ setTruncStoreAction(VT, InnerVT, Expand); ++ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); ++ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); ++ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); ++ } ++ ++ if (isTypeLegal(VT)) { ++ // These operations are legal for anything that can be stored in a ++ // vector register, even if there is no native support for the format ++ // as such. In particular, we can do these for v4f32 even though there ++ // are no specific instructions for that format. ++ setOperationAction(ISD::LOAD, VT, Legal); ++ setOperationAction(ISD::STORE, VT, Legal); ++ setOperationAction(ISD::VSELECT, VT, Legal); ++ setOperationAction(ISD::BITCAST, VT, Legal); ++ setOperationAction(ISD::UNDEF, VT, Legal); ++ ++ // Likewise, except that we need to replace the nodes with something ++ // more specific. ++ setOperationAction(ISD::BUILD_VECTOR, VT, Custom); ++ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); ++ } ++ } ++ ++ // Handle integer vector types. ++ for (MVT VT : MVT::integer_vector_valuetypes()) { ++ if (isTypeLegal(VT)) { ++ // These operations have direct equivalents. ++ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); ++ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal); ++ setOperationAction(ISD::ADD, VT, Legal); ++ setOperationAction(ISD::SUB, VT, Legal); ++ if (VT != MVT::v2i64) ++ setOperationAction(ISD::MUL, VT, Legal); ++ setOperationAction(ISD::AND, VT, Legal); ++ setOperationAction(ISD::OR, VT, Legal); ++ setOperationAction(ISD::XOR, VT, Legal); ++ setOperationAction(ISD::CTPOP, VT, Custom); ++ setOperationAction(ISD::CTTZ, VT, Legal); ++ setOperationAction(ISD::CTLZ, VT, Legal); ++ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); ++ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); ++ ++ // Convert a GPR scalar to a vector by inserting it into element 0. ++ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); ++ ++ // Use a series of unpacks for extensions. ++ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); ++ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); ++ ++ // Detect shifts by a scalar amount and convert them into ++ // V*_BY_SCALAR. ++ setOperationAction(ISD::SHL, VT, Custom); ++ setOperationAction(ISD::SRA, VT, Custom); ++ setOperationAction(ISD::SRL, VT, Custom); ++ ++ // At present ROTL isn't matched by DAGCombiner. ROTR should be ++ // converted into ROTL. ++ setOperationAction(ISD::ROTL, VT, Expand); ++ setOperationAction(ISD::ROTR, VT, Expand); ++ ++ // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands ++ // and inverting the result as necessary. 
++ setOperationAction(ISD::SETCC, VT, Custom); ++ } ++ } ++ ++ if (Subtarget.hasVector()) { ++ // There should be no need to check for float types other than v2f64 ++ // since <2 x f32> isn't a legal type. ++ setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); ++ setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); ++ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); ++ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); ++ } ++ + // Handle floating-point types. + for (unsigned I = MVT::FIRST_FP_VALUETYPE; + I <= MVT::LAST_FP_VALUETYPE; +@@ -269,6 +373,36 @@ SystemZTargetLowering::SystemZTargetLowe + } + } + ++ // Handle floating-point vector types. ++ if (Subtarget.hasVector()) { ++ // Scalar-to-vector conversion is just a subreg. ++ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); ++ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); ++ ++ // Some insertions and extractions can be done directly but others ++ // need to go via integers. ++ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); ++ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); ++ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); ++ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); ++ ++ // These operations have direct equivalents. ++ setOperationAction(ISD::FADD, MVT::v2f64, Legal); ++ setOperationAction(ISD::FNEG, MVT::v2f64, Legal); ++ setOperationAction(ISD::FSUB, MVT::v2f64, Legal); ++ setOperationAction(ISD::FMUL, MVT::v2f64, Legal); ++ setOperationAction(ISD::FMA, MVT::v2f64, Legal); ++ setOperationAction(ISD::FDIV, MVT::v2f64, Legal); ++ setOperationAction(ISD::FABS, MVT::v2f64, Legal); ++ setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); ++ setOperationAction(ISD::FRINT, MVT::v2f64, Legal); ++ setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); ++ setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); ++ setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); ++ setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); ++ setOperationAction(ISD::FROUND, MVT::v2f64, Legal); ++ } ++ + // We have fused multiply-addition for f32 and f64 but not f128. + setOperationAction(ISD::FMA, MVT::f32, Legal); + setOperationAction(ISD::FMA, MVT::f64, Legal); +@@ -287,8 +421,10 @@ SystemZTargetLowering::SystemZTargetLowe + + // We have 64-bit FPR<->GPR moves, but need special handling for + // 32-bit forms. +- setOperationAction(ISD::BITCAST, MVT::i32, Custom); +- setOperationAction(ISD::BITCAST, MVT::f32, Custom); ++ if (!Subtarget.hasVector()) { ++ setOperationAction(ISD::BITCAST, MVT::i32, Custom); ++ setOperationAction(ISD::BITCAST, MVT::f32, Custom); ++ } + + // VASTART and VACOPY need to deal with the SystemZ-specific varargs + // structure, but VAEND is a no-op. +@@ -298,6 +434,13 @@ SystemZTargetLowering::SystemZTargetLowe + + // Codes for which we want to perform some z-specific combinations. + setTargetDAGCombine(ISD::SIGN_EXTEND); ++ setTargetDAGCombine(ISD::STORE); ++ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); ++ setTargetDAGCombine(ISD::FP_ROUND); ++ ++ // Handle intrinsics. ++ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); ++ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + // We want to use MVC in preference to even a single load/store pair. + MaxStoresPerMemcpy = 0; +@@ -342,6 +485,16 @@ bool SystemZTargetLowering::isFPImmLegal + return Imm.isZero() || Imm.isNegZero(); + } + ++bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const { ++ // We can use CGFI or CLGFI. 
++ return isInt<32>(Imm) || isUInt<32>(Imm); ++} ++ ++bool SystemZTargetLowering::isLegalAddImmediate(int64_t Imm) const { ++ // We can use ALGFI or SLGFI. ++ return isUInt<32>(Imm) || isUInt<32>(-Imm); ++} ++ + bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned, + unsigned, +@@ -623,6 +776,24 @@ bool SystemZTargetLowering::mayBeEmitted + return true; + } + ++// We do not yet support 128-bit single-element vector types. If the user ++// attempts to use such types as function argument or return type, prefer ++// to error out instead of emitting code violating the ABI. ++static void VerifyVectorType(MVT VT, EVT ArgVT) { ++ if (ArgVT.isVector() && !VT.isVector()) ++ report_fatal_error("Unsupported vector argument or return type"); ++} ++ ++static void VerifyVectorTypes(const SmallVectorImpl &Ins) { ++ for (unsigned i = 0; i < Ins.size(); ++i) ++ VerifyVectorType(Ins[i].VT, Ins[i].ArgVT); ++} ++ ++static void VerifyVectorTypes(const SmallVectorImpl &Outs) { ++ for (unsigned i = 0; i < Outs.size(); ++i) ++ VerifyVectorType(Outs[i].VT, Outs[i].ArgVT); ++} ++ + // Value is a value that has been passed to us in the location described by VA + // (and so has type VA.getLocVT()). Convert Value to VA.getValVT(), chaining + // any loads onto Chain. +@@ -643,7 +814,15 @@ static SDValue convertLocVTToValVT(Selec + else if (VA.getLocInfo() == CCValAssign::Indirect) + Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value, + MachinePointerInfo(), false, false, false, 0); +- else ++ else if (VA.getLocInfo() == CCValAssign::BCvt) { ++ // If this is a short vector argument loaded from the stack, ++ // extend from i64 to full vector size and then bitcast. ++ assert(VA.getLocVT() == MVT::i64); ++ assert(VA.getValVT().isVector()); ++ Value = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64, ++ Value, DAG.getUNDEF(MVT::i64)); ++ Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value); ++ } else + assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo"); + return Value; + } +@@ -660,6 +839,14 @@ static SDValue convertValVTToLocVT(Selec + return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value); + case CCValAssign::AExt: + return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value); ++ case CCValAssign::BCvt: ++ // If this is a short vector argument to be stored to the stack, ++ // bitcast to v2i64 and then extract first element. ++ assert(VA.getLocVT() == MVT::i64); ++ assert(VA.getValVT().isVector()); ++ Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value); ++ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value, ++ DAG.getConstant(0, MVT::i32)); + case CCValAssign::Full: + return Value; + default: +@@ -680,9 +867,13 @@ LowerFormalArguments(SDValue Chain, Call + auto *TFL = static_cast( + DAG.getSubtarget().getFrameLowering()); + ++ // Detect unsupported vector argument types. ++ if (Subtarget.hasVector()) ++ VerifyVectorTypes(Ins); ++ + // Assign locations to all of the incoming arguments. 
+ SmallVector ArgLocs; +- CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); ++ SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ); + + unsigned NumFixedGPRs = 0; +@@ -714,6 +905,14 @@ LowerFormalArguments(SDValue Chain, Call + NumFixedFPRs += 1; + RC = &SystemZ::FP64BitRegClass; + break; ++ case MVT::v16i8: ++ case MVT::v8i16: ++ case MVT::v4i32: ++ case MVT::v2i64: ++ case MVT::v4f32: ++ case MVT::v2f64: ++ RC = &SystemZ::VR128BitRegClass; ++ break; + } + + unsigned VReg = MRI.createVirtualRegister(RC); +@@ -818,9 +1017,15 @@ SystemZTargetLowering::LowerCall(CallLow + MachineFunction &MF = DAG.getMachineFunction(); + EVT PtrVT = getPointerTy(); + ++ // Detect unsupported vector argument and return types. ++ if (Subtarget.hasVector()) { ++ VerifyVectorTypes(Outs); ++ VerifyVectorTypes(Ins); ++ } ++ + // Analyze the operands of the call, assigning locations to each operand. + SmallVector ArgLocs; +- CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); ++ SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ); + + // We don't support GuaranteedTailCallOpt, only automatically-detected +@@ -972,6 +1177,10 @@ SystemZTargetLowering::LowerReturn(SDVal + SDLoc DL, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + ++ // Detect unsupported vector return types. ++ if (Subtarget.hasVector()) ++ VerifyVectorTypes(Outs); ++ + // Assign locations to each returned value. + SmallVector RetLocs; + CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext()); +@@ -1015,6 +1224,207 @@ prepareVolatileOrAtomicLoad(SDValue Chai + return DAG.getNode(SystemZISD::SERIALIZE, DL, MVT::Other, Chain); + } + ++// Return true if Op is an intrinsic node with chain that returns the CC value ++// as its only (other) argument. Provide the associated SystemZISD opcode and ++// the mask of valid CC values if so. ++static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode, ++ unsigned &CCValid) { ++ unsigned Id = cast(Op.getOperand(1))->getZExtValue(); ++ switch (Id) { ++ case Intrinsic::s390_tbegin: ++ Opcode = SystemZISD::TBEGIN; ++ CCValid = SystemZ::CCMASK_TBEGIN; ++ return true; ++ ++ case Intrinsic::s390_tbegin_nofloat: ++ Opcode = SystemZISD::TBEGIN_NOFLOAT; ++ CCValid = SystemZ::CCMASK_TBEGIN; ++ return true; ++ ++ case Intrinsic::s390_tend: ++ Opcode = SystemZISD::TEND; ++ CCValid = SystemZ::CCMASK_TEND; ++ return true; ++ ++ default: ++ return false; ++ } ++} ++ ++// Return true if Op is an intrinsic node without chain that returns the ++// CC value as its final argument. Provide the associated SystemZISD ++// opcode and the mask of valid CC values if so. 
++static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) { ++ unsigned Id = cast(Op.getOperand(0))->getZExtValue(); ++ switch (Id) { ++ case Intrinsic::s390_vpkshs: ++ case Intrinsic::s390_vpksfs: ++ case Intrinsic::s390_vpksgs: ++ Opcode = SystemZISD::PACKS_CC; ++ CCValid = SystemZ::CCMASK_VCMP; ++ return true; ++ ++ case Intrinsic::s390_vpklshs: ++ case Intrinsic::s390_vpklsfs: ++ case Intrinsic::s390_vpklsgs: ++ Opcode = SystemZISD::PACKLS_CC; ++ CCValid = SystemZ::CCMASK_VCMP; ++ return true; ++ ++ case Intrinsic::s390_vceqbs: ++ case Intrinsic::s390_vceqhs: ++ case Intrinsic::s390_vceqfs: ++ case Intrinsic::s390_vceqgs: ++ Opcode = SystemZISD::VICMPES; ++ CCValid = SystemZ::CCMASK_VCMP; ++ return true; ++ ++ case Intrinsic::s390_vchbs: ++ case Intrinsic::s390_vchhs: ++ case Intrinsic::s390_vchfs: ++ case Intrinsic::s390_vchgs: ++ Opcode = SystemZISD::VICMPHS; ++ CCValid = SystemZ::CCMASK_VCMP; ++ return true; ++ ++ case Intrinsic::s390_vchlbs: ++ case Intrinsic::s390_vchlhs: ++ case Intrinsic::s390_vchlfs: ++ case Intrinsic::s390_vchlgs: ++ Opcode = SystemZISD::VICMPHLS; ++ CCValid = SystemZ::CCMASK_VCMP; ++ return true; ++ ++ case Intrinsic::s390_vtm: ++ Opcode = SystemZISD::VTM; ++ CCValid = SystemZ::CCMASK_VCMP; ++ return true; ++ ++ case Intrinsic::s390_vfaebs: ++ case Intrinsic::s390_vfaehs: ++ case Intrinsic::s390_vfaefs: ++ Opcode = SystemZISD::VFAE_CC; ++ CCValid = SystemZ::CCMASK_ANY; ++ return true; ++ ++ case Intrinsic::s390_vfaezbs: ++ case Intrinsic::s390_vfaezhs: ++ case Intrinsic::s390_vfaezfs: ++ Opcode = SystemZISD::VFAEZ_CC; ++ CCValid = SystemZ::CCMASK_ANY; ++ return true; ++ ++ case Intrinsic::s390_vfeebs: ++ case Intrinsic::s390_vfeehs: ++ case Intrinsic::s390_vfeefs: ++ Opcode = SystemZISD::VFEE_CC; ++ CCValid = SystemZ::CCMASK_ANY; ++ return true; ++ ++ case Intrinsic::s390_vfeezbs: ++ case Intrinsic::s390_vfeezhs: ++ case Intrinsic::s390_vfeezfs: ++ Opcode = SystemZISD::VFEEZ_CC; ++ CCValid = SystemZ::CCMASK_ANY; ++ return true; ++ ++ case Intrinsic::s390_vfenebs: ++ case Intrinsic::s390_vfenehs: ++ case Intrinsic::s390_vfenefs: ++ Opcode = SystemZISD::VFENE_CC; ++ CCValid = SystemZ::CCMASK_ANY; ++ return true; ++ ++ case Intrinsic::s390_vfenezbs: ++ case Intrinsic::s390_vfenezhs: ++ case Intrinsic::s390_vfenezfs: ++ Opcode = SystemZISD::VFENEZ_CC; ++ CCValid = SystemZ::CCMASK_ANY; ++ return true; ++ ++ case Intrinsic::s390_vistrbs: ++ case Intrinsic::s390_vistrhs: ++ case Intrinsic::s390_vistrfs: ++ Opcode = SystemZISD::VISTR_CC; ++ CCValid = SystemZ::CCMASK_0 | SystemZ::CCMASK_3; ++ return true; ++ ++ case Intrinsic::s390_vstrcbs: ++ case Intrinsic::s390_vstrchs: ++ case Intrinsic::s390_vstrcfs: ++ Opcode = SystemZISD::VSTRC_CC; ++ CCValid = SystemZ::CCMASK_ANY; ++ return true; ++ ++ case Intrinsic::s390_vstrczbs: ++ case Intrinsic::s390_vstrczhs: ++ case Intrinsic::s390_vstrczfs: ++ Opcode = SystemZISD::VSTRCZ_CC; ++ CCValid = SystemZ::CCMASK_ANY; ++ return true; ++ ++ case Intrinsic::s390_vfcedbs: ++ Opcode = SystemZISD::VFCMPES; ++ CCValid = SystemZ::CCMASK_VCMP; ++ return true; ++ ++ case Intrinsic::s390_vfchdbs: ++ Opcode = SystemZISD::VFCMPHS; ++ CCValid = SystemZ::CCMASK_VCMP; ++ return true; ++ ++ case Intrinsic::s390_vfchedbs: ++ Opcode = SystemZISD::VFCMPHES; ++ CCValid = SystemZ::CCMASK_VCMP; ++ return true; ++ ++ case Intrinsic::s390_vftcidb: ++ Opcode = SystemZISD::VFTCI; ++ CCValid = SystemZ::CCMASK_VCMP; ++ return true; ++ ++ default: ++ return false; ++ } ++} ++ ++// Emit an intrinsic with chain with a glued value 
instead of its CC result. ++static SDValue emitIntrinsicWithChainAndGlue(SelectionDAG &DAG, SDValue Op, ++ unsigned Opcode) { ++ // Copy all operands except the intrinsic ID. ++ unsigned NumOps = Op.getNumOperands(); ++ SmallVector Ops; ++ Ops.reserve(NumOps - 1); ++ Ops.push_back(Op.getOperand(0)); ++ for (unsigned I = 2; I < NumOps; ++I) ++ Ops.push_back(Op.getOperand(I)); ++ ++ assert(Op->getNumValues() == 2 && "Expected only CC result and chain"); ++ SDVTList RawVTs = DAG.getVTList(MVT::Other, MVT::Glue); ++ SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops); ++ SDValue OldChain = SDValue(Op.getNode(), 1); ++ SDValue NewChain = SDValue(Intr.getNode(), 0); ++ DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain); ++ return Intr; ++} ++ ++// Emit an intrinsic with a glued value instead of its CC result. ++static SDValue emitIntrinsicWithGlue(SelectionDAG &DAG, SDValue Op, ++ unsigned Opcode) { ++ // Copy all operands except the intrinsic ID. ++ unsigned NumOps = Op.getNumOperands(); ++ SmallVector Ops; ++ Ops.reserve(NumOps - 1); ++ for (unsigned I = 1; I < NumOps; ++I) ++ Ops.push_back(Op.getOperand(I)); ++ ++ if (Op->getNumValues() == 1) ++ return DAG.getNode(Opcode, SDLoc(Op), MVT::Glue, Ops); ++ assert(Op->getNumValues() == 2 && "Expected exactly one non-CC result"); ++ SDVTList RawVTs = DAG.getVTList(Op->getValueType(0), MVT::Glue); ++ return DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops); ++} ++ + // CC is a comparison that will be implemented using an integer or + // floating-point comparison. Return the condition code mask for + // a branch on true. In the integer case, CCMASK_CMP_UO is set for +@@ -1529,6 +1939,8 @@ static void adjustForTestUnderMask(Selec + MaskVal = -(CmpVal & -CmpVal); + NewC.ICmpType = SystemZICMP::UnsignedOnly; + } ++ if (!MaskVal) ++ return; + + // Check whether the combination of mask, comparison value and comparison + // type are suitable. +@@ -1570,9 +1982,57 @@ static void adjustForTestUnderMask(Selec + C.CCMask = NewCCMask; + } + ++// Return a Comparison that tests the condition-code result of intrinsic ++// node Call against constant integer CC using comparison code Cond. ++// Opcode is the opcode of the SystemZISD operation for the intrinsic ++// and CCValid is the set of possible condition-code results. ++static Comparison getIntrinsicCmp(SelectionDAG &DAG, unsigned Opcode, ++ SDValue Call, unsigned CCValid, uint64_t CC, ++ ISD::CondCode Cond) { ++ Comparison C(Call, SDValue()); ++ C.Opcode = Opcode; ++ C.CCValid = CCValid; ++ if (Cond == ISD::SETEQ) ++ // bit 3 for CC==0, bit 0 for CC==3, always false for CC>3. ++ C.CCMask = CC < 4 ? 1 << (3 - CC) : 0; ++ else if (Cond == ISD::SETNE) ++ // ...and the inverse of that. ++ C.CCMask = CC < 4 ? ~(1 << (3 - CC)) : -1; ++ else if (Cond == ISD::SETLT || Cond == ISD::SETULT) ++ // bits above bit 3 for CC==0 (always false), bits above bit 0 for CC==3, ++ // always true for CC>3. ++ C.CCMask = CC < 4 ? -1 << (4 - CC) : -1; ++ else if (Cond == ISD::SETGE || Cond == ISD::SETUGE) ++ // ...and the inverse of that. ++ C.CCMask = CC < 4 ? ~(-1 << (4 - CC)) : 0; ++ else if (Cond == ISD::SETLE || Cond == ISD::SETULE) ++ // bit 3 and above for CC==0, bit 0 and above for CC==3 (always true), ++ // always true for CC>3. ++ C.CCMask = CC < 4 ? -1 << (3 - CC) : -1; ++ else if (Cond == ISD::SETGT || Cond == ISD::SETUGT) ++ // ...and the inverse of that. ++ C.CCMask = CC < 4 ? 
~(-1 << (3 - CC)) : 0;
++  else
++    llvm_unreachable("Unexpected integer comparison type");
++  C.CCMask &= CCValid;
++  return C;
++}
++
+ // Decide how to implement a comparison of type Cond between CmpOp0 with CmpOp1.
+ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
+                          ISD::CondCode Cond) {
++  if (CmpOp1.getOpcode() == ISD::Constant) {
++    uint64_t Constant = cast<ConstantSDNode>(CmpOp1)->getZExtValue();
++    unsigned Opcode, CCValid;
++    if (CmpOp0.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
++        CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&
++        isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid))
++      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
++    if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
++        CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&
++        isIntrinsicWithCC(CmpOp0, Opcode, CCValid))
++      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
++  }
+   Comparison C(CmpOp0, CmpOp1);
+   C.CCMask = CCMaskForCondCode(Cond);
+   if (C.Op0.getValueType().isFloatingPoint()) {
+@@ -1614,6 +2074,20 @@ static Comparison getCmp(SelectionDAG &D
+ 
+ // Emit the comparison instruction described by C.
+ static SDValue emitCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) {
++  if (!C.Op1.getNode()) {
++    SDValue Op;
++    switch (C.Op0.getOpcode()) {
++    case ISD::INTRINSIC_W_CHAIN:
++      Op = emitIntrinsicWithChainAndGlue(DAG, C.Op0, C.Opcode);
++      break;
++    case ISD::INTRINSIC_WO_CHAIN:
++      Op = emitIntrinsicWithGlue(DAG, C.Op0, C.Opcode);
++      break;
++    default:
++      llvm_unreachable("Invalid comparison operands");
++    }
++    return SDValue(Op.getNode(), Op->getNumValues() - 1);
++  }
+   if (C.Opcode == SystemZISD::ICMP)
+     return DAG.getNode(SystemZISD::ICMP, DL, MVT::Glue, C.Op0, C.Op1,
+                        DAG.getConstant(C.ICmpType, MVT::i32));
+@@ -1682,12 +2156,142 @@ static SDValue emitSETCC(SelectionDAG &D
+   return Result;
+ }
+ 
++// Return the SystemZISD vector comparison operation for CC, or 0 if it cannot
++// be done directly. IsFP is true if CC is for a floating-point rather than
++// integer comparison.
++static unsigned getVectorComparison(ISD::CondCode CC, bool IsFP) {
++  switch (CC) {
++  case ISD::SETOEQ:
++  case ISD::SETEQ:
++    return IsFP ? SystemZISD::VFCMPE : SystemZISD::VICMPE;
++
++  case ISD::SETOGE:
++  case ISD::SETGE:
++    return IsFP ? SystemZISD::VFCMPHE : 0;
++
++  case ISD::SETOGT:
++  case ISD::SETGT:
++    return IsFP ? SystemZISD::VFCMPH : SystemZISD::VICMPH;
++
++  case ISD::SETUGT:
++    return IsFP ? 0 : SystemZISD::VICMPHL;
++
++  default:
++    return 0;
++  }
++}
++
++// Return the SystemZISD vector comparison operation for CC or its inverse,
++// or 0 if neither can be done directly. Indicate in Invert whether the
++// result is for the inverse of CC. IsFP is true if CC is for a
++// floating-point rather than integer comparison.
++static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, bool IsFP,
++                                            bool &Invert) {
++  if (unsigned Opcode = getVectorComparison(CC, IsFP)) {
++    Invert = false;
++    return Opcode;
++  }
++
++  CC = ISD::getSetCCInverse(CC, !IsFP);
++  if (unsigned Opcode = getVectorComparison(CC, IsFP)) {
++    Invert = true;
++    return Opcode;
++  }
++
++  return 0;
++}
++
++// Return a v2f64 that contains the extended form of elements Start and Start+1
++// of v4f32 value Op.
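
The CC-mask arithmetic in getIntrinsicCmp above packs the four possible
condition codes into a 4-bit mask, with bit 3 standing for CC==0 and bit 0
for CC==3. A minimal, compilable model of the SETEQ and SETLT cases (the
function names are illustrative; the constant 0xF plays the role of CCValid):

#include <cassert>

unsigned maskForCCEqual(unsigned CC) {    // ISD::SETEQ case
  return CC < 4 ? 1u << (3 - CC) : 0;
}
unsigned maskForCCLess(unsigned CC) {     // ISD::SETLT / SETULT case
  return CC < 4 ? (0xFu << (4 - CC)) & 0xFu : 0xFu;
}

int main() {
  assert(maskForCCEqual(0) == 0x8);  // only CC==0 satisfies "CC == 0"
  assert(maskForCCEqual(3) == 0x1);  // only CC==3 satisfies "CC == 3"
  assert(maskForCCLess(2) == 0xC);   // CC==0 or CC==1 satisfy "CC < 2"
  assert(maskForCCLess(0) == 0x0);   // "CC < 0" is always false
  return 0;
}
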
++static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, SDLoc DL, ++ SDValue Op) { ++ int Mask[] = { Start, -1, Start + 1, -1 }; ++ Op = DAG.getVectorShuffle(MVT::v4f32, DL, Op, DAG.getUNDEF(MVT::v4f32), Mask); ++ return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Op); ++} ++ ++// Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode, ++// producing a result of type VT. ++static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, SDLoc DL, ++ EVT VT, SDValue CmpOp0, SDValue CmpOp1) { ++ // There is no hardware support for v4f32, so extend the vector into ++ // two v2f64s and compare those. ++ if (CmpOp0.getValueType() == MVT::v4f32) { ++ SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0); ++ SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0); ++ SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1); ++ SDValue L1 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1); ++ SDValue HRes = DAG.getNode(Opcode, DL, MVT::v2i64, H0, H1); ++ SDValue LRes = DAG.getNode(Opcode, DL, MVT::v2i64, L0, L1); ++ return DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes); ++ } ++ return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1); ++} ++ ++// Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing ++// an integer mask of type VT. ++static SDValue lowerVectorSETCC(SelectionDAG &DAG, SDLoc DL, EVT VT, ++ ISD::CondCode CC, SDValue CmpOp0, ++ SDValue CmpOp1) { ++ bool IsFP = CmpOp0.getValueType().isFloatingPoint(); ++ bool Invert = false; ++ SDValue Cmp; ++ switch (CC) { ++ // Handle tests for order using (or (ogt y x) (oge x y)). ++ case ISD::SETUO: ++ Invert = true; ++ case ISD::SETO: { ++ assert(IsFP && "Unexpected integer comparison"); ++ SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0); ++ SDValue GE = getVectorCmp(DAG, SystemZISD::VFCMPHE, DL, VT, CmpOp0, CmpOp1); ++ Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE); ++ break; ++ } ++ ++ // Handle <> tests using (or (ogt y x) (ogt x y)). ++ case ISD::SETUEQ: ++ Invert = true; ++ case ISD::SETONE: { ++ assert(IsFP && "Unexpected integer comparison"); ++ SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0); ++ SDValue GT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp0, CmpOp1); ++ Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT); ++ break; ++ } ++ ++ // Otherwise a single comparison is enough. It doesn't really ++ // matter whether we try the inversion or the swap first, since ++ // there are no cases where both work. 
++ default: ++ if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert)) ++ Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1); ++ else { ++ CC = ISD::getSetCCSwappedOperands(CC); ++ if (unsigned Opcode = getVectorComparisonOrInvert(CC, IsFP, Invert)) ++ Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0); ++ else ++ llvm_unreachable("Unhandled comparison"); ++ } ++ break; ++ } ++ if (Invert) { ++ SDValue Mask = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, ++ DAG.getConstant(65535, MVT::i32)); ++ Mask = DAG.getNode(ISD::BITCAST, DL, VT, Mask); ++ Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask); ++ } ++ return Cmp; ++} ++ + SDValue SystemZTargetLowering::lowerSETCC(SDValue Op, + SelectionDAG &DAG) const { + SDValue CmpOp0 = Op.getOperand(0); + SDValue CmpOp1 = Op.getOperand(1); + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + SDLoc DL(Op); ++ EVT VT = Op.getValueType(); ++ if (VT.isVector()) ++ return lowerVectorSETCC(DAG, DL, VT, CC, CmpOp0, CmpOp1); + + Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC)); + SDValue Glue = emitCmp(DAG, DL, C); +@@ -1695,7 +2299,6 @@ SDValue SystemZTargetLowering::lowerSETC + } + + SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const { +- SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast(Op.getOperand(1))->get(); + SDValue CmpOp0 = Op.getOperand(2); + SDValue CmpOp1 = Op.getOperand(3); +@@ -1705,7 +2308,7 @@ SDValue SystemZTargetLowering::lowerBR_C + Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC)); + SDValue Glue = emitCmp(DAG, DL, C); + return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(), +- Chain, DAG.getConstant(C.CCValid, MVT::i32), ++ Op.getOperand(0), DAG.getConstant(C.CCValid, MVT::i32), + DAG.getConstant(C.CCMask, MVT::i32), Dest, Glue); + } + +@@ -1831,6 +2434,52 @@ SDValue SystemZTargetLowering::lowerGlob + return Result; + } + ++SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node, ++ SelectionDAG &DAG, ++ unsigned Opcode, ++ SDValue GOTOffset) const { ++ SDLoc DL(Node); ++ EVT PtrVT = getPointerTy(); ++ SDValue Chain = DAG.getEntryNode(); ++ SDValue Glue; ++ ++ // __tls_get_offset takes the GOT offset in %r2 and the GOT in %r12. ++ SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); ++ Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R12D, GOT, Glue); ++ Glue = Chain.getValue(1); ++ Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R2D, GOTOffset, Glue); ++ Glue = Chain.getValue(1); ++ ++ // The first call operand is the chain and the second is the TLS symbol. ++ SmallVector Ops; ++ Ops.push_back(Chain); ++ Ops.push_back(DAG.getTargetGlobalAddress(Node->getGlobal(), DL, ++ Node->getValueType(0), ++ 0, 0)); ++ ++ // Add argument registers to the end of the list so that they are ++ // known live into the call. ++ Ops.push_back(DAG.getRegister(SystemZ::R2D, PtrVT)); ++ Ops.push_back(DAG.getRegister(SystemZ::R12D, PtrVT)); ++ ++ // Add a register mask operand representing the call-preserved registers. ++ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); ++ const uint32_t *Mask = TRI->getCallPreservedMask(CallingConv::C); ++ assert(Mask && "Missing call preserved mask for calling convention"); ++ Ops.push_back(DAG.getRegisterMask(Mask)); ++ ++ // Glue the call to the argument copies. ++ Ops.push_back(Glue); ++ ++ // Emit the call. ++ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); ++ Chain = DAG.getNode(Opcode, DL, NodeTys, Ops); ++ Glue = Chain.getValue(1); ++ ++ // Copy the return value from %r2. 
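
For orientation, the call being assembled in lowerTLSGetOffset follows the
s390x TLS ABI: the GOT offset of the tls_index goes in %r2, the GOT base in
%r12, and the value returned in %r2 is the variable's offset from the thread
pointer. Reduced to C++ data flow (the extern declaration reflects the
documented ABI prototype; the wrapper name is illustrative):

// Provided by glibc; %r12 must hold the GOT base when this is called.
extern "C" unsigned long __tls_get_offset(unsigned long GotOffset);

// The dynamic TLS models then compute: address = thread pointer + offset.
unsigned long tlsAddress(unsigned long TP, unsigned long GotOffset) {
  return TP + __tls_get_offset(GotOffset);
}
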
++ return DAG.getCopyFromReg(Chain, DL, SystemZ::R2D, PtrVT, Glue); ++} ++ + SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, + SelectionDAG &DAG) const { + SDLoc DL(Node); +@@ -1838,9 +2487,6 @@ SDValue SystemZTargetLowering::lowerGlob + EVT PtrVT = getPointerTy(); + TLSModel::Model model = DAG.getTarget().getTLSModel(GV); + +- if (model != TLSModel::LocalExec) +- llvm_unreachable("only local-exec TLS mode supported"); +- + // The high part of the thread pointer is in access register 0. + SDValue TPHi = DAG.getNode(SystemZISD::EXTRACT_ACCESS, DL, MVT::i32, + DAG.getConstant(0, MVT::i32)); +@@ -1856,15 +2502,79 @@ SDValue SystemZTargetLowering::lowerGlob + DAG.getConstant(32, PtrVT)); + SDValue TP = DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo); + +- // Get the offset of GA from the thread pointer. +- SystemZConstantPoolValue *CPV = +- SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF); +- +- // Force the offset into the constant pool and load it from there. +- SDValue CPAddr = DAG.getConstantPool(CPV, PtrVT, 8); +- SDValue Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), +- CPAddr, MachinePointerInfo::getConstantPool(), +- false, false, false, 0); ++ // Get the offset of GA from the thread pointer, based on the TLS model. ++ SDValue Offset; ++ switch (model) { ++ case TLSModel::GeneralDynamic: { ++ // Load the GOT offset of the tls_index (module ID / per-symbol offset). ++ SystemZConstantPoolValue *CPV = ++ SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD); ++ ++ Offset = DAG.getConstantPool(CPV, PtrVT, 8); ++ Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), ++ Offset, MachinePointerInfo::getConstantPool(), ++ false, false, false, 0); ++ ++ // Call __tls_get_offset to retrieve the offset. ++ Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset); ++ break; ++ } ++ ++ case TLSModel::LocalDynamic: { ++ // Load the GOT offset of the module ID. ++ SystemZConstantPoolValue *CPV = ++ SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM); ++ ++ Offset = DAG.getConstantPool(CPV, PtrVT, 8); ++ Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), ++ Offset, MachinePointerInfo::getConstantPool(), ++ false, false, false, 0); ++ ++ // Call __tls_get_offset to retrieve the module base offset. ++ Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset); ++ ++ // Note: The SystemZLDCleanupPass will remove redundant computations ++ // of the module base offset. Count total number of local-dynamic ++ // accesses to trigger execution of that pass. ++ SystemZMachineFunctionInfo* MFI = ++ DAG.getMachineFunction().getInfo(); ++ MFI->incNumLocalDynamicTLSAccesses(); ++ ++ // Add the per-symbol offset. ++ CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF); ++ ++ SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8); ++ DTPOffset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), ++ DTPOffset, MachinePointerInfo::getConstantPool(), ++ false, false, false, 0); ++ ++ Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset); ++ break; ++ } ++ ++ case TLSModel::InitialExec: { ++ // Load the offset from the GOT. ++ Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ++ SystemZII::MO_INDNTPOFF); ++ Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset); ++ Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), ++ Offset, MachinePointerInfo::getGOT(), ++ false, false, false, 0); ++ break; ++ } ++ ++ case TLSModel::LocalExec: { ++ // Force the offset into the constant pool and load it from there. 
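
In the local-exec case handled here, the variable's offset from the thread
pointer (NTPOFF) is a constant resolved at link time, so the whole address
computation reduces to the access-register reassembly at the top of the
function plus one addition. A stand-alone sketch (function name illustrative):

#include <cstdint>

// Reassemble the thread pointer from access registers a0 (high half) and
// a1 (low half), as the EXTRACT_ACCESS/SHL/OR sequence above does, then
// add the constant offset loaded from the constant pool.
uint64_t localExecAddress(uint32_t A0, uint32_t A1, uint64_t NTPOff) {
  uint64_t TP = (uint64_t(A0) << 32) | A1;
  return TP + NTPOff;
}
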
++ SystemZConstantPoolValue *CPV = ++ SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF); ++ ++ Offset = DAG.getConstantPool(CPV, PtrVT, 8); ++ Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), ++ Offset, MachinePointerInfo::getConstantPool(), ++ false, false, false, 0); ++ break; ++ } ++ } + + // Add the base and offset together. + return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset); +@@ -1916,6 +2626,13 @@ SDValue SystemZTargetLowering::lowerBITC + EVT InVT = In.getValueType(); + EVT ResVT = Op.getValueType(); + ++ // Convert loads directly. This is normally done by DAGCombiner, ++ // but we need this case for bitcasts that are created during lowering ++ // and which are then lowered themselves. ++ if (auto *LoadN = dyn_cast(In)) ++ return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(), ++ LoadN->getMemOperand()); ++ + if (InVT == MVT::i32 && ResVT == MVT::f32) { + SDValue In64; + if (Subtarget.hasHighWord()) { +@@ -1929,12 +2646,12 @@ SDValue SystemZTargetLowering::lowerBITC + DAG.getConstant(32, MVT::i64)); + } + SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, In64); +- return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, ++ return DAG.getTargetExtractSubreg(SystemZ::subreg_r32, + DL, MVT::f32, Out64); + } + if (InVT == MVT::f32 && ResVT == MVT::i32) { + SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64); +- SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL, ++ SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_r32, DL, + MVT::f64, SDValue(U64, 0), In); + SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, In64); + if (Subtarget.hasHighWord()) +@@ -2187,6 +2904,80 @@ SDValue SystemZTargetLowering::lowerOR(S + MVT::i64, HighOp, Low32); + } + ++SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op, ++ SelectionDAG &DAG) const { ++ EVT VT = Op.getValueType(); ++ SDLoc DL(Op); ++ Op = Op.getOperand(0); ++ ++ // Handle vector types via VPOPCT. ++ if (VT.isVector()) { ++ Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Op); ++ Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::v16i8, Op); ++ switch (VT.getVectorElementType().getSizeInBits()) { ++ case 8: ++ break; ++ case 16: { ++ Op = DAG.getNode(ISD::BITCAST, DL, VT, Op); ++ SDValue Shift = DAG.getConstant(8, MVT::i32); ++ SDValue Tmp = DAG.getNode(SystemZISD::VSHL_BY_SCALAR, DL, VT, Op, Shift); ++ Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp); ++ Op = DAG.getNode(SystemZISD::VSRL_BY_SCALAR, DL, VT, Op, Shift); ++ break; ++ } ++ case 32: { ++ SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, ++ DAG.getConstant(0, MVT::i32)); ++ Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp); ++ break; ++ } ++ case 64: { ++ SDValue Tmp = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, ++ DAG.getConstant(0, MVT::i32)); ++ Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp); ++ Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp); ++ break; ++ } ++ default: ++ llvm_unreachable("Unexpected type"); ++ } ++ return Op; ++ } ++ ++ // Get the known-zero mask for the operand. ++ APInt KnownZero, KnownOne; ++ DAG.computeKnownBits(Op, KnownZero, KnownOne); ++ unsigned NumSignificantBits = (~KnownZero).getActiveBits(); ++ if (NumSignificantBits == 0) ++ return DAG.getConstant(0, VT); ++ ++ // Skip known-zero high parts of the operand. ++ int64_t OrigBitSize = VT.getSizeInBits(); ++ int64_t BitSize = (int64_t)1 << Log2_32_Ceil(NumSignificantBits); ++ BitSize = std::min(BitSize, OrigBitSize); ++ ++ // The POPCNT instruction counts the number of bits in each byte. 
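
The scalar expansion that follows can be checked in isolation: model the
per-byte POPCNT, then fold the byte counts with the same shift-and-add tree,
shown here for a full 64-bit operand where BitSize == OrigBitSize == 64 and
no masking step is needed. A minimal model assuming GCC/Clang builtins:

#include <cassert>
#include <cstdint>

// Per-byte bit counts, as the z196 POPCNT instruction produces them.
uint64_t popcntPerByte(uint64_t X) {
  uint64_t R = 0;
  for (int B = 0; B < 8; ++B)
    R |= uint64_t(__builtin_popcountll((X >> (8 * B)) & 0xff)) << (8 * B);
  return R;
}

// The SHL/ADD tree from lowerCTPOP: partial sums accumulate in the high
// byte, which the final shift extracts. No byte can exceed 64, so the
// per-byte additions never carry.
uint64_t popcnt64(uint64_t X) {
  uint64_t Op = popcntPerByte(X);
  for (int I = 32; I >= 8; I /= 2)
    Op += Op << I;
  return Op >> 56;
}

int main() {
  const uint64_t Tests[] = { 0, 1, ~0ull, 0x8040201008040201ull };
  for (uint64_t X : Tests)
    assert(popcnt64(X) == uint64_t(__builtin_popcountll(X)));
  return 0;
}
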
++ Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op); ++ Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op); ++ Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op); ++ ++ // Add up per-byte counts in a binary tree. All bits of Op at ++ // position larger than BitSize remain zero throughout. ++ for (int64_t I = BitSize / 2; I >= 8; I = I / 2) { ++ SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, VT)); ++ if (BitSize != OrigBitSize) ++ Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp, ++ DAG.getConstant(((uint64_t)1 << BitSize) - 1, VT)); ++ Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp); ++ } ++ ++ // Extract overall result from high byte. ++ if (BitSize > 8) ++ Op = DAG.getNode(ISD::SRL, DL, VT, Op, DAG.getConstant(BitSize - 8, VT)); ++ ++ return Op; ++} ++ + // Op is an atomic load. Lower it into a normal volatile load. + SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op, + SelectionDAG &DAG) const { +@@ -2400,6 +3191,1069 @@ SDValue SystemZTargetLowering::lowerPREF + Node->getMemoryVT(), Node->getMemOperand()); + } + ++// Return an i32 that contains the value of CC immediately after After, ++// whose final operand must be MVT::Glue. ++static SDValue getCCResult(SelectionDAG &DAG, SDNode *After) { ++ SDValue Glue = SDValue(After, After->getNumValues() - 1); ++ SDValue IPM = DAG.getNode(SystemZISD::IPM, SDLoc(After), MVT::i32, Glue); ++ return DAG.getNode(ISD::SRL, SDLoc(After), MVT::i32, IPM, ++ DAG.getConstant(SystemZ::IPM_CC, MVT::i32)); ++} ++ ++SDValue ++SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, ++ SelectionDAG &DAG) const { ++ unsigned Opcode, CCValid; ++ if (isIntrinsicWithCCAndChain(Op, Opcode, CCValid)) { ++ assert(Op->getNumValues() == 2 && "Expected only CC result and chain"); ++ SDValue Glued = emitIntrinsicWithChainAndGlue(DAG, Op, Opcode); ++ SDValue CC = getCCResult(DAG, Glued.getNode()); ++ DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), CC); ++ return SDValue(); ++ } ++ ++ return SDValue(); ++} ++ ++SDValue ++SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, ++ SelectionDAG &DAG) const { ++ unsigned Opcode, CCValid; ++ if (isIntrinsicWithCC(Op, Opcode, CCValid)) { ++ SDValue Glued = emitIntrinsicWithGlue(DAG, Op, Opcode); ++ SDValue CC = getCCResult(DAG, Glued.getNode()); ++ if (Op->getNumValues() == 1) ++ return CC; ++ assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result"); ++ return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), ++ Glued, CC); ++ } ++ ++ unsigned Id = cast(Op.getOperand(0))->getZExtValue(); ++ switch (Id) { ++ case Intrinsic::s390_vpdi: ++ return DAG.getNode(SystemZISD::PERMUTE_DWORDS, SDLoc(Op), Op.getValueType(), ++ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); ++ ++ case Intrinsic::s390_vperm: ++ return DAG.getNode(SystemZISD::PERMUTE, SDLoc(Op), Op.getValueType(), ++ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); ++ ++ case Intrinsic::s390_vuphb: ++ case Intrinsic::s390_vuphh: ++ case Intrinsic::s390_vuphf: ++ return DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(Op), Op.getValueType(), ++ Op.getOperand(1)); ++ ++ case Intrinsic::s390_vuplhb: ++ case Intrinsic::s390_vuplhh: ++ case Intrinsic::s390_vuplhf: ++ return DAG.getNode(SystemZISD::UNPACKL_HIGH, SDLoc(Op), Op.getValueType(), ++ Op.getOperand(1)); ++ ++ case Intrinsic::s390_vuplb: ++ case Intrinsic::s390_vuplhw: ++ case Intrinsic::s390_vuplf: ++ return DAG.getNode(SystemZISD::UNPACK_LOW, SDLoc(Op), Op.getValueType(), ++ Op.getOperand(1)); ++ ++ case Intrinsic::s390_vupllb: ++ case Intrinsic::s390_vupllh: ++ 
case Intrinsic::s390_vupllf: ++ return DAG.getNode(SystemZISD::UNPACKL_LOW, SDLoc(Op), Op.getValueType(), ++ Op.getOperand(1)); ++ ++ case Intrinsic::s390_vsumb: ++ case Intrinsic::s390_vsumh: ++ case Intrinsic::s390_vsumgh: ++ case Intrinsic::s390_vsumgf: ++ case Intrinsic::s390_vsumqf: ++ case Intrinsic::s390_vsumqg: ++ return DAG.getNode(SystemZISD::VSUM, SDLoc(Op), Op.getValueType(), ++ Op.getOperand(1), Op.getOperand(2)); ++ } ++ ++ return SDValue(); ++} ++ ++namespace { ++// Says that SystemZISD operation Opcode can be used to perform the equivalent ++// of a VPERM with permute vector Bytes. If Opcode takes three operands, ++// Operand is the constant third operand, otherwise it is the number of ++// bytes in each element of the result. ++struct Permute { ++ unsigned Opcode; ++ unsigned Operand; ++ unsigned char Bytes[SystemZ::VectorBytes]; ++}; ++} ++ ++static const Permute PermuteForms[] = { ++ // VMRHG ++ { SystemZISD::MERGE_HIGH, 8, ++ { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 } }, ++ // VMRHF ++ { SystemZISD::MERGE_HIGH, 4, ++ { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 } }, ++ // VMRHH ++ { SystemZISD::MERGE_HIGH, 2, ++ { 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } }, ++ // VMRHB ++ { SystemZISD::MERGE_HIGH, 1, ++ { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 } }, ++ // VMRLG ++ { SystemZISD::MERGE_LOW, 8, ++ { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 } }, ++ // VMRLF ++ { SystemZISD::MERGE_LOW, 4, ++ { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } }, ++ // VMRLH ++ { SystemZISD::MERGE_LOW, 2, ++ { 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } }, ++ // VMRLB ++ { SystemZISD::MERGE_LOW, 1, ++ { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 } }, ++ // VPKG ++ { SystemZISD::PACK, 4, ++ { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 } }, ++ // VPKF ++ { SystemZISD::PACK, 2, ++ { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } }, ++ // VPKH ++ { SystemZISD::PACK, 1, ++ { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } }, ++ // VPDI V1, V2, 4 (low half of V1, high half of V2) ++ { SystemZISD::PERMUTE_DWORDS, 4, ++ { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 } }, ++ // VPDI V1, V2, 1 (high half of V1, low half of V2) ++ { SystemZISD::PERMUTE_DWORDS, 1, ++ { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 } } ++}; ++ ++// Called after matching a vector shuffle against a particular pattern. ++// Both the original shuffle and the pattern have two vector operands. ++// OpNos[0] is the operand of the original shuffle that should be used for ++// operand 0 of the pattern, or -1 if operand 0 of the pattern can be anything. ++// OpNos[1] is the same for operand 1 of the pattern. Resolve these -1s and ++// set OpNo0 and OpNo1 to the shuffle operands that should actually be used ++// for operands 0 and 1 of the pattern. ++static bool chooseShuffleOpNos(int *OpNos, unsigned &OpNo0, unsigned &OpNo1) { ++ if (OpNos[0] < 0) { ++ if (OpNos[1] < 0) ++ return false; ++ OpNo0 = OpNo1 = OpNos[1]; ++ } else if (OpNos[1] < 0) { ++ OpNo0 = OpNo1 = OpNos[0]; ++ } else { ++ OpNo0 = OpNos[0]; ++ OpNo1 = OpNos[1]; ++ } ++ return true; ++} ++ ++// Bytes is a VPERM-like permute vector, except that -1 is used for ++// undefined bytes. Return true if the VPERM can be implemented using P. ++// When returning true set OpNo0 to the VPERM operand that should be ++// used for operand 0 of P and likewise OpNo1 for operand 1 of P. 
++// ++// For example, if swapping the VPERM operands allows P to match, OpNo0 ++// will be 1 and OpNo1 will be 0. If instead Bytes only refers to one ++// operand, but rewriting it to use two duplicated operands allows it to ++// match P, then OpNo0 and OpNo1 will be the same. ++static bool matchPermute(const SmallVectorImpl &Bytes, const Permute &P, ++ unsigned &OpNo0, unsigned &OpNo1) { ++ int OpNos[] = { -1, -1 }; ++ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) { ++ int Elt = Bytes[I]; ++ if (Elt >= 0) { ++ // Make sure that the two permute vectors use the same suboperand ++ // byte number. Only the operand numbers (the high bits) are ++ // allowed to differ. ++ if ((Elt ^ P.Bytes[I]) & (SystemZ::VectorBytes - 1)) ++ return false; ++ int ModelOpNo = P.Bytes[I] / SystemZ::VectorBytes; ++ int RealOpNo = unsigned(Elt) / SystemZ::VectorBytes; ++ // Make sure that the operand mappings are consistent with previous ++ // elements. ++ if (OpNos[ModelOpNo] == 1 - RealOpNo) ++ return false; ++ OpNos[ModelOpNo] = RealOpNo; ++ } ++ } ++ return chooseShuffleOpNos(OpNos, OpNo0, OpNo1); ++} ++ ++// As above, but search for a matching permute. ++static const Permute *matchPermute(const SmallVectorImpl &Bytes, ++ unsigned &OpNo0, unsigned &OpNo1) { ++ for (auto &P : PermuteForms) ++ if (matchPermute(Bytes, P, OpNo0, OpNo1)) ++ return &P; ++ return nullptr; ++} ++ ++// Bytes is a VPERM-like permute vector, except that -1 is used for ++// undefined bytes. This permute is an operand of an outer permute. ++// See whether redistributing the -1 bytes gives a shuffle that can be ++// implemented using P. If so, set Transform to a VPERM-like permute vector ++// that, when applied to the result of P, gives the original permute in Bytes. ++static bool matchDoublePermute(const SmallVectorImpl &Bytes, ++ const Permute &P, ++ SmallVectorImpl &Transform) { ++ unsigned To = 0; ++ for (unsigned From = 0; From < SystemZ::VectorBytes; ++From) { ++ int Elt = Bytes[From]; ++ if (Elt < 0) ++ // Byte number From of the result is undefined. ++ Transform[From] = -1; ++ else { ++ while (P.Bytes[To] != Elt) { ++ To += 1; ++ if (To == SystemZ::VectorBytes) ++ return false; ++ } ++ Transform[From] = To; ++ } ++ } ++ return true; ++} ++ ++// As above, but search for a matching permute. ++static const Permute *matchDoublePermute(const SmallVectorImpl &Bytes, ++ SmallVectorImpl &Transform) { ++ for (auto &P : PermuteForms) ++ if (matchDoublePermute(Bytes, P, Transform)) ++ return &P; ++ return nullptr; ++} ++ ++// Convert the mask of the given VECTOR_SHUFFLE into a byte-level mask, ++// as if it had type vNi8. ++static void getVPermMask(ShuffleVectorSDNode *VSN, ++ SmallVectorImpl &Bytes) { ++ EVT VT = VSN->getValueType(0); ++ unsigned NumElements = VT.getVectorNumElements(); ++ unsigned BytesPerElement = VT.getVectorElementType().getStoreSize(); ++ Bytes.resize(NumElements * BytesPerElement, -1); ++ for (unsigned I = 0; I < NumElements; ++I) { ++ int Index = VSN->getMaskElt(I); ++ if (Index >= 0) ++ for (unsigned J = 0; J < BytesPerElement; ++J) ++ Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J; ++ } ++} ++ ++// Bytes is a VPERM-like permute vector, except that -1 is used for ++// undefined bytes. See whether bytes [Start, Start + BytesPerElement) of ++// the result come from a contiguous sequence of bytes from one input. ++// Set Base to the selector for the first byte if so. 
++static bool getShuffleInput(const SmallVectorImpl &Bytes, unsigned Start, ++ unsigned BytesPerElement, int &Base) { ++ Base = -1; ++ for (unsigned I = 0; I < BytesPerElement; ++I) { ++ if (Bytes[Start + I] >= 0) { ++ unsigned Elem = Bytes[Start + I]; ++ if (Base < 0) { ++ Base = Elem - I; ++ // Make sure the bytes would come from one input operand. ++ if (unsigned(Base) % Bytes.size() + BytesPerElement > Bytes.size()) ++ return false; ++ } else if (unsigned(Base) != Elem - I) ++ return false; ++ } ++ } ++ return true; ++} ++ ++// Bytes is a VPERM-like permute vector, except that -1 is used for ++// undefined bytes. Return true if it can be performed using VSLDI. ++// When returning true, set StartIndex to the shift amount and OpNo0 ++// and OpNo1 to the VPERM operands that should be used as the first ++// and second shift operand respectively. ++static bool isShlDoublePermute(const SmallVectorImpl &Bytes, ++ unsigned &StartIndex, unsigned &OpNo0, ++ unsigned &OpNo1) { ++ int OpNos[] = { -1, -1 }; ++ int Shift = -1; ++ for (unsigned I = 0; I < 16; ++I) { ++ int Index = Bytes[I]; ++ if (Index >= 0) { ++ int ExpectedShift = (Index - I) % SystemZ::VectorBytes; ++ int ModelOpNo = unsigned(ExpectedShift + I) / SystemZ::VectorBytes; ++ int RealOpNo = unsigned(Index) / SystemZ::VectorBytes; ++ if (Shift < 0) ++ Shift = ExpectedShift; ++ else if (Shift != ExpectedShift) ++ return false; ++ // Make sure that the operand mappings are consistent with previous ++ // elements. ++ if (OpNos[ModelOpNo] == 1 - RealOpNo) ++ return false; ++ OpNos[ModelOpNo] = RealOpNo; ++ } ++ } ++ StartIndex = Shift; ++ return chooseShuffleOpNos(OpNos, OpNo0, OpNo1); ++} ++ ++// Create a node that performs P on operands Op0 and Op1, casting the ++// operands to the appropriate type. The type of the result is determined by P. ++static SDValue getPermuteNode(SelectionDAG &DAG, SDLoc DL, ++ const Permute &P, SDValue Op0, SDValue Op1) { ++ // VPDI (PERMUTE_DWORDS) always operates on v2i64s. The input ++ // elements of a PACK are twice as wide as the outputs. ++ unsigned InBytes = (P.Opcode == SystemZISD::PERMUTE_DWORDS ? 8 : ++ P.Opcode == SystemZISD::PACK ? P.Operand * 2 : ++ P.Operand); ++ // Cast both operands to the appropriate type. ++ MVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBytes * 8), ++ SystemZ::VectorBytes / InBytes); ++ Op0 = DAG.getNode(ISD::BITCAST, DL, InVT, Op0); ++ Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1); ++ SDValue Op; ++ if (P.Opcode == SystemZISD::PERMUTE_DWORDS) { ++ SDValue Op2 = DAG.getConstant(P.Operand, MVT::i32); ++ Op = DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2); ++ } else if (P.Opcode == SystemZISD::PACK) { ++ MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8), ++ SystemZ::VectorBytes / P.Operand); ++ Op = DAG.getNode(SystemZISD::PACK, DL, OutVT, Op0, Op1); ++ } else { ++ Op = DAG.getNode(P.Opcode, DL, InVT, Op0, Op1); ++ } ++ return Op; ++} ++ ++// Bytes is a VPERM-like permute vector, except that -1 is used for ++// undefined bytes. Implement it on operands Ops[0] and Ops[1] using ++// VSLDI or VPERM. ++static SDValue getGeneralPermuteNode(SelectionDAG &DAG, SDLoc DL, SDValue *Ops, ++ const SmallVectorImpl &Bytes) { ++ for (unsigned I = 0; I < 2; ++I) ++ Ops[I] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[I]); ++ ++ // First see whether VSLDI can be used. 
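
For reference, the VSLDI pattern that isShlDoublePermute matches selects a
window of 16 consecutive bytes out of the 32-byte concatenation of the two
operands. A stand-alone model, assuming a shift amount of 0..15 (names are
illustrative, not part of the patch):

#include <array>
#include <cstdint>

// Model of VECTOR SHIFT LEFT DOUBLE BY BYTE: result byte I is byte
// Shift + I of the concatenation A || B.
std::array<uint8_t, 16> vsldi(const std::array<uint8_t, 16> &A,
                              const std::array<uint8_t, 16> &B,
                              unsigned Shift) {
  std::array<uint8_t, 16> R{};
  for (unsigned I = 0; I < 16; ++I)
    R[I] = (Shift + I < 16) ? A[Shift + I] : B[Shift + I - 16];
  return R;
}
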
++ unsigned StartIndex, OpNo0, OpNo1; ++ if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1)) ++ return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0], ++ Ops[OpNo1], DAG.getConstant(StartIndex, MVT::i32)); ++ ++ // Fall back on VPERM. Construct an SDNode for the permute vector. ++ SDValue IndexNodes[SystemZ::VectorBytes]; ++ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) ++ if (Bytes[I] >= 0) ++ IndexNodes[I] = DAG.getConstant(Bytes[I], MVT::i32); ++ else ++ IndexNodes[I] = DAG.getUNDEF(MVT::i32); ++ SDValue Op2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, IndexNodes); ++ return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2); ++} ++ ++namespace { ++// Describes a general N-operand vector shuffle. ++struct GeneralShuffle { ++ GeneralShuffle(EVT vt) : VT(vt) {} ++ void addUndef(); ++ void add(SDValue, unsigned); ++ SDValue getNode(SelectionDAG &, SDLoc); ++ ++ // The operands of the shuffle. ++ SmallVector Ops; ++ ++ // Index I is -1 if byte I of the result is undefined. Otherwise the ++ // result comes from byte Bytes[I] % SystemZ::VectorBytes of operand ++ // Bytes[I] / SystemZ::VectorBytes. ++ SmallVector Bytes; ++ ++ // The type of the shuffle result. ++ EVT VT; ++}; ++} ++ ++// Add an extra undefined element to the shuffle. ++void GeneralShuffle::addUndef() { ++ unsigned BytesPerElement = VT.getVectorElementType().getStoreSize(); ++ for (unsigned I = 0; I < BytesPerElement; ++I) ++ Bytes.push_back(-1); ++} ++ ++// Add an extra element to the shuffle, taking it from element Elem of Op. ++// A null Op indicates a vector input whose value will be calculated later; ++// there is at most one such input per shuffle and it always has the same ++// type as the result. ++void GeneralShuffle::add(SDValue Op, unsigned Elem) { ++ unsigned BytesPerElement = VT.getVectorElementType().getStoreSize(); ++ ++ // The source vector can have wider elements than the result, ++ // either through an explicit TRUNCATE or because of type legalization. ++ // We want the least significant part. ++ EVT FromVT = Op.getNode() ? Op.getValueType() : VT; ++ unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize(); ++ assert(FromBytesPerElement >= BytesPerElement && ++ "Invalid EXTRACT_VECTOR_ELT"); ++ unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes + ++ (FromBytesPerElement - BytesPerElement)); ++ ++ // Look through things like shuffles and bitcasts. ++ while (Op.getNode()) { ++ if (Op.getOpcode() == ISD::BITCAST) ++ Op = Op.getOperand(0); ++ else if (Op.getOpcode() == ISD::VECTOR_SHUFFLE && Op.hasOneUse()) { ++ // See whether the bytes we need come from a contiguous part of one ++ // operand. ++ SmallVector OpBytes; ++ getVPermMask(cast(Op), OpBytes); ++ int NewByte; ++ if (!getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte)) ++ break; ++ if (NewByte < 0) { ++ addUndef(); ++ return; ++ } ++ Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes); ++ Byte = unsigned(NewByte) % SystemZ::VectorBytes; ++ } else if (Op.getOpcode() == ISD::UNDEF) { ++ addUndef(); ++ return; ++ } else ++ break; ++ } ++ ++ // Make sure that the source of the extraction is in Ops. ++ unsigned OpNo = 0; ++ for (; OpNo < Ops.size(); ++OpNo) ++ if (Ops[OpNo] == Op) ++ break; ++ if (OpNo == Ops.size()) ++ Ops.push_back(Op); ++ ++ // Add the element to Bytes. ++ unsigned Base = OpNo * SystemZ::VectorBytes + Byte; ++ for (unsigned I = 0; I < BytesPerElement; ++I) ++ Bytes.push_back(Base + I); ++} ++ ++// Return SDNodes for the completed shuffle. 
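
The Bytes encoding used throughout GeneralShuffle can be modelled directly:
selector value S picks byte S % 16 of operand S / 16, and -1 leaves the
result byte undefined, which is also how the VPERM fallback interprets its
permute vector. A stand-alone sketch of applying such a selector (the helper
name is illustrative):

#include <cstdint>
#include <vector>

std::vector<uint8_t> applyBytes(const std::vector<std::vector<uint8_t>> &Ops,
                                const std::vector<int> &Bytes) {
  std::vector<uint8_t> Result(Bytes.size(), 0);
  for (size_t I = 0; I < Bytes.size(); ++I)
    if (Bytes[I] >= 0)
      Result[I] = Ops[Bytes[I] / 16][Bytes[I] % 16];
  return Result;
}
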
++SDValue GeneralShuffle::getNode(SelectionDAG &DAG, SDLoc DL) { ++ assert(Bytes.size() == SystemZ::VectorBytes && "Incomplete vector"); ++ ++ if (Ops.size() == 0) ++ return DAG.getUNDEF(VT); ++ ++ // Make sure that there are at least two shuffle operands. ++ if (Ops.size() == 1) ++ Ops.push_back(DAG.getUNDEF(MVT::v16i8)); ++ ++ // Create a tree of shuffles, deferring root node until after the loop. ++ // Try to redistribute the undefined elements of non-root nodes so that ++ // the non-root shuffles match something like a pack or merge, then adjust ++ // the parent node's permute vector to compensate for the new order. ++ // Among other things, this copes with vectors like <2 x i16> that were ++ // padded with undefined elements during type legalization. ++ // ++ // In the best case this redistribution will lead to the whole tree ++ // using packs and merges. It should rarely be a loss in other cases. ++ unsigned Stride = 1; ++ for (; Stride * 2 < Ops.size(); Stride *= 2) { ++ for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) { ++ SDValue SubOps[] = { Ops[I], Ops[I + Stride] }; ++ ++ // Create a mask for just these two operands. ++ SmallVector NewBytes(SystemZ::VectorBytes); ++ for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) { ++ unsigned OpNo = unsigned(Bytes[J]) / SystemZ::VectorBytes; ++ unsigned Byte = unsigned(Bytes[J]) % SystemZ::VectorBytes; ++ if (OpNo == I) ++ NewBytes[J] = Byte; ++ else if (OpNo == I + Stride) ++ NewBytes[J] = SystemZ::VectorBytes + Byte; ++ else ++ NewBytes[J] = -1; ++ } ++ // See if it would be better to reorganize NewMask to avoid using VPERM. ++ SmallVector NewBytesMap(SystemZ::VectorBytes); ++ if (const Permute *P = matchDoublePermute(NewBytes, NewBytesMap)) { ++ Ops[I] = getPermuteNode(DAG, DL, *P, SubOps[0], SubOps[1]); ++ // Applying NewBytesMap to Ops[I] gets back to NewBytes. ++ for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) { ++ if (NewBytes[J] >= 0) { ++ assert(unsigned(NewBytesMap[J]) < SystemZ::VectorBytes && ++ "Invalid double permute"); ++ Bytes[J] = I * SystemZ::VectorBytes + NewBytesMap[J]; ++ } else ++ assert(NewBytesMap[J] < 0 && "Invalid double permute"); ++ } ++ } else { ++ // Just use NewBytes on the operands. ++ Ops[I] = getGeneralPermuteNode(DAG, DL, SubOps, NewBytes); ++ for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) ++ if (NewBytes[J] >= 0) ++ Bytes[J] = I * SystemZ::VectorBytes + J; ++ } ++ } ++ } ++ ++ // Now we just have 2 inputs. Put the second operand in Ops[1]. ++ if (Stride > 1) { ++ Ops[1] = Ops[Stride]; ++ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) ++ if (Bytes[I] >= int(SystemZ::VectorBytes)) ++ Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes; ++ } ++ ++ // Look for an instruction that can do the permute without resorting ++ // to VPERM. ++ unsigned OpNo0, OpNo1; ++ SDValue Op; ++ if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1)) ++ Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]); ++ else ++ Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes); ++ return DAG.getNode(ISD::BITCAST, DL, VT, Op); ++} ++ ++// Return true if the given BUILD_VECTOR is a scalar-to-vector conversion. ++static bool isScalarToVector(SDValue Op) { ++ for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I) ++ if (Op.getOperand(I).getOpcode() != ISD::UNDEF) ++ return false; ++ return true; ++} ++ ++// Return a vector of type VT that contains Value in the first element. ++// The other elements don't matter. 
++static SDValue buildScalarToVector(SelectionDAG &DAG, SDLoc DL, EVT VT, ++ SDValue Value) { ++ // If we have a constant, replicate it to all elements and let the ++ // BUILD_VECTOR lowering take care of it. ++ if (Value.getOpcode() == ISD::Constant || ++ Value.getOpcode() == ISD::ConstantFP) { ++ SmallVector Ops(VT.getVectorNumElements(), Value); ++ return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Ops); ++ } ++ if (Value.getOpcode() == ISD::UNDEF) ++ return DAG.getUNDEF(VT); ++ return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value); ++} ++ ++// Return a vector of type VT in which Op0 is in element 0 and Op1 is in ++// element 1. Used for cases in which replication is cheap. ++static SDValue buildMergeScalars(SelectionDAG &DAG, SDLoc DL, EVT VT, ++ SDValue Op0, SDValue Op1) { ++ if (Op0.getOpcode() == ISD::UNDEF) { ++ if (Op1.getOpcode() == ISD::UNDEF) ++ return DAG.getUNDEF(VT); ++ return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op1); ++ } ++ if (Op1.getOpcode() == ISD::UNDEF) ++ return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0); ++ return DAG.getNode(SystemZISD::MERGE_HIGH, DL, VT, ++ buildScalarToVector(DAG, DL, VT, Op0), ++ buildScalarToVector(DAG, DL, VT, Op1)); ++} ++ ++// Extend GPR scalars Op0 and Op1 to doublewords and return a v2i64 ++// vector for them. ++static SDValue joinDwords(SelectionDAG &DAG, SDLoc DL, SDValue Op0, ++ SDValue Op1) { ++ if (Op0.getOpcode() == ISD::UNDEF && Op1.getOpcode() == ISD::UNDEF) ++ return DAG.getUNDEF(MVT::v2i64); ++ // If one of the two inputs is undefined then replicate the other one, ++ // in order to avoid using another register unnecessarily. ++ if (Op0.getOpcode() == ISD::UNDEF) ++ Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1); ++ else if (Op1.getOpcode() == ISD::UNDEF) ++ Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0); ++ else { ++ Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0); ++ Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1); ++ } ++ return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Op0, Op1); ++} ++ ++// Try to represent constant BUILD_VECTOR node BVN using a ++// SystemZISD::BYTE_MASK-style mask. Store the mask value in Mask ++// on success. ++static bool tryBuildVectorByteMask(BuildVectorSDNode *BVN, uint64_t &Mask) { ++ EVT ElemVT = BVN->getValueType(0).getVectorElementType(); ++ unsigned BytesPerElement = ElemVT.getStoreSize(); ++ for (unsigned I = 0, E = BVN->getNumOperands(); I != E; ++I) { ++ SDValue Op = BVN->getOperand(I); ++ if (Op.getOpcode() != ISD::UNDEF) { ++ uint64_t Value; ++ if (Op.getOpcode() == ISD::Constant) ++ Value = dyn_cast(Op)->getZExtValue(); ++ else if (Op.getOpcode() == ISD::ConstantFP) ++ Value = (dyn_cast(Op)->getValueAPF().bitcastToAPInt() ++ .getZExtValue()); ++ else ++ return false; ++ for (unsigned J = 0; J < BytesPerElement; ++J) { ++ uint64_t Byte = (Value >> (J * 8)) & 0xff; ++ if (Byte == 0xff) ++ Mask |= 1 << ((E - I - 1) * BytesPerElement + J); ++ else if (Byte != 0) ++ return false; ++ } ++ } ++ } ++ return true; ++} ++ ++// Try to load a vector constant in which BitsPerElement-bit value Value ++// is replicated to fill the vector. VT is the type of the resulting ++// constant, which may have elements of a different size from BitsPerElement. ++// Return the SDValue of the constant on success, otherwise return ++// an empty value. 
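
The byte-mask representation that tryBuildVectorByteMask builds above can be
expanded back into a vector constant as follows, assuming the ISA's
left-to-right bit numbering for the VGBM immediate (so byte 0 of the vector
is controlled by bit 15 of the 16-bit value). A stand-alone sketch:

#include <array>
#include <cstdint>

// Expand a BYTE_MASK (VGBM) immediate: each mask bit selects an all-ones
// or all-zeros byte of the 16-byte result.
std::array<uint8_t, 16> expandByteMask(uint16_t Mask) {
  std::array<uint8_t, 16> R{};
  for (unsigned I = 0; I < 16; ++I)
    R[I] = (Mask & (1u << (15 - I))) ? 0xff : 0x00;
  return R;
}
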
++static SDValue tryBuildVectorReplicate(SelectionDAG &DAG, ++ const SystemZInstrInfo *TII, ++ SDLoc DL, EVT VT, uint64_t Value, ++ unsigned BitsPerElement) { ++ // Signed 16-bit values can be replicated using VREPI. ++ int64_t SignedValue = SignExtend64(Value, BitsPerElement); ++ if (isInt<16>(SignedValue)) { ++ MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), ++ SystemZ::VectorBits / BitsPerElement); ++ SDValue Op = DAG.getNode(SystemZISD::REPLICATE, DL, VecVT, ++ DAG.getConstant(SignedValue, MVT::i32)); ++ return DAG.getNode(ISD::BITCAST, DL, VT, Op); ++ } ++ // See whether rotating the constant left some N places gives a value that ++ // is one less than a power of 2 (i.e. all zeros followed by all ones). ++ // If so we can use VGM. ++ unsigned Start, End; ++ if (TII->isRxSBGMask(Value, BitsPerElement, Start, End)) { ++ // isRxSBGMask returns the bit numbers for a full 64-bit value, ++ // with 0 denoting 1 << 63 and 63 denoting 1. Convert them to ++ // bit numbers for an BitsPerElement value, so that 0 denotes ++ // 1 << (BitsPerElement-1). ++ Start -= 64 - BitsPerElement; ++ End -= 64 - BitsPerElement; ++ MVT VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), ++ SystemZ::VectorBits / BitsPerElement); ++ SDValue Op = DAG.getNode(SystemZISD::ROTATE_MASK, DL, VecVT, ++ DAG.getConstant(Start, MVT::i32), ++ DAG.getConstant(End, MVT::i32)); ++ return DAG.getNode(ISD::BITCAST, DL, VT, Op); ++ } ++ return SDValue(); ++} ++ ++// If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually ++// better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for ++// the non-EXTRACT_VECTOR_ELT elements. See if the given BUILD_VECTOR ++// would benefit from this representation and return it if so. ++static SDValue tryBuildVectorShuffle(SelectionDAG &DAG, ++ BuildVectorSDNode *BVN) { ++ EVT VT = BVN->getValueType(0); ++ unsigned NumElements = VT.getVectorNumElements(); ++ ++ // Represent the BUILD_VECTOR as an N-operand VECTOR_SHUFFLE-like operation ++ // on byte vectors. If there are non-EXTRACT_VECTOR_ELT elements that still ++ // need a BUILD_VECTOR, add an additional placeholder operand for that ++ // BUILD_VECTOR and store its operands in ResidueOps. ++ GeneralShuffle GS(VT); ++ SmallVector ResidueOps; ++ bool FoundOne = false; ++ for (unsigned I = 0; I < NumElements; ++I) { ++ SDValue Op = BVN->getOperand(I); ++ if (Op.getOpcode() == ISD::TRUNCATE) ++ Op = Op.getOperand(0); ++ if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && ++ Op.getOperand(1).getOpcode() == ISD::Constant) { ++ unsigned Elem = cast(Op.getOperand(1))->getZExtValue(); ++ GS.add(Op.getOperand(0), Elem); ++ FoundOne = true; ++ } else if (Op.getOpcode() == ISD::UNDEF) { ++ GS.addUndef(); ++ } else { ++ GS.add(SDValue(), ResidueOps.size()); ++ ResidueOps.push_back(Op); ++ } ++ } ++ ++ // Nothing to do if there are no EXTRACT_VECTOR_ELTs. ++ if (!FoundOne) ++ return SDValue(); ++ ++ // Create the BUILD_VECTOR for the remaining elements, if any. ++ if (!ResidueOps.empty()) { ++ while (ResidueOps.size() < NumElements) ++ ResidueOps.push_back(DAG.getUNDEF(VT.getVectorElementType())); ++ for (auto &Op : GS.Ops) { ++ if (!Op.getNode()) { ++ Op = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BVN), VT, ResidueOps); ++ break; ++ } ++ } ++ } ++ return GS.getNode(DAG, SDLoc(BVN)); ++} ++ ++// Combine GPR scalar values Elems into a vector of type VT. ++static SDValue buildVector(SelectionDAG &DAG, SDLoc DL, EVT VT, ++ SmallVectorImpl &Elems) { ++ // See whether there is a single replicated value. 
++ SDValue Single; ++ unsigned int NumElements = Elems.size(); ++ unsigned int Count = 0; ++ for (auto Elem : Elems) { ++ if (Elem.getOpcode() != ISD::UNDEF) { ++ if (!Single.getNode()) ++ Single = Elem; ++ else if (Elem != Single) { ++ Single = SDValue(); ++ break; ++ } ++ Count += 1; ++ } ++ } ++ // There are three cases here: ++ // ++ // - if the only defined element is a loaded one, the best sequence ++ // is a replicating load. ++ // ++ // - otherwise, if the only defined element is an i64 value, we will ++ // end up with the same VLVGP sequence regardless of whether we short-cut ++ // for replication or fall through to the later code. ++ // ++ // - otherwise, if the only defined element is an i32 or smaller value, ++ // we would need 2 instructions to replicate it: VLVGP followed by VREPx. ++ // This is only a win if the single defined element is used more than once. ++ // In other cases we're better off using a single VLVGx. ++ if (Single.getNode() && (Count > 1 || Single.getOpcode() == ISD::LOAD)) ++ return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single); ++ ++ // The best way of building a v2i64 from two i64s is to use VLVGP. ++ if (VT == MVT::v2i64) ++ return joinDwords(DAG, DL, Elems[0], Elems[1]); ++ ++ // Use a 64-bit merge high to combine two doubles. ++ if (VT == MVT::v2f64) ++ return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]); ++ ++ // Build v4f32 values directly from the FPRs: ++ // ++ // ++ // V V VMRHF ++ // ++ // V VMRHG ++ // ++ if (VT == MVT::v4f32) { ++ SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]); ++ SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]); ++ // Avoid unnecessary undefs by reusing the other operand. ++ if (Op01.getOpcode() == ISD::UNDEF) ++ Op01 = Op23; ++ else if (Op23.getOpcode() == ISD::UNDEF) ++ Op23 = Op01; ++ // Merging identical replications is a no-op. ++ if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23) ++ return Op01; ++ Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01); ++ Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23); ++ SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH, ++ DL, MVT::v2i64, Op01, Op23); ++ return DAG.getNode(ISD::BITCAST, DL, VT, Op); ++ } ++ ++ // Collect the constant terms. ++ SmallVector Constants(NumElements, SDValue()); ++ SmallVector Done(NumElements, false); ++ ++ unsigned NumConstants = 0; ++ for (unsigned I = 0; I < NumElements; ++I) { ++ SDValue Elem = Elems[I]; ++ if (Elem.getOpcode() == ISD::Constant || ++ Elem.getOpcode() == ISD::ConstantFP) { ++ NumConstants += 1; ++ Constants[I] = Elem; ++ Done[I] = true; ++ } ++ } ++ // If there was at least one constant, fill in the other elements of ++ // Constants with undefs to get a full vector constant and use that ++ // as the starting point. ++ SDValue Result; ++ if (NumConstants > 0) { ++ for (unsigned I = 0; I < NumElements; ++I) ++ if (!Constants[I].getNode()) ++ Constants[I] = DAG.getUNDEF(Elems[I].getValueType()); ++ Result = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Constants); ++ } else { ++ // Otherwise try to use VLVGP to start the sequence in order to ++ // avoid a false dependency on any previous contents of the vector ++ // register. This only makes sense if one of the associated elements ++ // is defined. ++ unsigned I1 = NumElements / 2 - 1; ++ unsigned I2 = NumElements - 1; ++ bool Def1 = (Elems[I1].getOpcode() != ISD::UNDEF); ++ bool Def2 = (Elems[I2].getOpcode() != ISD::UNDEF); ++ if (Def1 || Def2) { ++ SDValue Elem1 = Elems[Def1 ? I1 : I2]; ++ SDValue Elem2 = Elems[Def2 ? 
I2 : I1]; ++ Result = DAG.getNode(ISD::BITCAST, DL, VT, ++ joinDwords(DAG, DL, Elem1, Elem2)); ++ Done[I1] = true; ++ Done[I2] = true; ++ } else ++ Result = DAG.getUNDEF(VT); ++ } ++ ++ // Use VLVGx to insert the other elements. ++ for (unsigned I = 0; I < NumElements; ++I) ++ if (!Done[I] && Elems[I].getOpcode() != ISD::UNDEF) ++ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I], ++ DAG.getConstant(I, MVT::i32)); ++ return Result; ++} ++ ++SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op, ++ SelectionDAG &DAG) const { ++ const SystemZInstrInfo *TII = ++ static_cast(Subtarget.getInstrInfo()); ++ auto *BVN = cast(Op.getNode()); ++ SDLoc DL(Op); ++ EVT VT = Op.getValueType(); ++ ++ if (BVN->isConstant()) { ++ // Try using VECTOR GENERATE BYTE MASK. This is the architecturally- ++ // preferred way of creating all-zero and all-one vectors so give it ++ // priority over other methods below. ++ uint64_t Mask = 0; ++ if (tryBuildVectorByteMask(BVN, Mask)) { ++ SDValue Op = DAG.getNode(SystemZISD::BYTE_MASK, DL, MVT::v16i8, ++ DAG.getConstant(Mask, MVT::i32)); ++ return DAG.getNode(ISD::BITCAST, DL, VT, Op); ++ } ++ ++ // Try using some form of replication. ++ APInt SplatBits, SplatUndef; ++ unsigned SplatBitSize; ++ bool HasAnyUndefs; ++ if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, ++ 8, true) && ++ SplatBitSize <= 64) { ++ // First try assuming that any undefined bits above the highest set bit ++ // and below the lowest set bit are 1s. This increases the likelihood of ++ // being able to use a sign-extended element value in VECTOR REPLICATE ++ // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK. ++ uint64_t SplatBitsZ = SplatBits.getZExtValue(); ++ uint64_t SplatUndefZ = SplatUndef.getZExtValue(); ++ uint64_t Lower = (SplatUndefZ ++ & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1)); ++ uint64_t Upper = (SplatUndefZ ++ & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1)); ++ uint64_t Value = SplatBitsZ | Upper | Lower; ++ SDValue Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, ++ SplatBitSize); ++ if (Op.getNode()) ++ return Op; ++ ++ // Now try assuming that any undefined bits between the first and ++ // last defined set bits are set. This increases the chances of ++ // using a non-wraparound mask. ++ uint64_t Middle = SplatUndefZ & ~Upper & ~Lower; ++ Value = SplatBitsZ | Middle; ++ Op = tryBuildVectorReplicate(DAG, TII, DL, VT, Value, SplatBitSize); ++ if (Op.getNode()) ++ return Op; ++ } ++ ++ // Fall back to loading it from memory. ++ return SDValue(); ++ } ++ ++ // See if we should use shuffles to construct the vector from other vectors. ++ SDValue Res = tryBuildVectorShuffle(DAG, BVN); ++ if (Res.getNode()) ++ return Res; ++ ++ // Detect SCALAR_TO_VECTOR conversions. ++ if (isOperationLegal(ISD::SCALAR_TO_VECTOR, VT) && isScalarToVector(Op)) ++ return buildScalarToVector(DAG, DL, VT, Op.getOperand(0)); ++ ++ // Otherwise use buildVector to build the vector up from GPRs. 
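
The Lower/Upper trick in lowerBUILD_VECTOR above is self-contained enough to
test on its own: undefined splat bits below the lowest defined set bit and
above the highest are assumed to be 1s, which makes a sign-extendable VREPI
immediate or a wraparound VGM mask more likely. A sketch using GCC/Clang
bit-scan builtins in place of LLVM's findFirstSet/findLastSet:

#include <cstdint>

uint64_t widenSplat(uint64_t Bits, uint64_t Undef) {
  if (Bits == 0)
    return 0; // defensive; the all-zero case is handled by the byte mask
  unsigned First = __builtin_ctzll(Bits);
  unsigned Last = 63 - __builtin_clzll(Bits);
  uint64_t Lower = Undef & ((uint64_t(1) << First) - 1);
  uint64_t Upper = Undef & ~((uint64_t(1) << Last) - 1);
  return Bits | Upper | Lower;
}
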
++SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
++                                                   SelectionDAG &DAG) const {
++  auto *VSN = cast<ShuffleVectorSDNode>(Op.getNode());
++  SDLoc DL(Op);
++  EVT VT = Op.getValueType();
++  unsigned NumElements = VT.getVectorNumElements();
++
++  if (VSN->isSplat()) {
++    SDValue Op0 = Op.getOperand(0);
++    unsigned Index = VSN->getSplatIndex();
++    assert(Index < VT.getVectorNumElements() &&
++           "Splat index should be defined and in first operand");
++    // See whether the value we're splatting is directly available as a scalar.
++    if ((Index == 0 && Op0.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
++        Op0.getOpcode() == ISD::BUILD_VECTOR)
++      return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0.getOperand(Index));
++    // Otherwise keep it as a vector-to-vector operation.
++    return DAG.getNode(SystemZISD::SPLAT, DL, VT, Op.getOperand(0),
++                       DAG.getConstant(Index, MVT::i32));
++  }
++
++  GeneralShuffle GS(VT);
++  for (unsigned I = 0; I < NumElements; ++I) {
++    int Elt = VSN->getMaskElt(I);
++    if (Elt < 0)
++      GS.addUndef();
++    else
++      GS.add(Op.getOperand(unsigned(Elt) / NumElements),
++             unsigned(Elt) % NumElements);
++  }
++  return GS.getNode(DAG, SDLoc(VSN));
++}
++
++SDValue SystemZTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
++                                                     SelectionDAG &DAG) const {
++  SDLoc DL(Op);
++  // Just insert the scalar into element 0 of an undefined vector.
++  return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
++                     Op.getValueType(), DAG.getUNDEF(Op.getValueType()),
++                     Op.getOperand(0), DAG.getConstant(0, MVT::i32));
++}
++
++SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
++                                                      SelectionDAG &DAG) const {
++  // Handle insertions of floating-point values.
++  SDLoc DL(Op);
++  SDValue Op0 = Op.getOperand(0);
++  SDValue Op1 = Op.getOperand(1);
++  SDValue Op2 = Op.getOperand(2);
++  EVT VT = Op.getValueType();
++
++  // Insertions into constant indices of a v2f64 can be done using VPDI.
++  // However, if the inserted value is a bitcast or a constant then it's
++  // better to use GPRs, as below.
++  if (VT == MVT::v2f64 &&
++      Op1.getOpcode() != ISD::BITCAST &&
++      Op1.getOpcode() != ISD::ConstantFP &&
++      Op2.getOpcode() == ISD::Constant) {
++    uint64_t Index = dyn_cast<ConstantSDNode>(Op2)->getZExtValue();
++    unsigned Mask = VT.getVectorNumElements() - 1;
++    if (Index <= Mask)
++      return Op;
++  }
++
++  // Otherwise bitcast to the equivalent integer form and insert via a GPR.
++  MVT IntVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
++  MVT IntVecVT = MVT::getVectorVT(IntVT, VT.getVectorNumElements());
++  SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntVecVT,
++                            DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0),
++                            DAG.getNode(ISD::BITCAST, DL, IntVT, Op1), Op2);
++  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
++}
++
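The GeneralShuffle loop in lowerVECTOR_SHUFFLE above decodes each mask element into an (operand, element) pair, since VECTOR_SHUFFLE mask values index the concatenation of both source vectors. A minimal sketch of that split (standalone; not part of the patch):

    #include <cassert>
    #include <utility>

    // Mask values 0..N-1 name elements of the first source vector,
    // N..2N-1 elements of the second.
    std::pair<unsigned, unsigned> splitMaskIndex(unsigned Elt,
                                                 unsigned NumElements) {
      return {Elt / NumElements,   // which source operand
              Elt % NumElements};  // element within that operand
    }

    int main() {
      // For v4i32 (NumElements == 4), mask value 6 is element 2 of operand 1.
      assert(splitMaskIndex(6, 4) == std::make_pair(1u, 2u));
      return 0;
    }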
++SDValue
++SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
++                                               SelectionDAG &DAG) const {
++  // Handle extractions of floating-point values.
++  SDLoc DL(Op);
++  SDValue Op0 = Op.getOperand(0);
++  SDValue Op1 = Op.getOperand(1);
++  EVT VT = Op.getValueType();
++  EVT VecVT = Op0.getValueType();
++
++  // Extractions of constant indices can be done directly.
++  if (auto *CIndexN = dyn_cast<ConstantSDNode>(Op1)) {
++    uint64_t Index = CIndexN->getZExtValue();
++    unsigned Mask = VecVT.getVectorNumElements() - 1;
++    if (Index <= Mask)
++      return Op;
++  }
++
++  // Otherwise bitcast to the equivalent integer form and extract via a GPR.
++  MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
++  MVT IntVecVT = MVT::getVectorVT(IntVT, VecVT.getVectorNumElements());
++  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntVT,
++                            DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0), Op1);
++  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
++}
++
++SDValue
++SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
++                                              unsigned UnpackHigh) const {
++  SDValue PackedOp = Op.getOperand(0);
++  EVT OutVT = Op.getValueType();
++  EVT InVT = PackedOp.getValueType();
++  unsigned ToBits = OutVT.getVectorElementType().getSizeInBits();
++  unsigned FromBits = InVT.getVectorElementType().getSizeInBits();
++  do {
++    FromBits *= 2;
++    EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
++                                 SystemZ::VectorBits / FromBits);
++    PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
++  } while (FromBits != ToBits);
++  return PackedOp;
++}
++
++SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
++                                          unsigned ByScalar) const {
++  // Look for cases where a vector shift can use the *_BY_SCALAR form.
++  SDValue Op0 = Op.getOperand(0);
++  SDValue Op1 = Op.getOperand(1);
++  SDLoc DL(Op);
++  EVT VT = Op.getValueType();
++  unsigned ElemBitSize = VT.getVectorElementType().getSizeInBits();
++
++  // See whether the shift vector is a splat represented as BUILD_VECTOR.
++  if (auto *BVN = dyn_cast<BuildVectorSDNode>(Op1)) {
++    APInt SplatBits, SplatUndef;
++    unsigned SplatBitSize;
++    bool HasAnyUndefs;
++    // Check for constant splats.  Use ElemBitSize as the minimum element
++    // width and reject splats that need wider elements.
++    if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
++                             ElemBitSize, true) &&
++        SplatBitSize == ElemBitSize) {
++      SDValue Shift = DAG.getConstant(SplatBits.getZExtValue() & 0xfff,
++                                      MVT::i32);
++      return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
++    }
++    // Check for variable splats.
++    BitVector UndefElements;
++    SDValue Splat = BVN->getSplatValue(&UndefElements);
++    if (Splat) {
++      // Since i32 is the smallest legal type, we either need a no-op
++      // or a truncation.
++      SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Splat);
++      return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
++    }
++  }
++
++  // See whether the shift vector is a splat represented as SHUFFLE_VECTOR,
++  // and the shift amount is directly available in a GPR.
++  if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(Op1)) {
++    if (VSN->isSplat()) {
++      SDValue VSNOp0 = VSN->getOperand(0);
++      unsigned Index = VSN->getSplatIndex();
++      assert(Index < VT.getVectorNumElements() &&
++             "Splat index should be defined and in first operand");
++      if ((Index == 0 && VSNOp0.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
++          VSNOp0.getOpcode() == ISD::BUILD_VECTOR) {
++        // Since i32 is the smallest legal type, we either need a no-op
++        // or a truncation.
++        SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
++                                    VSNOp0.getOperand(Index));
++        return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
++      }
++    }
++  }
++
++  // Otherwise just treat the current form as legal.
++ return Op; ++} ++ + SDValue SystemZTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { +@@ -2437,6 +4291,14 @@ SDValue SystemZTargetLowering::LowerOper + return lowerUDIVREM(Op, DAG); + case ISD::OR: + return lowerOR(Op, DAG); ++ case ISD::CTPOP: ++ return lowerCTPOP(Op, DAG); ++ case ISD::CTLZ_ZERO_UNDEF: ++ return DAG.getNode(ISD::CTLZ, SDLoc(Op), ++ Op.getValueType(), Op.getOperand(0)); ++ case ISD::CTTZ_ZERO_UNDEF: ++ return DAG.getNode(ISD::CTTZ, SDLoc(Op), ++ Op.getValueType(), Op.getOperand(0)); + case ISD::ATOMIC_SWAP: + return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW); + case ISD::ATOMIC_STORE: +@@ -2471,6 +4333,30 @@ SDValue SystemZTargetLowering::LowerOper + return lowerSTACKRESTORE(Op, DAG); + case ISD::PREFETCH: + return lowerPREFETCH(Op, DAG); ++ case ISD::INTRINSIC_W_CHAIN: ++ return lowerINTRINSIC_W_CHAIN(Op, DAG); ++ case ISD::INTRINSIC_WO_CHAIN: ++ return lowerINTRINSIC_WO_CHAIN(Op, DAG); ++ case ISD::BUILD_VECTOR: ++ return lowerBUILD_VECTOR(Op, DAG); ++ case ISD::VECTOR_SHUFFLE: ++ return lowerVECTOR_SHUFFLE(Op, DAG); ++ case ISD::SCALAR_TO_VECTOR: ++ return lowerSCALAR_TO_VECTOR(Op, DAG); ++ case ISD::INSERT_VECTOR_ELT: ++ return lowerINSERT_VECTOR_ELT(Op, DAG); ++ case ISD::EXTRACT_VECTOR_ELT: ++ return lowerEXTRACT_VECTOR_ELT(Op, DAG); ++ case ISD::SIGN_EXTEND_VECTOR_INREG: ++ return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH); ++ case ISD::ZERO_EXTEND_VECTOR_INREG: ++ return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH); ++ case ISD::SHL: ++ return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR); ++ case ISD::SRL: ++ return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR); ++ case ISD::SRA: ++ return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR); + default: + llvm_unreachable("Unexpected node to lower"); + } +@@ -2482,6 +4368,8 @@ const char *SystemZTargetLowering::getTa + OPCODE(RET_FLAG); + OPCODE(CALL); + OPCODE(SIBCALL); ++ OPCODE(TLS_GDCALL); ++ OPCODE(TLS_LDCALL); + OPCODE(PCREL_WRAPPER); + OPCODE(PCREL_OFFSET); + OPCODE(IABS); +@@ -2492,7 +4380,9 @@ const char *SystemZTargetLowering::getTa + OPCODE(SELECT_CCMASK); + OPCODE(ADJDYNALLOC); + OPCODE(EXTRACT_ACCESS); ++ OPCODE(POPCNT); + OPCODE(UMUL_LOHI64); ++ OPCODE(SDIVREM32); + OPCODE(SDIVREM64); + OPCODE(UDIVREM32); + OPCODE(UDIVREM64); +@@ -2506,11 +4396,60 @@ const char *SystemZTargetLowering::getTa + OPCODE(XC_LOOP); + OPCODE(CLC); + OPCODE(CLC_LOOP); +- OPCODE(STRCMP); + OPCODE(STPCPY); ++ OPCODE(STRCMP); + OPCODE(SEARCH_STRING); + OPCODE(IPM); + OPCODE(SERIALIZE); ++ OPCODE(TBEGIN); ++ OPCODE(TBEGIN_NOFLOAT); ++ OPCODE(TEND); ++ OPCODE(BYTE_MASK); ++ OPCODE(ROTATE_MASK); ++ OPCODE(REPLICATE); ++ OPCODE(JOIN_DWORDS); ++ OPCODE(SPLAT); ++ OPCODE(MERGE_HIGH); ++ OPCODE(MERGE_LOW); ++ OPCODE(SHL_DOUBLE); ++ OPCODE(PERMUTE_DWORDS); ++ OPCODE(PERMUTE); ++ OPCODE(PACK); ++ OPCODE(PACKS_CC); ++ OPCODE(PACKLS_CC); ++ OPCODE(UNPACK_HIGH); ++ OPCODE(UNPACKL_HIGH); ++ OPCODE(UNPACK_LOW); ++ OPCODE(UNPACKL_LOW); ++ OPCODE(VSHL_BY_SCALAR); ++ OPCODE(VSRL_BY_SCALAR); ++ OPCODE(VSRA_BY_SCALAR); ++ OPCODE(VSUM); ++ OPCODE(VICMPE); ++ OPCODE(VICMPH); ++ OPCODE(VICMPHL); ++ OPCODE(VICMPES); ++ OPCODE(VICMPHS); ++ OPCODE(VICMPHLS); ++ OPCODE(VFCMPE); ++ OPCODE(VFCMPH); ++ OPCODE(VFCMPHE); ++ OPCODE(VFCMPES); ++ OPCODE(VFCMPHS); ++ OPCODE(VFCMPHES); ++ OPCODE(VFTCI); ++ OPCODE(VEXTEND); ++ OPCODE(VROUND); ++ OPCODE(VTM); ++ OPCODE(VFAE_CC); ++ OPCODE(VFAEZ_CC); ++ OPCODE(VFEE_CC); ++ OPCODE(VFEEZ_CC); ++ OPCODE(VFENE_CC); ++ 
OPCODE(VFENEZ_CC);
++    OPCODE(VISTR_CC);
++    OPCODE(VSTRC_CC);
++    OPCODE(VSTRCZ_CC);
+     OPCODE(ATOMIC_SWAPW);
+     OPCODE(ATOMIC_LOADW_ADD);
+     OPCODE(ATOMIC_LOADW_SUB);
+@@ -2529,6 +4468,157 @@ const char *SystemZTargetLowering::getTa
+ #undef OPCODE
+ }
+ 
++// Return true if VT is a vector whose elements are a whole number of bytes
++// in width.
++static bool canTreatAsByteVector(EVT VT) {
++  return VT.isVector() && VT.getVectorElementType().getSizeInBits() % 8 == 0;
++}
++
++// Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT
++// producing a result of type ResVT.  Op is a possibly bitcast version
++// of the input vector and Index is the index (based on type VecVT) that
++// should be extracted.  Return the new extraction if a simplification
++// was possible or if Force is true.
++SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT,
++                                              SDValue Op, unsigned Index,
++                                              DAGCombinerInfo &DCI,
++                                              bool Force) const {
++  SelectionDAG &DAG = DCI.DAG;
++
++  // The number of bytes being extracted.
++  unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
++
++  for (;;) {
++    unsigned Opcode = Op.getOpcode();
++    if (Opcode == ISD::BITCAST)
++      // Look through bitcasts.
++      Op = Op.getOperand(0);
++    else if (Opcode == ISD::VECTOR_SHUFFLE &&
++             canTreatAsByteVector(Op.getValueType())) {
++      // Get a VPERM-like permute mask and see whether the bytes covered
++      // by the extracted element are a contiguous sequence from one
++      // source operand.
++      SmallVector<int, SystemZ::VectorBytes> Bytes;
++      getVPermMask(cast<ShuffleVectorSDNode>(Op), Bytes);
++      int First;
++      if (!getShuffleInput(Bytes, Index * BytesPerElement,
++                           BytesPerElement, First))
++        break;
++      if (First < 0)
++        return DAG.getUNDEF(ResVT);
++      // Make sure the contiguous sequence starts at a multiple of the
++      // original element size.
++      unsigned Byte = unsigned(First) % Bytes.size();
++      if (Byte % BytesPerElement != 0)
++        break;
++      // We can get the extracted value directly from an input.
++      Index = Byte / BytesPerElement;
++      Op = Op.getOperand(unsigned(First) / Bytes.size());
++      Force = true;
++    } else if (Opcode == ISD::BUILD_VECTOR &&
++               canTreatAsByteVector(Op.getValueType())) {
++      // We can only optimize this case if the BUILD_VECTOR elements are
++      // at least as wide as the extracted value.
++      EVT OpVT = Op.getValueType();
++      unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
++      if (OpBytesPerElement < BytesPerElement)
++        break;
++      // Make sure that the least-significant bit of the extracted value
++      // is the least significant bit of an input.
++      unsigned End = (Index + 1) * BytesPerElement;
++      if (End % OpBytesPerElement != 0)
++        break;
++      // We're extracting the low part of one operand of the BUILD_VECTOR.
++      Op = Op.getOperand(End / OpBytesPerElement - 1);
++      if (!Op.getValueType().isInteger()) {
++        EVT VT = MVT::getIntegerVT(Op.getValueType().getSizeInBits());
++        Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
++        DCI.AddToWorklist(Op.getNode());
++      }
++      EVT VT = MVT::getIntegerVT(ResVT.getSizeInBits());
++      Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
++      if (VT != ResVT) {
++        DCI.AddToWorklist(Op.getNode());
++        Op = DAG.getNode(ISD::BITCAST, DL, ResVT, Op);
++      }
++      return Op;
++    } else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
++                Opcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
++                Opcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
++               canTreatAsByteVector(Op.getValueType()) &&
++               canTreatAsByteVector(Op.getOperand(0).getValueType())) {
++      // Make sure that only the unextended bits are significant.
++      EVT ExtVT = Op.getValueType();
++      EVT OpVT = Op.getOperand(0).getValueType();
++      unsigned ExtBytesPerElement = ExtVT.getVectorElementType().getStoreSize();
++      unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
++      unsigned Byte = Index * BytesPerElement;
++      unsigned SubByte = Byte % ExtBytesPerElement;
++      unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement;
++      if (SubByte < MinSubByte ||
++          SubByte + BytesPerElement > ExtBytesPerElement)
++        break;
++      // Get the byte offset of the unextended element
++      Byte = Byte / ExtBytesPerElement * OpBytesPerElement;
++      // ...then add the byte offset relative to that element.
++      Byte += SubByte - MinSubByte;
++      if (Byte % BytesPerElement != 0)
++        break;
++      Op = Op.getOperand(0);
++      Index = Byte / BytesPerElement;
++      Force = true;
++    } else
++      break;
++  }
++  if (Force) {
++    if (Op.getValueType() != VecVT) {
++      Op = DAG.getNode(ISD::BITCAST, DL, VecVT, Op);
++      DCI.AddToWorklist(Op.getNode());
++    }
++    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op,
++                       DAG.getConstant(Index, MVT::i32));
++  }
++  return SDValue();
++}
++
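combineExtract's shuffle case depends on getShuffleInput proving that the extracted bytes form one contiguous run from a single source. A simplified standalone version of that test (the real helper also tolerates undefined bytes and spans both operands; this sketch treats undef as failure):

    #include <vector>

    // Bytes[I] is the source byte index feeding result byte I, or -1 if
    // undefined.  Returns true if result bytes [Start, Start+Size) come from
    // consecutive source bytes, writing the first source byte to First.
    bool isContiguousRun(const std::vector<int> &Bytes, unsigned Start,
                         unsigned Size, int &First) {
      First = Bytes[Start];
      if (First < 0)
        return false; // simplified: treat an undefined byte as failure
      for (unsigned I = 1; I < Size; ++I)
        if (Bytes[Start + I] != First + int(I))
          return false; // not consecutive: cannot extract directly
      return true;
    }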
++// Optimize vector operations in scalar value Op on the basis that Op
++// is truncated to TruncVT.
++SDValue
++SystemZTargetLowering::combineTruncateExtract(SDLoc DL, EVT TruncVT, SDValue Op,
++                                              DAGCombinerInfo &DCI) const {
++  // If we have (trunc (extract_vector_elt X, Y)), try to turn it into
++  // (extract_vector_elt (bitcast X), Y'), where (bitcast X) has elements
++  // of type TruncVT.
++  if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
++      TruncVT.getSizeInBits() % 8 == 0) {
++    SDValue Vec = Op.getOperand(0);
++    EVT VecVT = Vec.getValueType();
++    if (canTreatAsByteVector(VecVT)) {
++      if (auto *IndexN = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
++        unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
++        unsigned TruncBytes = TruncVT.getStoreSize();
++        if (BytesPerElement % TruncBytes == 0) {
++          // Calculate the value of Y' in the above description.  We are
++          // splitting the original elements into Scale equal-sized pieces
++          // and for truncation purposes want the last (least-significant)
++          // of these pieces for IndexN.  This is easiest to do by calculating
++          // the start index of the following element and then subtracting 1.
++          unsigned Scale = BytesPerElement / TruncBytes;
++          unsigned NewIndex = (IndexN->getZExtValue() + 1) * Scale - 1;
++
++          // Defer the creation of the bitcast from X to combineExtract,
++          // which might be able to optimize the extraction.
++          VecVT = MVT::getVectorVT(MVT::getIntegerVT(TruncBytes * 8),
++                                   VecVT.getStoreSize() / TruncBytes);
++          EVT ResVT = (TruncBytes < 4 ? MVT::i32 : TruncVT);
++          return combineExtract(DL, ResVT, VecVT, Vec, NewIndex, DCI, true);
++        }
++      }
++    }
++  }
++  return SDValue();
++}
++
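A worked example of the Y' calculation above, under the assumption of SystemZ's big-endian element layout:

    #include <cassert>

    // Truncating element Index keeps the last (least-significant) of the
    // Scale equal-sized pieces that the element splits into.
    unsigned truncatedIndex(unsigned Index, unsigned BytesPerElement,
                            unsigned TruncBytes) {
      unsigned Scale = BytesPerElement / TruncBytes;
      return (Index + 1) * Scale - 1;
    }

    int main() {
      // v4i32 element 1 truncated to i8: the element occupies bytes 4..7 of
      // the v16i8 view, and the least-significant piece is byte 7.
      assert(truncatedIndex(1, 4, 1) == 7);
      return 0;
    }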
+ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+   SelectionDAG &DAG = DCI.DAG;
+@@ -2559,6 +4649,114 @@ SDValue SystemZTargetLowering::PerformDA
+       }
+     }
+   }
++  if (Opcode == SystemZISD::MERGE_HIGH ||
++      Opcode == SystemZISD::MERGE_LOW) {
++    SDValue Op0 = N->getOperand(0);
++    SDValue Op1 = N->getOperand(1);
++    if (Op0.getOpcode() == ISD::BITCAST)
++      Op0 = Op0.getOperand(0);
++    if (Op0.getOpcode() == SystemZISD::BYTE_MASK &&
++        cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) {
++      // (z_merge_* 0, 0) -> 0.  This is mostly useful for using VLLEZF
++      // for v4f32.
++      if (Op1 == N->getOperand(0))
++        return Op1;
++      // (z_merge_? 0, X) -> (z_unpackl_? 0, X).
++      EVT VT = Op1.getValueType();
++      unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
++      if (ElemBytes <= 4) {
++        Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
++                  SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
++        EVT InVT = VT.changeVectorElementTypeToInteger();
++        EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
++                                     SystemZ::VectorBytes / ElemBytes / 2);
++        if (VT != InVT) {
++          Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
++          DCI.AddToWorklist(Op1.getNode());
++        }
++        SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
++        DCI.AddToWorklist(Op.getNode());
++        return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
++      }
++    }
++  }
++  // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
++  // for the extraction to be done on a vMiN value, so that we can use VSTE.
++  // If X has wider elements then convert it to:
++  // (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z).
++  if (Opcode == ISD::STORE) {
++    auto *SN = cast<StoreSDNode>(N);
++    EVT MemVT = SN->getMemoryVT();
++    if (MemVT.isInteger()) {
++      SDValue Value = combineTruncateExtract(SDLoc(N), MemVT,
++                                             SN->getValue(), DCI);
++      if (Value.getNode()) {
++        DCI.AddToWorklist(Value.getNode());
++
++        // Rewrite the store with the new form of stored value.
++        return DAG.getTruncStore(SN->getChain(), SDLoc(SN), Value,
++                                 SN->getBasePtr(), SN->getMemoryVT(),
++                                 SN->getMemOperand());
++      }
++    }
++  }
++  // Try to simplify a vector extraction.
++  if (Opcode == ISD::EXTRACT_VECTOR_ELT) {
++    if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
++      SDValue Op0 = N->getOperand(0);
++      EVT VecVT = Op0.getValueType();
++      return combineExtract(SDLoc(N), N->getValueType(0), VecVT, Op0,
++                            IndexN->getZExtValue(), DCI, false);
++    }
++  }
++  // (join_dwords X, X) == (replicate X)
++  if (Opcode == SystemZISD::JOIN_DWORDS &&
++      N->getOperand(0) == N->getOperand(1))
++    return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
++                       N->getOperand(0));
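The (z_merge_? 0, X) -> (z_unpackl_? 0, X) rewrite above relies on merge interleaving a zero element in front of each element of X, which for the high half is exactly a zero-extending unpack. An element-level sketch for 2-byte elements (plain arrays stand in for vector registers; big-endian order assumed, as on SystemZ):

    #include <array>
    #include <cassert>
    #include <cstdint>

    // Merge-high on v8i16: result alternates elements 0..3 of A and B.
    std::array<uint16_t, 8> mergeHigh(const std::array<uint16_t, 8> &A,
                                      const std::array<uint16_t, 8> &B) {
      return {A[0], B[0], A[1], B[1], A[2], B[2], A[3], B[3]};
    }

    int main() {
      std::array<uint16_t, 8> Zero{}, X{1, 2, 3, 4, 5, 6, 7, 8};
      auto R = mergeHigh(Zero, X);
      // Each (0, X[i]) pair is the 32-bit zero extension of X[i] on a
      // big-endian machine, matching the unpack-logical-high semantics.
      assert(R[0] == 0 && R[1] == 1 && R[2] == 0 && R[3] == 2);
      return 0;
    }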
++  // (fround (extract_vector_elt X 0))
++  // (fround (extract_vector_elt X 1)) ->
++  // (extract_vector_elt (VROUND X) 0)
++  // (extract_vector_elt (VROUND X) 1)
++  //
++  // This is a special case since the target doesn't really support v2f32s.
++  if (Opcode == ISD::FP_ROUND) {
++    SDValue Op0 = N->getOperand(0);
++    if (N->getValueType(0) == MVT::f32 &&
++        Op0.hasOneUse() &&
++        Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
++        Op0.getOperand(0).getValueType() == MVT::v2f64 &&
++        Op0.getOperand(1).getOpcode() == ISD::Constant &&
++        cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
++      SDValue Vec = Op0.getOperand(0);
++      for (auto *U : Vec->uses()) {
++        if (U != Op0.getNode() &&
++            U->hasOneUse() &&
++            U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
++            U->getOperand(0) == Vec &&
++            U->getOperand(1).getOpcode() == ISD::Constant &&
++            cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
++          SDValue OtherRound = SDValue(*U->use_begin(), 0);
++          if (OtherRound.getOpcode() == ISD::FP_ROUND &&
++              OtherRound.getOperand(0) == SDValue(U, 0) &&
++              OtherRound.getValueType() == MVT::f32) {
++            SDValue VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
++                                         MVT::v4f32, Vec);
++            DCI.AddToWorklist(VRound.getNode());
++            SDValue Extract1 =
++              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
++                          VRound, DAG.getConstant(2, MVT::i32));
++            DCI.AddToWorklist(Extract1.getNode());
++            DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);
++            SDValue Extract0 =
++              DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
++                          VRound, DAG.getConstant(0, MVT::i32));
++            return Extract0;
++          }
++        }
++      }
++    }
++  }
+   return SDValue();
+ }
+ 
+@@ -3338,6 +5536,57 @@ SystemZTargetLowering::emitStringWrapper
+   return DoneMBB;
+ }
+ 
++// Update TBEGIN instruction with final opcode and register clobbers.
++MachineBasicBlock *
++SystemZTargetLowering::emitTransactionBegin(MachineInstr *MI,
++                                            MachineBasicBlock *MBB,
++                                            unsigned Opcode,
++                                            bool NoFloat) const {
++  MachineFunction &MF = *MBB->getParent();
++  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
++  const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
++
++  // Update opcode.
++  MI->setDesc(TII->get(Opcode));
++
++  // We cannot handle a TBEGIN that clobbers the stack or frame pointer.
++  // Make sure to add the corresponding GRSM bits if they are missing.
++  uint64_t Control = MI->getOperand(2).getImm();
++  static const unsigned GPRControlBit[16] = {
++    0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000,
++    0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100
++  };
++  Control |= GPRControlBit[15];
++  if (TFI->hasFP(MF))
++    Control |= GPRControlBit[11];
++  MI->getOperand(2).setImm(Control);
++
++  // Add GPR clobbers.
++  for (int I = 0; I < 16; I++) {
++    if ((Control & GPRControlBit[I]) == 0) {
++      unsigned Reg = SystemZMC::GR64Regs[I];
++      MI->addOperand(MachineOperand::CreateReg(Reg, true, true));
++    }
++  }
++
++  // Add FPR/VR clobbers.
++ if (!NoFloat && (Control & 4) != 0) { ++ if (Subtarget.hasVector()) { ++ for (int I = 0; I < 32; I++) { ++ unsigned Reg = SystemZMC::VR128Regs[I]; ++ MI->addOperand(MachineOperand::CreateReg(Reg, true, true)); ++ } ++ } else { ++ for (int I = 0; I < 16; I++) { ++ unsigned Reg = SystemZMC::FP64Regs[I]; ++ MI->addOperand(MachineOperand::CreateReg(Reg, true, true)); ++ } ++ } ++ } ++ ++ return MBB; ++} ++ + MachineBasicBlock *SystemZTargetLowering:: + EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const { + switch (MI->getOpcode()) { +@@ -3579,6 +5828,12 @@ EmitInstrWithCustomInserter(MachineInstr + return emitStringWrapper(MI, MBB, SystemZ::MVST); + case SystemZ::SRSTLoop: + return emitStringWrapper(MI, MBB, SystemZ::SRST); ++ case SystemZ::TBEGIN: ++ return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, false); ++ case SystemZ::TBEGIN_nofloat: ++ return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, true); ++ case SystemZ::TBEGINC: ++ return emitTransactionBegin(MI, MBB, SystemZ::TBEGINC, true); + default: + llvm_unreachable("Unexpected instr type to insert"); + } +Index: llvm-36/lib/Target/SystemZ/SystemZISelLowering.h +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZISelLowering.h ++++ llvm-36/lib/Target/SystemZ/SystemZISelLowering.h +@@ -34,6 +34,11 @@ enum { + CALL, + SIBCALL, + ++ // TLS calls. Like regular calls, except operand 1 is the TLS symbol. ++ // (The call target is implicitly __tls_get_offset.) ++ TLS_GDCALL, ++ TLS_LDCALL, ++ + // Wraps a TargetGlobalAddress that should be loaded using PC-relative + // accesses (LARL). Operand 0 is the address. + PCREL_WRAPPER, +@@ -82,6 +87,9 @@ enum { + // the number of the register. + EXTRACT_ACCESS, + ++ // Count number of bits set in operand 0 per byte. ++ POPCNT, ++ + // Wrappers around the ISD opcodes of the same name. The output and + // first input operands are GR128s. The trailing numbers are the + // widths of the second operand in bits. +@@ -138,6 +146,135 @@ enum { + // Perform a serialization operation. (BCR 15,0 or BCR 14,0.) + SERIALIZE, + ++ // Transaction begin. The first operand is the chain, the second ++ // the TDB pointer, and the third the immediate control field. ++ // Returns chain and glue. ++ TBEGIN, ++ TBEGIN_NOFLOAT, ++ ++ // Transaction end. Just the chain operand. Returns chain and glue. ++ TEND, ++ ++ // Create a vector constant by filling byte N of the result with bit ++ // 15-N of the single operand. ++ BYTE_MASK, ++ ++ // Create a vector constant by replicating an element-sized RISBG-style mask. ++ // The first operand specifies the starting set bit and the second operand ++ // specifies the ending set bit. Both operands count from the MSB of the ++ // element. ++ ROTATE_MASK, ++ ++ // Replicate a GPR scalar value into all elements of a vector. ++ REPLICATE, ++ ++ // Create a vector from two i64 GPRs. ++ JOIN_DWORDS, ++ ++ // Replicate one element of a vector into all elements. The first operand ++ // is the vector and the second is the index of the element to replicate. ++ SPLAT, ++ ++ // Interleave elements from the high half of operand 0 and the high half ++ // of operand 1. ++ MERGE_HIGH, ++ ++ // Likewise for the low halves. ++ MERGE_LOW, ++ ++ // Concatenate the vectors in the first two operands, shift them left ++ // by the third operand, and take the first half of the result. 
++  SHL_DOUBLE,
++
++  // Take one element of the first v2i64 operand and one element of the
++  // second v2i64 operand and concatenate them to form a v2i64 result.
++  // The third operand is a 4-bit value of the form 0A0B, where A and B
++  // are the element selectors for the first and second operands
++  // respectively.
++  PERMUTE_DWORDS,
++
++  // Perform a general vector permute on vector operands 0 and 1.
++  // Each byte of operand 2 controls the corresponding byte of the result,
++  // in the same way as a byte-level VECTOR_SHUFFLE mask.
++  PERMUTE,
++
++  // Pack vector operands 0 and 1 into a single vector with half-sized
++  // elements.
++  PACK,
++
++  // Likewise, but saturate the result and set CC.  PACKS_CC does signed
++  // saturation and PACKLS_CC does unsigned saturation.
++  PACKS_CC,
++  PACKLS_CC,
++
++  // Unpack the first half of vector operand 0 into double-sized elements.
++  // UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends.
++  UNPACK_HIGH,
++  UNPACKL_HIGH,
++
++  // Likewise for the second half.
++  UNPACK_LOW,
++  UNPACKL_LOW,
++
++  // Shift each element of vector operand 0 by the number of bits specified
++  // by scalar operand 1.
++  VSHL_BY_SCALAR,
++  VSRL_BY_SCALAR,
++  VSRA_BY_SCALAR,
++
++  // For each element of the output type, sum across all sub-elements of
++  // operand 0 belonging to the corresponding element, and add in the
++  // rightmost sub-element of the corresponding element of operand 1.
++  VSUM,
++
++  // Compare integer vector operands 0 and 1 to produce the usual 0/-1
++  // vector result.  VICMPE is for equality, VICMPH for "signed greater than"
++  // and VICMPHL for "unsigned greater than".
++  VICMPE,
++  VICMPH,
++  VICMPHL,
++
++  // Likewise, but also set the condition codes on the result.
++  VICMPES,
++  VICMPHS,
++  VICMPHLS,
++
++  // Compare floating-point vector operands 0 and 1 to produce the usual 0/-1
++  // vector result.  VFCMPE is for "ordered and equal", VFCMPH for "ordered and
++  // greater than" and VFCMPHE for "ordered and greater than or equal to".
++  VFCMPE,
++  VFCMPH,
++  VFCMPHE,
++
++  // Likewise, but also set the condition codes on the result.
++  VFCMPES,
++  VFCMPHS,
++  VFCMPHES,
++
++  // Test floating-point data class for vectors.
++  VFTCI,
++
++  // Extend the even f32 elements of vector operand 0 to produce a vector
++  // of f64 elements.
++  VEXTEND,
++
++  // Round the f64 elements of vector operand 0 to f32s and store them in the
++  // even elements of the result.
++  VROUND,
++
++  // AND the two vector operands together and set CC based on the result.
++  VTM,
++
++  // String operations that set CC as a side-effect.
++  VFAE_CC,
++  VFAEZ_CC,
++  VFEE_CC,
++  VFEEZ_CC,
++  VFENE_CC,
++  VFENEZ_CC,
++  VISTR_CC,
++  VSTRC_CC,
++  VSTRCZ_CC,
++
+   // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
+   // ATOMIC_LOAD_<op>.
+   //
+@@ -204,9 +341,33 @@ public:
+   MVT getScalarShiftAmountTy(EVT LHSTy) const override {
+     return MVT::i32;
+   }
++  MVT getVectorIdxTy() const override {
++    // Only the lower 12 bits of an element index are used, so we don't
++    // want to clobber the upper 32 bits of a GPR unnecessarily.
++    return MVT::i32;
++  }
++  TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
++    const override {
++    // Widen subvectors to the full width rather than promoting integer
++    // elements.  This is better because:
++    //
++    // (a) it means that we can handle the ABI for passing and returning
++    //     sub-128 vectors without having to handle them as legal types.
++ // ++ // (b) we don't have instructions to extend on load and truncate on store, ++ // so promoting the integers is less efficient. ++ // ++ // (c) there are no multiplication instructions for the widest integer ++ // type (v2i64). ++ if (VT.getVectorElementType().getSizeInBits() % 8 == 0) ++ return TypeWidenVector; ++ return TargetLoweringBase::getPreferredVectorAction(VT); ++ } + EVT getSetCCResultType(LLVMContext &, EVT) const override; + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; ++ bool isLegalICmpImmediate(int64_t Imm) const override; ++ bool isLegalAddImmediate(int64_t Imm) const override; + bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, + unsigned Align, +@@ -257,6 +418,9 @@ private: + SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerGlobalAddress(GlobalAddressSDNode *Node, + SelectionDAG &DAG) const; ++ SDValue lowerTLSGetOffset(GlobalAddressSDNode *Node, ++ SelectionDAG &DAG, unsigned Opcode, ++ SDValue GOTOffset) const; + SDValue lowerGlobalTLSAddress(GlobalAddressSDNode *Node, + SelectionDAG &DAG) const; + SDValue lowerBlockAddress(BlockAddressSDNode *Node, +@@ -272,6 +436,7 @@ private: + SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG, +@@ -282,6 +447,22 @@ private: + SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, ++ unsigned UnpackHigh) const; ++ SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const; ++ ++ SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp, ++ unsigned Index, DAGCombinerInfo &DCI, ++ bool Force) const; ++ SDValue combineTruncateExtract(SDLoc DL, EVT TruncVT, SDValue Op, ++ DAGCombinerInfo &DCI) const; + + // If the last instruction before MBBI in MBB was some form of COMPARE, + // try to replace it with a COMPARE AND BRANCH just before MBBI. 
+@@ -319,6 +500,10 @@ private:
+   MachineBasicBlock *emitStringWrapper(MachineInstr *MI,
+                                        MachineBasicBlock *BB,
+                                        unsigned Opcode) const;
++  MachineBasicBlock *emitTransactionBegin(MachineInstr *MI,
++                                          MachineBasicBlock *MBB,
++                                          unsigned Opcode,
++                                          bool NoFloat) const;
+ };
+ } // end namespace llvm
+ 
+Index: llvm-36/lib/Target/SystemZ/SystemZInstrFP.td
+===================================================================
+--- llvm-36.orig/lib/Target/SystemZ/SystemZInstrFP.td
++++ llvm-36/lib/Target/SystemZ/SystemZInstrFP.td
+@@ -46,9 +46,14 @@ let Defs = [CC], CCValues = 0xF, Compare
+   defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>;
+   defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>;
+ }
+-defm : CompareZeroFP<LTEBRCompare, FP32>;
+-defm : CompareZeroFP<LTDBRCompare, FP64>;
+-defm : CompareZeroFP<LTXBRCompare, FP128>;
++// Note that the comparison against zero operation is not available if we
++// have vector support, since load-and-test instructions will partially
++// clobber the target (vector) register.
++let Predicates = [FeatureNoVector] in {
++  defm : CompareZeroFP<LTEBRCompare, FP32>;
++  defm : CompareZeroFP<LTDBRCompare, FP64>;
++  defm : CompareZeroFP<LTXBRCompare, FP128>;
++}
+ 
+ // Moves between 64-bit integer and floating-point registers.
+ def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>;
+@@ -98,6 +103,9 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1
+   defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
+   defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
+ 
++  // For z13 we prefer LDE over LE to avoid partial register dependencies.
++  def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>;
++
+   // These instructions are split after register allocation, so we don't
+   // want a custom inserter.
+   let Has20BitOffset = 1, HasIndex = 1, Is128Bit = 1 in {
+@@ -141,7 +149,7 @@ def LDXBRA : UnaryRRF4<"ldxbra", 0xB345,
+                        Requires<[FeatureFPExtension]>;
+ 
+ def : Pat<(f32 (fround FP128:$src)),
+-          (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hh32)>;
++          (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>;
+ def : Pat<(f64 (fround FP128:$src)),
+           (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>;
+ 
+@@ -345,13 +353,13 @@ def MDB : BinaryRXE<"mdb", 0xED1C, fmu
+ def MDEBR : BinaryRRE<"mdeb", 0xB30C, null_frag, FP64, FP32>;
+ def : Pat<(fmul (f64 (fextend FP32:$src1)), (f64 (fextend FP32:$src2))),
+           (MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+-                                FP32:$src1, subreg_h32), FP32:$src2)>;
++                                FP32:$src1, subreg_r32), FP32:$src2)>;
+ 
+ // f64 multiplication of an FP32 register and an f32 memory.
+ def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
+ def : Pat<(fmul (f64 (fextend FP32:$src1)),
+                 (f64 (extloadf32 bdxaddr12only:$addr))),
+-          (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32),
++          (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_r32),
+             bdxaddr12only:$addr)>;
+ 
+ // f128 multiplication of two FP64 registers.
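The instruction-format classes added in the next file all split a 5-bit vector register number across a 4-bit register field and one bit of the RXB area (instruction bits 11-8). A sketch of that packing for the V1 field of the VRI-a layout, using the bit positions from the class definitions that follow (the helper name is hypothetical; the 48-bit image is kept in a 64-bit integer):

    #include <cassert>
    #include <cstdint>

    uint64_t encodeVRIaRegister(uint64_t Inst, unsigned V1) {
      assert(V1 < 32 && "vector register numbers are five bits");
      Inst |= uint64_t(V1 & 0xF) << 36; // Inst{39-36} = V1{3-0}
      Inst |= uint64_t(V1 >> 4) << 11;  // Inst{11}    = V1{4}
      return Inst;
    }

This split is what lets the vector formats address all 32 vector registers while keeping the classic 4-bit register fields of the base instruction formats.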
+Index: llvm-36/lib/Target/SystemZ/SystemZInstrFormats.td
+===================================================================
+--- llvm-36.orig/lib/Target/SystemZ/SystemZInstrFormats.td
++++ llvm-36/lib/Target/SystemZ/SystemZInstrFormats.td
+@@ -142,10 +142,13 @@ def getThreeOperandOpcode : InstrMapping
+ // Formats are specified using operand field declarations of the form:
+ //
+ //   bits<4>  Rn   : register input or output for operand n
++//   bits<5>  Vn   : vector register input or output for operand n
+ //   bits<m>  In   : immediate value of width m for operand n
+ //   bits<4>  BDn  : address operand n, which has a base and a displacement
+ //   bits<20> XBDn : address operand n, which has an index, a base and a
+ //                   displacement
++//   bits<21> VBDn : address operand n, which has a vector index, a base and a
++//                   displacement
+ //   bits<4>  Xn   : index register for address operand n
+ //   bits<4>  Mn   : mode value for operand n
+ //
+@@ -339,11 +342,13 @@ class InstRXE<bits<16> op, dag outs, dag
+ 
+   bits<4> R1;
+   bits<20> XBD2;
++  bits<4> M3;
+ 
+   let Inst{47-40} = op{15-8};
+   let Inst{39-36} = R1;
+   let Inst{35-16} = XBD2;
+-  let Inst{15-8} = 0;
++  let Inst{15-12} = M3;
++  let Inst{11-8} = 0;
+   let Inst{7-0} = op{7-0};
+ 
+   let HasIndex = 1;
+@@ -473,6 +478,393 @@ class InstSS<bits<8> op, dag outs, dag i
+   let Inst{15-0} = BD2;
+ }
+ 
++class InstS<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
++  : InstSystemZ<4, outs, ins, asmstr, pattern> {
++  field bits<32> Inst;
++  field bits<32> SoftFail = 0;
++
++  bits<16> BD2;
++
++  let Inst{31-16} = op;
++  let Inst{15-0} = BD2;
++}
++
++class InstVRIa<bits<16> op, dag outs, dag ins, string asmstr,
++               list<dag> pattern>
++  : InstSystemZ<6, outs, ins, asmstr, pattern> {
++  field bits<48> Inst;
++  field bits<48> SoftFail = 0;
++
++  bits<5> V1;
++  bits<16> I2;
++  bits<4> M3;
++
++  let Inst{47-40} = op{15-8};
++  let Inst{39-36} = V1{3-0};
++  let Inst{35-32} = 0;
++  let Inst{31-16} = I2;
++  let Inst{15-12} = M3;
++  let Inst{11} = V1{4};
++  let Inst{10-8} = 0;
++  let Inst{7-0} = op{7-0};
++}
++
++class InstVRIb<bits<16> op, dag outs, dag ins, string asmstr,
++               list<dag> pattern>
++  : InstSystemZ<6, outs, ins, asmstr, pattern> {
++  field bits<48> Inst;
++  field bits<48> SoftFail = 0;
++
++  bits<5> V1;
++  bits<8> I2;
++  bits<8> I3;
++  bits<4> M4;
++
++  let Inst{47-40} = op{15-8};
++  let Inst{39-36} = V1{3-0};
++  let Inst{35-32} = 0;
++  let Inst{31-24} = I2;
++  let Inst{23-16} = I3;
++  let Inst{15-12} = M4;
++  let Inst{11} = V1{4};
++  let Inst{10-8} = 0;
++  let Inst{7-0} = op{7-0};
++}
++
++class InstVRIc<bits<16> op, dag outs, dag ins, string asmstr,
++               list<dag> pattern>
++  : InstSystemZ<6, outs, ins, asmstr, pattern> {
++  field bits<48> Inst;
++  field bits<48> SoftFail = 0;
++
++  bits<5> V1;
++  bits<5> V3;
++  bits<16> I2;
++  bits<4> M4;
++
++  let Inst{47-40} = op{15-8};
++  let Inst{39-36} = V1{3-0};
++  let Inst{35-32} = V3{3-0};
++  let Inst{31-16} = I2;
++  let Inst{15-12} = M4;
++  let Inst{11} = V1{4};
++  let Inst{10} = V3{4};
++  let Inst{9-8} = 0;
++  let Inst{7-0} = op{7-0};
++}
++
++class InstVRId<bits<16> op, dag outs, dag ins, string asmstr,
++               list<dag> pattern>
++  : InstSystemZ<6, outs, ins, asmstr, pattern> {
++  field bits<48> Inst;
++  field bits<48> SoftFail = 0;
++
++  bits<5> V1;
++  bits<5> V2;
++  bits<5> V3;
++  bits<8> I4;
++  bits<4> M5;
++
++  let Inst{47-40} = op{15-8};
++  let Inst{39-36} = V1{3-0};
++  let Inst{35-32} = V2{3-0};
++  let Inst{31-28} = V3{3-0};
++  let Inst{27-24} = 0;
++  let Inst{23-16} = I4;
++  let Inst{15-12} = M5;
++  let Inst{11} = V1{4};
++  let Inst{10} = V2{4};
++  let Inst{9} = V3{4};
++  let Inst{8} = 0;
++  let Inst{7-0} = op{7-0};
++} ++ ++class InstVRIe op, dag outs, dag ins, string asmstr, list pattern> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<5> V1; ++ bits<5> V2; ++ bits<12> I3; ++ bits<4> M4; ++ bits<4> M5; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = V1{3-0}; ++ let Inst{35-32} = V2{3-0}; ++ let Inst{31-20} = I3; ++ let Inst{19-16} = M5; ++ let Inst{15-12} = M4; ++ let Inst{11} = V1{4}; ++ let Inst{10} = V2{4}; ++ let Inst{9-8} = 0; ++ let Inst{7-0} = op{7-0}; ++} ++ ++// Depending on the instruction mnemonic, certain bits may be or-ed into ++// the M4 value provided as explicit operand. These are passed as m4or. ++class InstVRRa op, dag outs, dag ins, string asmstr, list pattern, ++ bits<4> m4or = 0> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<5> V1; ++ bits<5> V2; ++ bits<4> M3; ++ bits<4> M4; ++ bits<4> M5; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = V1{3-0}; ++ let Inst{35-32} = V2{3-0}; ++ let Inst{31-24} = 0; ++ let Inst{23-20} = M5; ++ let Inst{19} = !if (!eq (m4or{3}, 1), 1, M4{3}); ++ let Inst{18} = !if (!eq (m4or{2}, 1), 1, M4{2}); ++ let Inst{17} = !if (!eq (m4or{1}, 1), 1, M4{1}); ++ let Inst{16} = !if (!eq (m4or{0}, 1), 1, M4{0}); ++ let Inst{15-12} = M3; ++ let Inst{11} = V1{4}; ++ let Inst{10} = V2{4}; ++ let Inst{9-8} = 0; ++ let Inst{7-0} = op{7-0}; ++} ++ ++// Depending on the instruction mnemonic, certain bits may be or-ed into ++// the M5 value provided as explicit operand. These are passed as m5or. ++class InstVRRb op, dag outs, dag ins, string asmstr, list pattern, ++ bits<4> m5or = 0> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<5> V1; ++ bits<5> V2; ++ bits<5> V3; ++ bits<4> M4; ++ bits<4> M5; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = V1{3-0}; ++ let Inst{35-32} = V2{3-0}; ++ let Inst{31-28} = V3{3-0}; ++ let Inst{27-24} = 0; ++ let Inst{23} = !if (!eq (m5or{3}, 1), 1, M5{3}); ++ let Inst{22} = !if (!eq (m5or{2}, 1), 1, M5{2}); ++ let Inst{21} = !if (!eq (m5or{1}, 1), 1, M5{1}); ++ let Inst{20} = !if (!eq (m5or{0}, 1), 1, M5{0}); ++ let Inst{19-16} = 0; ++ let Inst{15-12} = M4; ++ let Inst{11} = V1{4}; ++ let Inst{10} = V2{4}; ++ let Inst{9} = V3{4}; ++ let Inst{8} = 0; ++ let Inst{7-0} = op{7-0}; ++} ++ ++class InstVRRc op, dag outs, dag ins, string asmstr, list pattern> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<5> V1; ++ bits<5> V2; ++ bits<5> V3; ++ bits<4> M4; ++ bits<4> M5; ++ bits<4> M6; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = V1{3-0}; ++ let Inst{35-32} = V2{3-0}; ++ let Inst{31-28} = V3{3-0}; ++ let Inst{27-24} = 0; ++ let Inst{23-20} = M6; ++ let Inst{19-16} = M5; ++ let Inst{15-12} = M4; ++ let Inst{11} = V1{4}; ++ let Inst{10} = V2{4}; ++ let Inst{9} = V3{4}; ++ let Inst{8} = 0; ++ let Inst{7-0} = op{7-0}; ++} ++ ++// Depending on the instruction mnemonic, certain bits may be or-ed into ++// the M6 value provided as explicit operand. These are passed as m6or. 
++class InstVRRd op, dag outs, dag ins, string asmstr, list pattern, ++ bits<4> m6or = 0> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<5> V1; ++ bits<5> V2; ++ bits<5> V3; ++ bits<5> V4; ++ bits<4> M5; ++ bits<4> M6; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = V1{3-0}; ++ let Inst{35-32} = V2{3-0}; ++ let Inst{31-28} = V3{3-0}; ++ let Inst{27-24} = M5; ++ let Inst{23} = !if (!eq (m6or{3}, 1), 1, M6{3}); ++ let Inst{22} = !if (!eq (m6or{2}, 1), 1, M6{2}); ++ let Inst{21} = !if (!eq (m6or{1}, 1), 1, M6{1}); ++ let Inst{20} = !if (!eq (m6or{0}, 1), 1, M6{0}); ++ let Inst{19-16} = 0; ++ let Inst{15-12} = V4{3-0}; ++ let Inst{11} = V1{4}; ++ let Inst{10} = V2{4}; ++ let Inst{9} = V3{4}; ++ let Inst{8} = V4{4}; ++ let Inst{7-0} = op{7-0}; ++} ++ ++class InstVRRe op, dag outs, dag ins, string asmstr, list pattern> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<5> V1; ++ bits<5> V2; ++ bits<5> V3; ++ bits<5> V4; ++ bits<4> M5; ++ bits<4> M6; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = V1{3-0}; ++ let Inst{35-32} = V2{3-0}; ++ let Inst{31-28} = V3{3-0}; ++ let Inst{27-24} = M6; ++ let Inst{23-20} = 0; ++ let Inst{19-16} = M5; ++ let Inst{15-12} = V4{3-0}; ++ let Inst{11} = V1{4}; ++ let Inst{10} = V2{4}; ++ let Inst{9} = V3{4}; ++ let Inst{8} = V4{4}; ++ let Inst{7-0} = op{7-0}; ++} ++ ++class InstVRRf op, dag outs, dag ins, string asmstr, list pattern> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<5> V1; ++ bits<4> R2; ++ bits<4> R3; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = V1{3-0}; ++ let Inst{35-32} = R2; ++ let Inst{31-28} = R3; ++ let Inst{27-12} = 0; ++ let Inst{11} = V1{4}; ++ let Inst{10-8} = 0; ++ let Inst{7-0} = op{7-0}; ++} ++ ++class InstVRSa op, dag outs, dag ins, string asmstr, list pattern> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<5> V1; ++ bits<16> BD2; ++ bits<5> V3; ++ bits<4> M4; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = V1{3-0}; ++ let Inst{35-32} = V3{3-0}; ++ let Inst{31-16} = BD2; ++ let Inst{15-12} = M4; ++ let Inst{11} = V1{4}; ++ let Inst{10} = V3{4}; ++ let Inst{9-8} = 0; ++ let Inst{7-0} = op{7-0}; ++} ++ ++class InstVRSb op, dag outs, dag ins, string asmstr, list pattern> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<5> V1; ++ bits<16> BD2; ++ bits<4> R3; ++ bits<4> M4; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = V1{3-0}; ++ let Inst{35-32} = R3; ++ let Inst{31-16} = BD2; ++ let Inst{15-12} = M4; ++ let Inst{11} = V1{4}; ++ let Inst{10-8} = 0; ++ let Inst{7-0} = op{7-0}; ++} ++ ++class InstVRSc op, dag outs, dag ins, string asmstr, list pattern> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<4> R1; ++ bits<16> BD2; ++ bits<5> V3; ++ bits<4> M4; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = R1; ++ let Inst{35-32} = V3{3-0}; ++ let Inst{31-16} = BD2; ++ let Inst{15-12} = M4; ++ let Inst{11} = 0; ++ let Inst{10} = V3{4}; ++ let Inst{9-8} = 0; ++ let Inst{7-0} = op{7-0}; ++} ++ ++class InstVRV op, dag outs, dag ins, string asmstr, list pattern> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<5> V1; ++ bits<21> VBD2; 
++ bits<4> M3; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = V1{3-0}; ++ let Inst{35-16} = VBD2{19-0}; ++ let Inst{15-12} = M3; ++ let Inst{11} = V1{4}; ++ let Inst{10} = VBD2{20}; ++ let Inst{9-8} = 0; ++ let Inst{7-0} = op{7-0}; ++} ++ ++class InstVRX op, dag outs, dag ins, string asmstr, list pattern> ++ : InstSystemZ<6, outs, ins, asmstr, pattern> { ++ field bits<48> Inst; ++ field bits<48> SoftFail = 0; ++ ++ bits<5> V1; ++ bits<20> XBD2; ++ bits<4> M3; ++ ++ let Inst{47-40} = op{15-8}; ++ let Inst{39-36} = V1{3-0}; ++ let Inst{35-16} = XBD2; ++ let Inst{15-12} = M3; ++ let Inst{11} = V1{4}; ++ let Inst{10-8} = 0; ++ let Inst{7-0} = op{7-0}; ++} ++ + //===----------------------------------------------------------------------===// + // Instruction definitions with semantics + //===----------------------------------------------------------------------===// +@@ -492,12 +884,6 @@ class InstSS op, dag outs, dag i + // form of the source register in the destination register and + // branches on the result. + // +-// Store: +-// One register or immediate input operand and one address input operand. +-// The instruction stores the first operand to the address. +-// +-// This category is used for both pure and truncating stores. +-// + // LoadMultiple: + // One address input operand and two explicit output operands. + // The instruction loads a range of registers from the address, +@@ -510,18 +896,35 @@ class InstSS op, dag outs, dag i + // with the explicit operands giving the first and last register + // to store. Other stored registers are added as implicit uses. + // ++// StoreLength: ++// One value operand, one length operand and one address operand. ++// The instruction stores the value operand to the address but ++// doesn't write more than the number of bytes specified by the ++// length operand. ++// + // Unary: + // One register output operand and one input operand. + // ++// Store: ++// One address operand and one other input operand. The instruction ++// stores to the address. ++// + // Binary: + // One register output operand and two input operands. + // ++// StoreBinary: ++// One address operand and two other input operands. The instruction ++// stores to the address. ++// + // Compare: + // Two input operands and an implicit CC output operand. + // + // Ternary: + // One register output operand and three input operands. + // ++// Quaternary: ++// One register output operand and four input operands. ++// + // LoadAndOp: + // One output operand and two input operands, one of which is an address. + // The instruction both reads from and writes to the address. 
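The StoreLength category described above (used for VSTL-style instructions) stores a value but never writes past the number of bytes given by the length operand. A rough model in plain C++, under the assumption that the length is expressed in bytes and capped at the 16-byte vector size:

    #include <algorithm>
    #include <cstdint>
    #include <cstring>

    // Store the leftmost bytes of a 16-byte vector, but never more than
    // Length bytes and never more than the vector itself.
    void storeLength(const uint8_t (&Vec)[16], uint8_t *Mem, unsigned Length) {
      std::memcpy(Mem, Vec, std::min<unsigned>(Length, 16));
    }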
+@@ -556,6 +959,12 @@ class InherentRRE opcode, bits<16> value> ++ : InstVRIa { ++ let I2 = value; ++ let M3 = 0; ++} ++ + class BranchUnaryRI opcode, RegisterOperand cls> + : InstRI { +@@ -571,6 +980,13 @@ class LoadMultipleRSY opcode> ++ : InstVRSa { ++ let M4 = 0; ++ let mayLoad = 1; ++} ++ + class StoreRILPC opcode, SDPatternOperator operator, + RegisterOperand cls> + : InstRIL opcode, SDPatternOperator operator, ++ TypedReg tr, bits<5> bytes, bits<4> type = 0> ++ : InstVRX { ++ let M3 = type; ++ let mayStore = 1; ++ let AccessBytes = bytes; ++} ++ ++class StoreLengthVRSb opcode, ++ SDPatternOperator operator, bits<5> bytes> ++ : InstVRSb { ++ let M4 = 0; ++ let mayStore = 1; ++ let AccessBytes = bytes; ++} ++ + class StoreMultipleRSY opcode, RegisterOperand cls> + : InstRSY { + let mayStore = 1; + } + ++class StoreMultipleVRSa opcode> ++ : InstVRSa { ++ let M4 = 0; ++ let mayStore = 1; ++} ++ + // StoreSI* instructions are used to store an integer to memory, but the + // addresses are more restricted than for normal stores. If we are in the + // situation of having to force either the address into a register or the +@@ -857,6 +1300,7 @@ class UnaryRXE + let OpType = "mem"; + let mayLoad = 1; + let AccessBytes = bytes; ++ let M3 = 0; + } + + class UnaryRXY opcode, SDPatternOperator operator, +@@ -883,6 +1327,46 @@ multiclass UnaryRXPair opcode, SDPatternOperator operator, ++ TypedReg tr, Immediate imm, bits<4> type = 0> ++ : InstVRIa { ++ let M3 = type; ++} ++ ++class UnaryVRRa opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0, ++ bits<4> m5 = 0> ++ : InstVRRa { ++ let M3 = type; ++ let M4 = m4; ++ let M5 = m5; ++} ++ ++multiclass UnaryVRRaSPair opcode, ++ SDPatternOperator operator, ++ SDPatternOperator operator_cc, TypedReg tr1, ++ TypedReg tr2, bits<4> type, bits<4> modifier = 0, ++ bits<4> modifier_cc = 1> { ++ def "" : UnaryVRRa; ++ let Defs = [CC] in ++ def S : UnaryVRRa; ++} ++ ++class UnaryVRX opcode, SDPatternOperator operator, ++ TypedReg tr, bits<5> bytes, bits<4> type = 0> ++ : InstVRX { ++ let M3 = type; ++ let mayLoad = 1; ++ let AccessBytes = bytes; ++} ++ + class BinaryRR opcode, SDPatternOperator operator, + RegisterOperand cls1, RegisterOperand cls2> + : InstRR opcode, SDPatternOperator operator, +@@ -1094,6 +1579,148 @@ multiclass BinarySIPair opcode, SDPatternOperator operator, ++ TypedReg tr, bits<4> type> ++ : InstVRIb { ++ let M4 = type; ++} ++ ++class BinaryVRIc opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> type> ++ : InstVRIc { ++ let M4 = type; ++} ++ ++class BinaryVRIe opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m5> ++ : InstVRIe { ++ let M4 = type; ++ let M5 = m5; ++} ++ ++class BinaryVRRa opcode> ++ : InstVRRa { ++ let M4 = 0; ++ let M5 = 0; ++} ++ ++class BinaryVRRb opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> type = 0, ++ bits<4> modifier = 0> ++ : InstVRRb { ++ let M4 = type; ++ let M5 = modifier; ++} ++ ++// Declare a pair of instructions, one which sets CC and one which doesn't. ++// The CC-setting form ends with "S" and sets the low bit of M5. 
++multiclass BinaryVRRbSPair opcode, ++ SDPatternOperator operator, ++ SDPatternOperator operator_cc, TypedReg tr1, ++ TypedReg tr2, bits<4> type, ++ bits<4> modifier = 0, bits<4> modifier_cc = 1> { ++ def "" : BinaryVRRb; ++ let Defs = [CC] in ++ def S : BinaryVRRb; ++} ++ ++class BinaryVRRc opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m5 = 0, ++ bits<4> m6 = 0> ++ : InstVRRc { ++ let M4 = type; ++ let M5 = m5; ++ let M6 = m6; ++} ++ ++multiclass BinaryVRRcSPair opcode, ++ SDPatternOperator operator, ++ SDPatternOperator operator_cc, TypedReg tr1, ++ TypedReg tr2, bits<4> type, bits<4> m5, ++ bits<4> modifier = 0, bits<4> modifier_cc = 1> { ++ def "" : BinaryVRRc; ++ let Defs = [CC] in ++ def S : BinaryVRRc; ++} ++ ++class BinaryVRRf opcode, SDPatternOperator operator, ++ TypedReg tr> ++ : InstVRRf; ++ ++class BinaryVRSa opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> type> ++ : InstVRSa { ++ let M4 = type; ++} ++ ++class BinaryVRSb opcode, SDPatternOperator operator, ++ bits<5> bytes> ++ : InstVRSb { ++ let M4 = 0; ++ let mayLoad = 1; ++ let AccessBytes = bytes; ++} ++ ++class BinaryVRSc opcode, SDPatternOperator operator, ++ TypedReg tr, bits<4> type> ++ : InstVRSc { ++ let M4 = type; ++} ++ ++class BinaryVRX opcode, SDPatternOperator operator, ++ TypedReg tr, bits<5> bytes> ++ : InstVRX { ++ let mayLoad = 1; ++ let AccessBytes = bytes; ++} ++ ++class StoreBinaryVRV opcode, bits<5> bytes, ++ Immediate index> ++ : InstVRV { ++ let mayStore = 1; ++ let AccessBytes = bytes; ++} ++ ++class StoreBinaryVRX opcode, ++ SDPatternOperator operator, TypedReg tr, bits<5> bytes, ++ Immediate index> ++ : InstVRX { ++ let mayStore = 1; ++ let AccessBytes = bytes; ++} ++ + class CompareRR opcode, SDPatternOperator operator, + RegisterOperand cls1, RegisterOperand cls2> + : InstRR opcode, SDPatternOperator operator, +@@ -1235,6 +1863,17 @@ multiclass CompareSIPair opcode, SDPatternOperator operator, ++ TypedReg tr, bits<4> type> ++ : InstVRRa { ++ let isCompare = 1; ++ let M3 = type; ++ let M4 = 0; ++ let M5 = 0; ++} ++ + class TernaryRRD opcode, + SDPatternOperator operator, RegisterOperand cls> + : InstRRD opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, Immediate imm, Immediate index> ++ : InstVRIa { ++ let Constraints = "$V1 = $V1src"; ++ let DisableEncoding = "$V1src"; ++} ++ ++class TernaryVRId opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> type> ++ : InstVRId { ++ let M5 = type; ++} ++ ++class TernaryVRRa opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m4or> ++ : InstVRRa { ++ let M3 = type; ++} ++ ++class TernaryVRRb opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> type, ++ SDPatternOperator m5mask, bits<4> m5or> ++ : InstVRRb { ++ let M4 = type; ++} ++ ++multiclass TernaryVRRbSPair opcode, ++ SDPatternOperator operator, ++ SDPatternOperator operator_cc, TypedReg tr1, ++ TypedReg tr2, bits<4> type, bits<4> m5or> { ++ def "" : TernaryVRRb; ++ def : InstAlias(NAME) tr1.op:$V1, tr2.op:$V2, ++ tr2.op:$V3, 0)>; ++ let Defs = [CC] in ++ def S : TernaryVRRb; ++ def : InstAlias(NAME#"S") tr1.op:$V1, tr2.op:$V2, ++ tr2.op:$V3, 0)>; ++} ++ ++class TernaryVRRc opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2> ++ : InstVRRc { ++ let M5 = 0; ++ let M6 = 0; ++} ++ ++class TernaryVRRd opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> type = 0> ++ : InstVRRd { ++ let M5 = type; ++ let 
M6 = 0; ++} ++ ++class TernaryVRRe opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> m5 = 0, bits<4> type = 0> ++ : InstVRRe { ++ let M5 = m5; ++ let M6 = type; ++} ++ ++class TernaryVRSb opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, RegisterOperand cls, bits<4> type> ++ : InstVRSb { ++ let Constraints = "$V1 = $V1src"; ++ let DisableEncoding = "$V1src"; ++ let M4 = type; ++} ++ ++class TernaryVRV opcode, bits<5> bytes, ++ Immediate index> ++ : InstVRV { ++ let Constraints = "$V1 = $V1src"; ++ let DisableEncoding = "$V1src"; ++ let mayLoad = 1; ++ let AccessBytes = bytes; ++} ++ ++class TernaryVRX opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<5> bytes, Immediate index> ++ : InstVRX { ++ let Constraints = "$V1 = $V1src"; ++ let DisableEncoding = "$V1src"; ++ let mayLoad = 1; ++ let AccessBytes = bytes; ++} ++ ++class QuaternaryVRId opcode, SDPatternOperator operator, ++ TypedReg tr1, TypedReg tr2, bits<4> type> ++ : InstVRId { ++ let Constraints = "$V1 = $V1src"; ++ let DisableEncoding = "$V1src"; ++ let M5 = type; ++} ++ ++class QuaternaryVRRd opcode, ++ SDPatternOperator operator, TypedReg tr1, TypedReg tr2, ++ bits<4> type, SDPatternOperator m6mask, bits<4> m6or> ++ : InstVRRd { ++ let M5 = type; ++} ++ ++multiclass QuaternaryVRRdSPair opcode, ++ SDPatternOperator operator, ++ SDPatternOperator operator_cc, TypedReg tr1, ++ TypedReg tr2, bits<4> type, bits<4> m6or> { ++ def "" : QuaternaryVRRd; ++ def : InstAlias(NAME) tr1.op:$V1, tr2.op:$V2, ++ tr2.op:$V3, tr2.op:$V4, 0)>; ++ let Defs = [CC] in ++ def S : QuaternaryVRRd; ++ def : InstAlias(NAME#"S") tr1.op:$V1, tr2.op:$V2, ++ tr2.op:$V3, tr2.op:$V4, 0)>; ++} ++ + class LoadAndOpRSY opcode, SDPatternOperator operator, + RegisterOperand cls, AddressingMode mode = bdaddr20only> + : InstRSY opcode, + RegisterOperand cls> { + def "" : UnaryRRE; +- let isCodeGenOnly = 1 in ++ let isCodeGenOnly = 1, Predicates = [FeatureNoVector] in + def Compare : CompareRRE; + } + +@@ -1577,6 +2401,26 @@ class Alias ++ : Alias<6, (outs cls1:$src1), (ins cls2:$src2), []>; ++ ++// An alias of a UnaryVRR*, but with different register sizes. ++class UnaryAliasVRR ++ : Alias<6, (outs tr1.op:$V1), (ins tr2.op:$V2), ++ [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2))))]>; ++ ++// An alias of a UnaryVRX, but with different register sizes. ++class UnaryAliasVRX ++ : Alias<6, (outs tr.op:$V1), (ins mode:$XBD2), ++ [(set tr.op:$V1, (tr.vt (operator mode:$XBD2)))]>; ++ ++// An alias of a StoreVRX, but with different register sizes. ++class StoreAliasVRX ++ : Alias<6, (outs), (ins tr.op:$V1, mode:$XBD2), ++ [(operator (tr.vt tr.op:$V1), mode:$XBD2)]>; ++ + // An alias of a BinaryRI, but with different register sizes. + class BinaryAliasRI +@@ -1593,6 +2437,10 @@ class BinaryAliasRIL ++ : Alias<6, (outs VR128:$V1), (ins cls:$R2, cls:$R3), []>; ++ + // An alias of a CompareRI, but with different register sizes. 
+ class CompareAliasRI +Index: llvm-36/lib/Target/SystemZ/SystemZInstrInfo.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZInstrInfo.cpp ++++ llvm-36/lib/Target/SystemZ/SystemZInstrInfo.cpp +@@ -578,6 +578,12 @@ SystemZInstrInfo::copyPhysReg(MachineBas + Opcode = SystemZ::LDR; + else if (SystemZ::FP128BitRegClass.contains(DestReg, SrcReg)) + Opcode = SystemZ::LXR; ++ else if (SystemZ::VR32BitRegClass.contains(DestReg, SrcReg)) ++ Opcode = SystemZ::VLR32; ++ else if (SystemZ::VR64BitRegClass.contains(DestReg, SrcReg)) ++ Opcode = SystemZ::VLR64; ++ else if (SystemZ::VR128BitRegClass.contains(DestReg, SrcReg)) ++ Opcode = SystemZ::VLR; + else + llvm_unreachable("Impossible reg-to-reg copy"); + +@@ -723,9 +729,12 @@ SystemZInstrInfo::convertToThreeAddress( + unsigned Start, End; + if (isRxSBGMask(Imm, And.RegSize, Start, End)) { + unsigned NewOpcode; +- if (And.RegSize == 64) ++ if (And.RegSize == 64) { + NewOpcode = SystemZ::RISBG; +- else { ++ // Prefer RISBGN if available, since it does not clobber CC. ++ if (STI.hasMiscellaneousExtensions()) ++ NewOpcode = SystemZ::RISBGN; ++ } else { + NewOpcode = SystemZ::RISBMux; + Start &= 31; + End &= 31; +@@ -1114,6 +1123,16 @@ void SystemZInstrInfo::getLoadStoreOpcod + } else if (RC == &SystemZ::FP128BitRegClass) { + LoadOpcode = SystemZ::LX; + StoreOpcode = SystemZ::STX; ++ } else if (RC == &SystemZ::VR32BitRegClass) { ++ LoadOpcode = SystemZ::VL32; ++ StoreOpcode = SystemZ::VST32; ++ } else if (RC == &SystemZ::VR64BitRegClass) { ++ LoadOpcode = SystemZ::VL64; ++ StoreOpcode = SystemZ::VST64; ++ } else if (RC == &SystemZ::VF128BitRegClass || ++ RC == &SystemZ::VR128BitRegClass) { ++ LoadOpcode = SystemZ::VL; ++ StoreOpcode = SystemZ::VST; + } else + llvm_unreachable("Unsupported regclass to load or store"); + } +@@ -1147,17 +1166,22 @@ unsigned SystemZInstrInfo::getOpcodeForO + + unsigned SystemZInstrInfo::getLoadAndTest(unsigned Opcode) const { + switch (Opcode) { +- case SystemZ::L: return SystemZ::LT; +- case SystemZ::LY: return SystemZ::LT; +- case SystemZ::LG: return SystemZ::LTG; +- case SystemZ::LGF: return SystemZ::LTGF; +- case SystemZ::LR: return SystemZ::LTR; +- case SystemZ::LGFR: return SystemZ::LTGFR; +- case SystemZ::LGR: return SystemZ::LTGR; +- case SystemZ::LER: return SystemZ::LTEBR; +- case SystemZ::LDR: return SystemZ::LTDBR; +- case SystemZ::LXR: return SystemZ::LTXBR; +- default: return 0; ++ case SystemZ::L: return SystemZ::LT; ++ case SystemZ::LY: return SystemZ::LT; ++ case SystemZ::LG: return SystemZ::LTG; ++ case SystemZ::LGF: return SystemZ::LTGF; ++ case SystemZ::LR: return SystemZ::LTR; ++ case SystemZ::LGFR: return SystemZ::LTGFR; ++ case SystemZ::LGR: return SystemZ::LTGR; ++ case SystemZ::LER: return SystemZ::LTEBR; ++ case SystemZ::LDR: return SystemZ::LTDBR; ++ case SystemZ::LXR: return SystemZ::LTXBR; ++ // On zEC12 we prefer to use RISBGN. But if there is a chance to ++ // actually use the condition code, we may turn it back into RISGB. ++ // Note that RISBG is not really a "load-and-test" instruction, ++ // but sets the same condition code values, so is OK to use here. ++ case SystemZ::RISBGN: return SystemZ::RISBG; ++ default: return 0; + } + } + +@@ -1178,6 +1202,7 @@ static bool isStringOfOnes(uint64_t Mask + bool SystemZInstrInfo::isRxSBGMask(uint64_t Mask, unsigned BitSize, + unsigned &Start, unsigned &End) const { + // Reject trivial all-zero masks. 
++ Mask &= allOnes(BitSize); + if (Mask == 0) + return false; + +Index: llvm-36/lib/Target/SystemZ/SystemZInstrInfo.h +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZInstrInfo.h ++++ llvm-36/lib/Target/SystemZ/SystemZInstrInfo.h +@@ -56,10 +56,13 @@ static inline unsigned getCompareZeroCCM + // SystemZ MachineOperand target flags. + enum { + // Masks out the bits for the access model. +- MO_SYMBOL_MODIFIER = (1 << 0), ++ MO_SYMBOL_MODIFIER = (3 << 0), + + // @GOT (aka @GOTENT) +- MO_GOT = (1 << 0) ++ MO_GOT = (1 << 0), ++ ++ // @INDNTPOFF ++ MO_INDNTPOFF = (2 << 0) + }; + // Classifies a branch. + enum BranchType { +Index: llvm-36/lib/Target/SystemZ/SystemZInstrInfo.td +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZInstrInfo.td ++++ llvm-36/lib/Target/SystemZ/SystemZInstrInfo.td +@@ -249,11 +249,21 @@ let isCall = 1, isTerminator = 1, isRetu + def CallBR : Alias<2, (outs), (ins), [(z_sibcall R1D)]>; + } + ++// TLS calls. These will be lowered into a call to __tls_get_offset, ++// with an extra relocation specifying the TLS symbol. ++let isCall = 1, Defs = [R14D, CC] in { ++ def TLS_GDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops), ++ [(z_tls_gdcall tglobaltlsaddr:$I2)]>; ++ def TLS_LDCALL : Alias<6, (outs), (ins tlssym:$I2, variable_ops), ++ [(z_tls_ldcall tglobaltlsaddr:$I2)]>; ++} ++ + // Define the general form of the call instructions for the asm parser. + // These instructions don't hard-code %r14 as the return address register. +-def BRAS : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16:$I2), ++// Allow an optional TLS marker symbol to generate TLS call relocations. ++def BRAS : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16tls:$I2), + "bras\t$R1, $I2", []>; +-def BRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32:$I2), ++def BRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32tls:$I2), + "brasl\t$R1, $I2", []>; + def BASR : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2), + "basr\t$R1, $R2", []>; +@@ -587,6 +597,12 @@ let hasSideEffects = 0, isAsCheapAsAMove + [(set GR64:$R1, pcrel32:$I2)]>; + } + ++// Load the Global Offset Table address. This will be lowered into a ++// larl $R1, _GLOBAL_OFFSET_TABLE_ ++// instruction. ++def GOT : Alias<6, (outs GR64:$R1), (ins), ++ [(set GR64:$R1, (global_offset_table))]>; ++ + //===----------------------------------------------------------------------===// + // Absolute and Negation + //===----------------------------------------------------------------------===// +@@ -1045,6 +1061,10 @@ let Defs = [CC] in { + def RISBG : RotateSelectRIEf<"risbg", 0xEC55, GR64, GR64>; + } + ++// On zEC12 we have a variant of RISBG that does not set CC. ++let Predicates = [FeatureMiscellaneousExtensions] in ++ def RISBGN : RotateSelectRIEf<"risbgn", 0xEC59, GR64, GR64>; ++ + // Forms of RISBG that only affect one word of the destination register. + // They do not set CC. 
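As background for the RISBGN definition above (and the high-word RISBG forms introduced by the comment just before this aside), a minimal C++ sketch of the value a rotate-then-insert-selected-bits instruction computes. The helper names are hypothetical, SystemZ's MSB-is-bit-0 numbering is assumed, and the I4 zero bit (which zeroes rather than preserves the unselected bits) is not modeled:

#include <cstdint>

// All-ones in MSB-numbered bit positions Start..End, wrapping when
// Start > End (hypothetical helper; mirrors the masks isRxSBGMask accepts).
static uint64_t selectMask(unsigned Start, unsigned End) {
  auto Ones = [](unsigned N) { return N >= 64 ? ~0ULL : ~(~0ULL << N); };
  if (Start <= End)
    return Ones(End - Start + 1) << (63 - End);
  return ~(Ones(Start - End - 1) << (64 - Start));
}

// Sketch of RISBG/RISBGN: rotate Src left by Rotate (0..63), then replace
// bits Start..End of Dst with the corresponding bits of the rotated value.
static uint64_t risbgValue(uint64_t Dst, uint64_t Src, unsigned Start,
                           unsigned End, unsigned Rotate) {
  uint64_t Rotated = (Src << Rotate) | (Src >> ((64 - Rotate) & 63));
  uint64_t Mask = selectMask(Start, End);
  return (Rotated & Mask) | (Dst & ~Mask);
}

RISBGN computes exactly the same value; as the patch notes, the only difference is that it leaves CC untouched, which is why convertToThreeAddress prefers it when the condition code is not consumed.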
+ let Predicates = [FeatureHighWord] in { +@@ -1342,6 +1362,60 @@ let Defs = [CC] in { + } + + //===----------------------------------------------------------------------===// ++// Transactional execution ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureTransactionalExecution] in { ++ // Transaction Begin ++ let hasSideEffects = 1, mayStore = 1, ++ usesCustomInserter = 1, Defs = [CC] in { ++ def TBEGIN : InstSIL<0xE560, ++ (outs), (ins bdaddr12only:$BD1, imm32zx16:$I2), ++ "tbegin\t$BD1, $I2", ++ [(z_tbegin bdaddr12only:$BD1, imm32zx16:$I2)]>; ++ def TBEGIN_nofloat : Pseudo<(outs), (ins bdaddr12only:$BD1, imm32zx16:$I2), ++ [(z_tbegin_nofloat bdaddr12only:$BD1, ++ imm32zx16:$I2)]>; ++ def TBEGINC : InstSIL<0xE561, ++ (outs), (ins bdaddr12only:$BD1, imm32zx16:$I2), ++ "tbeginc\t$BD1, $I2", ++ [(int_s390_tbeginc bdaddr12only:$BD1, ++ imm32zx16:$I2)]>; ++ } ++ ++ // Transaction End ++ let hasSideEffects = 1, Defs = [CC], BD2 = 0 in ++ def TEND : InstS<0xB2F8, (outs), (ins), "tend", [(z_tend)]>; ++ ++ // Transaction Abort ++ let hasSideEffects = 1, isTerminator = 1, isBarrier = 1 in ++ def TABORT : InstS<0xB2FC, (outs), (ins bdaddr12only:$BD2), ++ "tabort\t$BD2", ++ [(int_s390_tabort bdaddr12only:$BD2)]>; ++ ++ // Nontransactional Store ++ let hasSideEffects = 1 in ++ def NTSTG : StoreRXY<"ntstg", 0xE325, int_s390_ntstg, GR64, 8>; ++ ++ // Extract Transaction Nesting Depth ++ let hasSideEffects = 1 in ++ def ETND : InherentRRE<"etnd", 0xB2EC, GR32, (int_s390_etnd)>; ++} ++ ++//===----------------------------------------------------------------------===// ++// Processor assist ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureProcessorAssist] in { ++ let hasSideEffects = 1, R4 = 0 in ++ def PPA : InstRRF<0xB2E8, (outs), (ins GR64:$R1, GR64:$R2, imm32zx4:$R3), ++ "ppa\t$R1, $R2, $R3", []>; ++ def : Pat<(int_s390_ppa_txassist GR32:$src), ++ (PPA (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32), ++ 0, 1)>; ++} ++ ++//===----------------------------------------------------------------------===// + // Miscellaneous Instructions. + //===----------------------------------------------------------------------===// + +@@ -1366,6 +1440,13 @@ let Defs = [CC] in { + def : Pat<(ctlz GR64:$src), + (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>; + ++// Population count. Counts bits set per byte. ++let Predicates = [FeaturePopulationCount], Defs = [CC] in { ++ def POPCNT : InstRRE<0xB9E1, (outs GR64:$R1), (ins GR64:$R2), ++ "popcnt\t$R1, $R2", ++ [(set GR64:$R1, (z_popcnt GR64:$R2))]>; ++} ++ + // Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext. + def : Pat<(i64 (anyext GR32:$src)), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>; +Index: llvm-36/lib/Target/SystemZ/SystemZInstrVector.td +=================================================================== +--- /dev/null ++++ llvm-36/lib/Target/SystemZ/SystemZInstrVector.td +@@ -0,0 +1,1097 @@ ++//==- SystemZInstrVector.td - SystemZ Vector instructions ------*- tblgen-*-==// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++//===----------------------------------------------------------------------===// ++ ++//===----------------------------------------------------------------------===// ++// Move instructions ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureVector] in { ++ // Register move. ++ def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>; ++ def VLR32 : UnaryAliasVRR; ++ def VLR64 : UnaryAliasVRR; ++ ++ // Load GR from VR element. ++ def VLGVB : BinaryVRSc<"vlgvb", 0xE721, null_frag, v128b, 0>; ++ def VLGVH : BinaryVRSc<"vlgvh", 0xE721, null_frag, v128h, 1>; ++ def VLGVF : BinaryVRSc<"vlgvf", 0xE721, null_frag, v128f, 2>; ++ def VLGVG : BinaryVRSc<"vlgvg", 0xE721, z_vector_extract, v128g, 3>; ++ ++ // Load VR element from GR. ++ def VLVGB : TernaryVRSb<"vlvgb", 0xE722, z_vector_insert, ++ v128b, v128b, GR32, 0>; ++ def VLVGH : TernaryVRSb<"vlvgh", 0xE722, z_vector_insert, ++ v128h, v128h, GR32, 1>; ++ def VLVGF : TernaryVRSb<"vlvgf", 0xE722, z_vector_insert, ++ v128f, v128f, GR32, 2>; ++ def VLVGG : TernaryVRSb<"vlvgg", 0xE722, z_vector_insert, ++ v128g, v128g, GR64, 3>; ++ ++ // Load VR from GRs disjoint. ++ def VLVGP : BinaryVRRf<"vlvgp", 0xE762, z_join_dwords, v128g>; ++ def VLVGP32 : BinaryAliasVRRf; ++} ++ ++// Extractions always assign to the full GR64, even if the element would ++// fit in the lower 32 bits. Sub-i64 extracts therefore need to take a ++// subreg of the result. ++class VectorExtractSubreg ++ : Pat<(i32 (z_vector_extract (type VR128:$vec), shift12only:$index)), ++ (EXTRACT_SUBREG (insn VR128:$vec, shift12only:$index), subreg_l32)>; ++ ++def : VectorExtractSubreg; ++def : VectorExtractSubreg; ++def : VectorExtractSubreg; ++ ++//===----------------------------------------------------------------------===// ++// Immediate instructions ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureVector] in { ++ // Generate byte mask. ++ def VZERO : InherentVRIa<"vzero", 0xE744, 0>; ++ def VONE : InherentVRIa<"vone", 0xE744, 0xffff>; ++ def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>; ++ ++ // Generate mask. ++ def VGMB : BinaryVRIb<"vgmb", 0xE746, z_rotate_mask, v128b, 0>; ++ def VGMH : BinaryVRIb<"vgmh", 0xE746, z_rotate_mask, v128h, 1>; ++ def VGMF : BinaryVRIb<"vgmf", 0xE746, z_rotate_mask, v128f, 2>; ++ def VGMG : BinaryVRIb<"vgmg", 0xE746, z_rotate_mask, v128g, 3>; ++ ++ // Load element immediate. ++ // ++ // We want these instructions to be used ahead of VLVG* where possible. ++ // However, VLVG* takes a variable BD-format index whereas VLEI takes ++ // a plain immediate index. This means that VLVG* has an extra "base" ++ // register operand and is 3 units more complex. Bumping the complexity ++ // of the VLEI* instructions by 4 means that they are strictly better ++ // than VLVG* in cases where both forms match. ++ let AddedComplexity = 4 in { ++ def VLEIB : TernaryVRIa<"vleib", 0xE740, z_vector_insert, ++ v128b, v128b, imm32sx16trunc, imm32zx4>; ++ def VLEIH : TernaryVRIa<"vleih", 0xE741, z_vector_insert, ++ v128h, v128h, imm32sx16trunc, imm32zx3>; ++ def VLEIF : TernaryVRIa<"vleif", 0xE743, z_vector_insert, ++ v128f, v128f, imm32sx16, imm32zx2>; ++ def VLEIG : TernaryVRIa<"vleig", 0xE742, z_vector_insert, ++ v128g, v128g, imm64sx16, imm32zx1>; ++ } ++ ++ // Replicate immediate. 
++ def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>; ++ def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>; ++ def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>; ++ def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>; ++} ++ ++//===----------------------------------------------------------------------===// ++// Loads ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureVector] in { ++ // Load. ++ def VL : UnaryVRX<"vl", 0xE706, null_frag, v128any, 16>; ++ ++ // Load to block boundary. The number of loaded bytes is only known ++ // at run time. The instruction is really polymorphic, but v128b matches ++ // the return type of the associated intrinsic. ++ def VLBB : BinaryVRX<"vlbb", 0xE707, int_s390_vlbb, v128b, 0>; ++ ++ // Load count to block boundary. ++ let Defs = [CC] in ++ def LCBB : InstRXE<0xE727, (outs GR32:$R1), ++ (ins bdxaddr12only:$XBD2, imm32zx4:$M3), ++ "lcbb\t$R1, $XBD2, $M3", ++ [(set GR32:$R1, (int_s390_lcbb bdxaddr12only:$XBD2, ++ imm32zx4:$M3))]>; ++ ++ // Load with length. The number of loaded bytes is only known at run time. ++ def VLL : BinaryVRSb<"vll", 0xE737, int_s390_vll, 0>; ++ ++ // Load multiple. ++ def VLM : LoadMultipleVRSa<"vlm", 0xE736>; ++ ++ // Load and replicate ++ def VLREPB : UnaryVRX<"vlrepb", 0xE705, z_replicate_loadi8, v128b, 1, 0>; ++ def VLREPH : UnaryVRX<"vlreph", 0xE705, z_replicate_loadi16, v128h, 2, 1>; ++ def VLREPF : UnaryVRX<"vlrepf", 0xE705, z_replicate_loadi32, v128f, 4, 2>; ++ def VLREPG : UnaryVRX<"vlrepg", 0xE705, z_replicate_loadi64, v128g, 8, 3>; ++ def : Pat<(v4f32 (z_replicate_loadf32 bdxaddr12only:$addr)), ++ (VLREPF bdxaddr12only:$addr)>; ++ def : Pat<(v2f64 (z_replicate_loadf64 bdxaddr12only:$addr)), ++ (VLREPG bdxaddr12only:$addr)>; ++ ++ // Use VLREP to load subvectors. These patterns use "12pair" because ++ // LEY and LDY offer full 20-bit displacement fields. It's often better ++ // to use those instructions rather than force a 20-bit displacement ++ // into a GPR temporary. ++ def VL32 : UnaryAliasVRX; ++ def VL64 : UnaryAliasVRX; ++ ++ // Load logical element and zero. ++ def VLLEZB : UnaryVRX<"vllezb", 0xE704, z_vllezi8, v128b, 1, 0>; ++ def VLLEZH : UnaryVRX<"vllezh", 0xE704, z_vllezi16, v128h, 2, 1>; ++ def VLLEZF : UnaryVRX<"vllezf", 0xE704, z_vllezi32, v128f, 4, 2>; ++ def VLLEZG : UnaryVRX<"vllezg", 0xE704, z_vllezi64, v128g, 8, 3>; ++ def : Pat<(v4f32 (z_vllezf32 bdxaddr12only:$addr)), ++ (VLLEZF bdxaddr12only:$addr)>; ++ def : Pat<(v2f64 (z_vllezf64 bdxaddr12only:$addr)), ++ (VLLEZG bdxaddr12only:$addr)>; ++ ++ // Load element. ++ def VLEB : TernaryVRX<"vleb", 0xE700, z_vlei8, v128b, v128b, 1, imm32zx4>; ++ def VLEH : TernaryVRX<"vleh", 0xE701, z_vlei16, v128h, v128h, 2, imm32zx3>; ++ def VLEF : TernaryVRX<"vlef", 0xE703, z_vlei32, v128f, v128f, 4, imm32zx2>; ++ def VLEG : TernaryVRX<"vleg", 0xE702, z_vlei64, v128g, v128g, 8, imm32zx1>; ++ def : Pat<(z_vlef32 (v4f32 VR128:$val), bdxaddr12only:$addr, imm32zx2:$index), ++ (VLEF VR128:$val, bdxaddr12only:$addr, imm32zx2:$index)>; ++ def : Pat<(z_vlef64 (v2f64 VR128:$val), bdxaddr12only:$addr, imm32zx1:$index), ++ (VLEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>; ++ ++ // Gather element. ++ def VGEF : TernaryVRV<"vgef", 0xE713, 4, imm32zx2>; ++ def VGEG : TernaryVRV<"vgeg", 0xE712, 8, imm32zx1>; ++} ++ ++// Use replicating loads if we're inserting a single element into an ++// undefined vector. 
This avoids a false dependency on the previous ++// register contents. ++multiclass ReplicatePeephole { ++ def : Pat<(vectype (z_vector_insert ++ (undef), (scalartype (load bdxaddr12only:$addr)), 0)), ++ (vlrep bdxaddr12only:$addr)>; ++ def : Pat<(vectype (scalar_to_vector ++ (scalartype (load bdxaddr12only:$addr)))), ++ (vlrep bdxaddr12only:$addr)>; ++} ++defm : ReplicatePeephole; ++defm : ReplicatePeephole; ++defm : ReplicatePeephole; ++defm : ReplicatePeephole; ++defm : ReplicatePeephole; ++defm : ReplicatePeephole; ++ ++//===----------------------------------------------------------------------===// ++// Stores ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureVector] in { ++ // Store. ++ def VST : StoreVRX<"vst", 0xE70E, null_frag, v128any, 16>; ++ ++ // Store with length. The number of stored bytes is only known at run time. ++ def VSTL : StoreLengthVRSb<"vstl", 0xE73F, int_s390_vstl, 0>; ++ ++ // Store multiple. ++ def VSTM : StoreMultipleVRSa<"vstm", 0xE73E>; ++ ++ // Store element. ++ def VSTEB : StoreBinaryVRX<"vsteb", 0xE708, z_vstei8, v128b, 1, imm32zx4>; ++ def VSTEH : StoreBinaryVRX<"vsteh", 0xE709, z_vstei16, v128h, 2, imm32zx3>; ++ def VSTEF : StoreBinaryVRX<"vstef", 0xE70B, z_vstei32, v128f, 4, imm32zx2>; ++ def VSTEG : StoreBinaryVRX<"vsteg", 0xE70A, z_vstei64, v128g, 8, imm32zx1>; ++ def : Pat<(z_vstef32 (v4f32 VR128:$val), bdxaddr12only:$addr, ++ imm32zx2:$index), ++ (VSTEF VR128:$val, bdxaddr12only:$addr, imm32zx2:$index)>; ++ def : Pat<(z_vstef64 (v2f64 VR128:$val), bdxaddr12only:$addr, ++ imm32zx1:$index), ++ (VSTEG VR128:$val, bdxaddr12only:$addr, imm32zx1:$index)>; ++ ++ // Use VSTE to store subvectors. These patterns use "12pair" because ++ // STEY and STDY offer full 20-bit displacement fields. It's often better ++ // to use those instructions rather than force a 20-bit displacement ++ // into a GPR temporary. ++ def VST32 : StoreAliasVRX; ++ def VST64 : StoreAliasVRX; ++ ++ // Scatter element. ++ def VSCEF : StoreBinaryVRV<"vscef", 0xE71B, 4, imm32zx2>; ++ def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>; ++} ++ ++//===----------------------------------------------------------------------===// ++// Selects and permutes ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureVector] in { ++ // Merge high. ++ def VMRHB : BinaryVRRc<"vmrhb", 0xE761, z_merge_high, v128b, v128b, 0>; ++ def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>; ++ def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>; ++ def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>; ++ def : BinaryRRWithType; ++ def : BinaryRRWithType; ++ ++ // Merge low. ++ def VMRLB : BinaryVRRc<"vmrlb", 0xE760, z_merge_low, v128b, v128b, 0>; ++ def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>; ++ def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>; ++ def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>; ++ def : BinaryRRWithType; ++ def : BinaryRRWithType; ++ ++ // Permute. ++ def VPERM : TernaryVRRe<"vperm", 0xE78C, z_permute, v128b, v128b>; ++ ++ // Permute doubleword immediate. ++ def VPDI : TernaryVRRc<"vpdi", 0xE784, z_permute_dwords, v128g, v128g>; ++ ++ // Replicate. 
++ def VREPB : BinaryVRIc<"vrepb", 0xE74D, z_splat, v128b, v128b, 0>; ++ def VREPH : BinaryVRIc<"vreph", 0xE74D, z_splat, v128h, v128h, 1>; ++ def VREPF : BinaryVRIc<"vrepf", 0xE74D, z_splat, v128f, v128f, 2>; ++ def VREPG : BinaryVRIc<"vrepg", 0xE74D, z_splat, v128g, v128g, 3>; ++ def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16:$index)), ++ (VREPF VR128:$vec, imm32zx16:$index)>; ++ def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16:$index)), ++ (VREPG VR128:$vec, imm32zx16:$index)>; ++ ++ // Select. ++ def VSEL : TernaryVRRe<"vsel", 0xE78D, null_frag, v128any, v128any>; ++} ++ ++//===----------------------------------------------------------------------===// ++// Widening and narrowing ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureVector] in { ++ // Pack ++ def VPKH : BinaryVRRc<"vpkh", 0xE794, z_pack, v128b, v128h, 1>; ++ def VPKF : BinaryVRRc<"vpkf", 0xE794, z_pack, v128h, v128f, 2>; ++ def VPKG : BinaryVRRc<"vpkg", 0xE794, z_pack, v128f, v128g, 3>; ++ ++ // Pack saturate. ++ defm VPKSH : BinaryVRRbSPair<"vpksh", 0xE797, int_s390_vpksh, z_packs_cc, ++ v128b, v128h, 1>; ++ defm VPKSF : BinaryVRRbSPair<"vpksf", 0xE797, int_s390_vpksf, z_packs_cc, ++ v128h, v128f, 2>; ++ defm VPKSG : BinaryVRRbSPair<"vpksg", 0xE797, int_s390_vpksg, z_packs_cc, ++ v128f, v128g, 3>; ++ ++ // Pack saturate logical. ++ defm VPKLSH : BinaryVRRbSPair<"vpklsh", 0xE795, int_s390_vpklsh, z_packls_cc, ++ v128b, v128h, 1>; ++ defm VPKLSF : BinaryVRRbSPair<"vpklsf", 0xE795, int_s390_vpklsf, z_packls_cc, ++ v128h, v128f, 2>; ++ defm VPKLSG : BinaryVRRbSPair<"vpklsg", 0xE795, int_s390_vpklsg, z_packls_cc, ++ v128f, v128g, 3>; ++ ++ // Sign-extend to doubleword. ++ def VSEGB : UnaryVRRa<"vsegb", 0xE75F, z_vsei8, v128g, v128g, 0>; ++ def VSEGH : UnaryVRRa<"vsegh", 0xE75F, z_vsei16, v128g, v128g, 1>; ++ def VSEGF : UnaryVRRa<"vsegf", 0xE75F, z_vsei32, v128g, v128g, 2>; ++ def : Pat<(z_vsei8_by_parts (v16i8 VR128:$src)), (VSEGB VR128:$src)>; ++ def : Pat<(z_vsei16_by_parts (v8i16 VR128:$src)), (VSEGH VR128:$src)>; ++ def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>; ++ ++ // Unpack high. ++ def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, z_unpack_high, v128h, v128b, 0>; ++ def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, z_unpack_high, v128f, v128h, 1>; ++ def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, z_unpack_high, v128g, v128f, 2>; ++ ++ // Unpack logical high. ++ def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, z_unpackl_high, v128h, v128b, 0>; ++ def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, z_unpackl_high, v128f, v128h, 1>; ++ def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, z_unpackl_high, v128g, v128f, 2>; ++ ++ // Unpack low. ++ def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, z_unpack_low, v128h, v128b, 0>; ++ def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, z_unpack_low, v128f, v128h, 1>; ++ def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, z_unpack_low, v128g, v128f, 2>; ++ ++ // Unpack logical low. ++ def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, z_unpackl_low, v128h, v128b, 0>; ++ def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, z_unpackl_low, v128f, v128h, 1>; ++ def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, z_unpackl_low, v128g, v128f, 2>; ++} ++ ++//===----------------------------------------------------------------------===// ++// Instantiating generic operations for specific types. 
++//===----------------------------------------------------------------------===// ++ ++multiclass GenericVectorOps { ++ let Predicates = [FeatureVector] in { ++ def : Pat<(type (load bdxaddr12only:$addr)), ++ (VL bdxaddr12only:$addr)>; ++ def : Pat<(store (type VR128:$src), bdxaddr12only:$addr), ++ (VST VR128:$src, bdxaddr12only:$addr)>; ++ def : Pat<(type (vselect (inttype VR128:$x), VR128:$y, VR128:$z)), ++ (VSEL VR128:$y, VR128:$z, VR128:$x)>; ++ def : Pat<(type (vselect (inttype (z_vnot VR128:$x)), VR128:$y, VR128:$z)), ++ (VSEL VR128:$z, VR128:$y, VR128:$x)>; ++ } ++} ++ ++defm : GenericVectorOps; ++defm : GenericVectorOps; ++defm : GenericVectorOps; ++defm : GenericVectorOps; ++defm : GenericVectorOps; ++defm : GenericVectorOps; ++ ++//===----------------------------------------------------------------------===// ++// Integer arithmetic ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureVector] in { ++ // Add. ++ def VAB : BinaryVRRc<"vab", 0xE7F3, add, v128b, v128b, 0>; ++ def VAH : BinaryVRRc<"vah", 0xE7F3, add, v128h, v128h, 1>; ++ def VAF : BinaryVRRc<"vaf", 0xE7F3, add, v128f, v128f, 2>; ++ def VAG : BinaryVRRc<"vag", 0xE7F3, add, v128g, v128g, 3>; ++ def VAQ : BinaryVRRc<"vaq", 0xE7F3, int_s390_vaq, v128q, v128q, 4>; ++ ++ // Add compute carry. ++ def VACCB : BinaryVRRc<"vaccb", 0xE7F1, int_s390_vaccb, v128b, v128b, 0>; ++ def VACCH : BinaryVRRc<"vacch", 0xE7F1, int_s390_vacch, v128h, v128h, 1>; ++ def VACCF : BinaryVRRc<"vaccf", 0xE7F1, int_s390_vaccf, v128f, v128f, 2>; ++ def VACCG : BinaryVRRc<"vaccg", 0xE7F1, int_s390_vaccg, v128g, v128g, 3>; ++ def VACCQ : BinaryVRRc<"vaccq", 0xE7F1, int_s390_vaccq, v128q, v128q, 4>; ++ ++ // Add with carry. ++ def VACQ : TernaryVRRd<"vacq", 0xE7BB, int_s390_vacq, v128q, v128q, 4>; ++ ++ // Add with carry compute carry. ++ def VACCCQ : TernaryVRRd<"vacccq", 0xE7B9, int_s390_vacccq, v128q, v128q, 4>; ++ ++ // And. ++ def VN : BinaryVRRc<"vn", 0xE768, null_frag, v128any, v128any>; ++ ++ // And with complement. ++ def VNC : BinaryVRRc<"vnc", 0xE769, null_frag, v128any, v128any>; ++ ++ // Average. ++ def VAVGB : BinaryVRRc<"vavgb", 0xE7F2, int_s390_vavgb, v128b, v128b, 0>; ++ def VAVGH : BinaryVRRc<"vavgh", 0xE7F2, int_s390_vavgh, v128h, v128h, 1>; ++ def VAVGF : BinaryVRRc<"vavgf", 0xE7F2, int_s390_vavgf, v128f, v128f, 2>; ++ def VAVGG : BinaryVRRc<"vavgg", 0xE7F2, int_s390_vavgg, v128g, v128g, 3>; ++ ++ // Average logical. ++ def VAVGLB : BinaryVRRc<"vavglb", 0xE7F0, int_s390_vavglb, v128b, v128b, 0>; ++ def VAVGLH : BinaryVRRc<"vavglh", 0xE7F0, int_s390_vavglh, v128h, v128h, 1>; ++ def VAVGLF : BinaryVRRc<"vavglf", 0xE7F0, int_s390_vavglf, v128f, v128f, 2>; ++ def VAVGLG : BinaryVRRc<"vavglg", 0xE7F0, int_s390_vavglg, v128g, v128g, 3>; ++ ++ // Checksum. ++ def VCKSM : BinaryVRRc<"vcksm", 0xE766, int_s390_vcksm, v128f, v128f>; ++ ++ // Count leading zeros. ++ def VCLZB : UnaryVRRa<"vclzb", 0xE753, ctlz, v128b, v128b, 0>; ++ def VCLZH : UnaryVRRa<"vclzh", 0xE753, ctlz, v128h, v128h, 1>; ++ def VCLZF : UnaryVRRa<"vclzf", 0xE753, ctlz, v128f, v128f, 2>; ++ def VCLZG : UnaryVRRa<"vclzg", 0xE753, ctlz, v128g, v128g, 3>; ++ ++ // Count trailing zeros. ++ def VCTZB : UnaryVRRa<"vctzb", 0xE752, cttz, v128b, v128b, 0>; ++ def VCTZH : UnaryVRRa<"vctzh", 0xE752, cttz, v128h, v128h, 1>; ++ def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>; ++ def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>; ++ ++ // Exclusive or. 
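All the vselect patterns in GenericVectorOps above funnel into VSEL, which is a purely bitwise select; the element-wise behaviour falls out because the vector comparisons produce all-ones or all-zeros per element. A one-lane C++ model of the operand order used by (VSEL y, z, mask), shown before the exclusive-or definition that the comment above introduces (helper name made up):

#include <cstdint>

// Bitwise select: bits of Y where Mask is 1, bits of Z where Mask is 0.
static uint64_t vselLane(uint64_t Y, uint64_t Z, uint64_t Mask) {
  return (Y & Mask) | (Z & ~Mask);
}

This is also why the second GenericVectorOps pattern can fold a complemented mask simply by swapping the first two operands.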
++ def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>; ++ ++ // Galois field multiply sum. ++ def VGFMB : BinaryVRRc<"vgfmb", 0xE7B4, int_s390_vgfmb, v128h, v128b, 0>; ++ def VGFMH : BinaryVRRc<"vgfmh", 0xE7B4, int_s390_vgfmh, v128f, v128h, 1>; ++ def VGFMF : BinaryVRRc<"vgfmf", 0xE7B4, int_s390_vgfmf, v128g, v128f, 2>; ++ def VGFMG : BinaryVRRc<"vgfmg", 0xE7B4, int_s390_vgfmg, v128q, v128g, 3>; ++ ++ // Galois field multiply sum and accumulate. ++ def VGFMAB : TernaryVRRd<"vgfmab", 0xE7BC, int_s390_vgfmab, v128h, v128b, 0>; ++ def VGFMAH : TernaryVRRd<"vgfmah", 0xE7BC, int_s390_vgfmah, v128f, v128h, 1>; ++ def VGFMAF : TernaryVRRd<"vgfmaf", 0xE7BC, int_s390_vgfmaf, v128g, v128f, 2>; ++ def VGFMAG : TernaryVRRd<"vgfmag", 0xE7BC, int_s390_vgfmag, v128q, v128g, 3>; ++ ++ // Load complement. ++ def VLCB : UnaryVRRa<"vlcb", 0xE7DE, z_vneg, v128b, v128b, 0>; ++ def VLCH : UnaryVRRa<"vlch", 0xE7DE, z_vneg, v128h, v128h, 1>; ++ def VLCF : UnaryVRRa<"vlcf", 0xE7DE, z_vneg, v128f, v128f, 2>; ++ def VLCG : UnaryVRRa<"vlcg", 0xE7DE, z_vneg, v128g, v128g, 3>; ++ ++ // Load positive. ++ def VLPB : UnaryVRRa<"vlpb", 0xE7DF, z_viabs8, v128b, v128b, 0>; ++ def VLPH : UnaryVRRa<"vlph", 0xE7DF, z_viabs16, v128h, v128h, 1>; ++ def VLPF : UnaryVRRa<"vlpf", 0xE7DF, z_viabs32, v128f, v128f, 2>; ++ def VLPG : UnaryVRRa<"vlpg", 0xE7DF, z_viabs64, v128g, v128g, 3>; ++ ++ // Maximum. ++ def VMXB : BinaryVRRc<"vmxb", 0xE7FF, null_frag, v128b, v128b, 0>; ++ def VMXH : BinaryVRRc<"vmxh", 0xE7FF, null_frag, v128h, v128h, 1>; ++ def VMXF : BinaryVRRc<"vmxf", 0xE7FF, null_frag, v128f, v128f, 2>; ++ def VMXG : BinaryVRRc<"vmxg", 0xE7FF, null_frag, v128g, v128g, 3>; ++ ++ // Maximum logical. ++ def VMXLB : BinaryVRRc<"vmxlb", 0xE7FD, null_frag, v128b, v128b, 0>; ++ def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, null_frag, v128h, v128h, 1>; ++ def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, null_frag, v128f, v128f, 2>; ++ def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, null_frag, v128g, v128g, 3>; ++ ++ // Minimum. ++ def VMNB : BinaryVRRc<"vmnb", 0xE7FE, null_frag, v128b, v128b, 0>; ++ def VMNH : BinaryVRRc<"vmnh", 0xE7FE, null_frag, v128h, v128h, 1>; ++ def VMNF : BinaryVRRc<"vmnf", 0xE7FE, null_frag, v128f, v128f, 2>; ++ def VMNG : BinaryVRRc<"vmng", 0xE7FE, null_frag, v128g, v128g, 3>; ++ ++ // Minimum logical. ++ def VMNLB : BinaryVRRc<"vmnlb", 0xE7FC, null_frag, v128b, v128b, 0>; ++ def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, null_frag, v128h, v128h, 1>; ++ def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, null_frag, v128f, v128f, 2>; ++ def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, null_frag, v128g, v128g, 3>; ++ ++ // Multiply and add low. ++ def VMALB : TernaryVRRd<"vmalb", 0xE7AA, z_muladd, v128b, v128b, 0>; ++ def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, z_muladd, v128h, v128h, 1>; ++ def VMALF : TernaryVRRd<"vmalf", 0xE7AA, z_muladd, v128f, v128f, 2>; ++ ++ // Multiply and add high. ++ def VMAHB : TernaryVRRd<"vmahb", 0xE7AB, int_s390_vmahb, v128b, v128b, 0>; ++ def VMAHH : TernaryVRRd<"vmahh", 0xE7AB, int_s390_vmahh, v128h, v128h, 1>; ++ def VMAHF : TernaryVRRd<"vmahf", 0xE7AB, int_s390_vmahf, v128f, v128f, 2>; ++ ++ // Multiply and add logical high. ++ def VMALHB : TernaryVRRd<"vmalhb", 0xE7A9, int_s390_vmalhb, v128b, v128b, 0>; ++ def VMALHH : TernaryVRRd<"vmalhh", 0xE7A9, int_s390_vmalhh, v128h, v128h, 1>; ++ def VMALHF : TernaryVRRd<"vmalhf", 0xE7A9, int_s390_vmalhf, v128f, v128f, 2>; ++ ++ // Multiply and add even. 
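The "even" and "odd" multiply families that begin below are widening: result element i is the double-width product of input elements 2i (even forms) or 2i+1 (odd forms), counting elements from the left. A hedged C++ model of the VMEF case, v4i32 inputs to a v2i64 result (function name invented):

#include <array>
#include <cstdint>

// Multiply even fullword: sign-extend lanes 0 and 2 of each input and
// multiply them into the two 64-bit result lanes.
static std::array<int64_t, 2> vmefModel(const std::array<int32_t, 4> &A,
                                        const std::array<int32_t, 4> &B) {
  return {int64_t(A[0]) * int64_t(B[0]), int64_t(A[2]) * int64_t(B[2])};
}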
++ def VMAEB : TernaryVRRd<"vmaeb", 0xE7AE, int_s390_vmaeb, v128h, v128b, 0>; ++ def VMAEH : TernaryVRRd<"vmaeh", 0xE7AE, int_s390_vmaeh, v128f, v128h, 1>; ++ def VMAEF : TernaryVRRd<"vmaef", 0xE7AE, int_s390_vmaef, v128g, v128f, 2>; ++ ++ // Multiply and add logical even. ++ def VMALEB : TernaryVRRd<"vmaleb", 0xE7AC, int_s390_vmaleb, v128h, v128b, 0>; ++ def VMALEH : TernaryVRRd<"vmaleh", 0xE7AC, int_s390_vmaleh, v128f, v128h, 1>; ++ def VMALEF : TernaryVRRd<"vmalef", 0xE7AC, int_s390_vmalef, v128g, v128f, 2>; ++ ++ // Multiply and add odd. ++ def VMAOB : TernaryVRRd<"vmaob", 0xE7AF, int_s390_vmaob, v128h, v128b, 0>; ++ def VMAOH : TernaryVRRd<"vmaoh", 0xE7AF, int_s390_vmaoh, v128f, v128h, 1>; ++ def VMAOF : TernaryVRRd<"vmaof", 0xE7AF, int_s390_vmaof, v128g, v128f, 2>; ++ ++ // Multiply and add logical odd. ++ def VMALOB : TernaryVRRd<"vmalob", 0xE7AD, int_s390_vmalob, v128h, v128b, 0>; ++ def VMALOH : TernaryVRRd<"vmaloh", 0xE7AD, int_s390_vmaloh, v128f, v128h, 1>; ++ def VMALOF : TernaryVRRd<"vmalof", 0xE7AD, int_s390_vmalof, v128g, v128f, 2>; ++ ++ // Multiply high. ++ def VMHB : BinaryVRRc<"vmhb", 0xE7A3, int_s390_vmhb, v128b, v128b, 0>; ++ def VMHH : BinaryVRRc<"vmhh", 0xE7A3, int_s390_vmhh, v128h, v128h, 1>; ++ def VMHF : BinaryVRRc<"vmhf", 0xE7A3, int_s390_vmhf, v128f, v128f, 2>; ++ ++ // Multiply logical high. ++ def VMLHB : BinaryVRRc<"vmlhb", 0xE7A1, int_s390_vmlhb, v128b, v128b, 0>; ++ def VMLHH : BinaryVRRc<"vmlhh", 0xE7A1, int_s390_vmlhh, v128h, v128h, 1>; ++ def VMLHF : BinaryVRRc<"vmlhf", 0xE7A1, int_s390_vmlhf, v128f, v128f, 2>; ++ ++ // Multiply low. ++ def VMLB : BinaryVRRc<"vmlb", 0xE7A2, mul, v128b, v128b, 0>; ++ def VMLHW : BinaryVRRc<"vmlhw", 0xE7A2, mul, v128h, v128h, 1>; ++ def VMLF : BinaryVRRc<"vmlf", 0xE7A2, mul, v128f, v128f, 2>; ++ ++ // Multiply even. ++ def VMEB : BinaryVRRc<"vmeb", 0xE7A6, int_s390_vmeb, v128h, v128b, 0>; ++ def VMEH : BinaryVRRc<"vmeh", 0xE7A6, int_s390_vmeh, v128f, v128h, 1>; ++ def VMEF : BinaryVRRc<"vmef", 0xE7A6, int_s390_vmef, v128g, v128f, 2>; ++ ++ // Multiply logical even. ++ def VMLEB : BinaryVRRc<"vmleb", 0xE7A4, int_s390_vmleb, v128h, v128b, 0>; ++ def VMLEH : BinaryVRRc<"vmleh", 0xE7A4, int_s390_vmleh, v128f, v128h, 1>; ++ def VMLEF : BinaryVRRc<"vmlef", 0xE7A4, int_s390_vmlef, v128g, v128f, 2>; ++ ++ // Multiply odd. ++ def VMOB : BinaryVRRc<"vmob", 0xE7A7, int_s390_vmob, v128h, v128b, 0>; ++ def VMOH : BinaryVRRc<"vmoh", 0xE7A7, int_s390_vmoh, v128f, v128h, 1>; ++ def VMOF : BinaryVRRc<"vmof", 0xE7A7, int_s390_vmof, v128g, v128f, 2>; ++ ++ // Multiply logical odd. ++ def VMLOB : BinaryVRRc<"vmlob", 0xE7A5, int_s390_vmlob, v128h, v128b, 0>; ++ def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>; ++ def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>; ++ ++ // Nor. ++ def VNO : BinaryVRRc<"vno", 0xE76B, null_frag, v128any, v128any>; ++ ++ // Or. ++ def VO : BinaryVRRc<"vo", 0xE76A, null_frag, v128any, v128any>; ++ ++ // Population count. ++ def VPOPCT : BinaryVRRa<"vpopct", 0xE750>; ++ def : Pat<(v16i8 (z_popcnt VR128:$x)), (VPOPCT VR128:$x, 0)>; ++ ++ // Element rotate left logical (with vector shift amount). 
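VPOPCT above is defined only with an M3 operand of 0, counting bits within each byte, which is why the lone z_popcnt pattern is for v16i8. A per-lane C++ sketch (assumed helper), placed before the element-rotate definitions that follow:

#include <cstdint>

// Population count of a single byte lane, as VPOPCT produces 16 times over.
static uint8_t popcntByte(uint8_t V) {
  uint8_t N = 0;
  for (; V; V &= V - 1) // clear the lowest set bit each iteration
    ++N;
  return N;
}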
++ def VERLLVB : BinaryVRRc<"verllvb", 0xE773, int_s390_verllvb, ++ v128b, v128b, 0>; ++ def VERLLVH : BinaryVRRc<"verllvh", 0xE773, int_s390_verllvh, ++ v128h, v128h, 1>; ++ def VERLLVF : BinaryVRRc<"verllvf", 0xE773, int_s390_verllvf, ++ v128f, v128f, 2>; ++ def VERLLVG : BinaryVRRc<"verllvg", 0xE773, int_s390_verllvg, ++ v128g, v128g, 3>; ++ ++ // Element rotate left logical (with scalar shift amount). ++ def VERLLB : BinaryVRSa<"verllb", 0xE733, int_s390_verllb, v128b, v128b, 0>; ++ def VERLLH : BinaryVRSa<"verllh", 0xE733, int_s390_verllh, v128h, v128h, 1>; ++ def VERLLF : BinaryVRSa<"verllf", 0xE733, int_s390_verllf, v128f, v128f, 2>; ++ def VERLLG : BinaryVRSa<"verllg", 0xE733, int_s390_verllg, v128g, v128g, 3>; ++ ++ // Element rotate and insert under mask. ++ def VERIMB : QuaternaryVRId<"verimb", 0xE772, int_s390_verimb, v128b, v128b, 0>; ++ def VERIMH : QuaternaryVRId<"verimh", 0xE772, int_s390_verimh, v128h, v128h, 1>; ++ def VERIMF : QuaternaryVRId<"verimf", 0xE772, int_s390_verimf, v128f, v128f, 2>; ++ def VERIMG : QuaternaryVRId<"verimg", 0xE772, int_s390_verimg, v128g, v128g, 3>; ++ ++ // Element shift left (with vector shift amount). ++ def VESLVB : BinaryVRRc<"veslvb", 0xE770, z_vshl, v128b, v128b, 0>; ++ def VESLVH : BinaryVRRc<"veslvh", 0xE770, z_vshl, v128h, v128h, 1>; ++ def VESLVF : BinaryVRRc<"veslvf", 0xE770, z_vshl, v128f, v128f, 2>; ++ def VESLVG : BinaryVRRc<"veslvg", 0xE770, z_vshl, v128g, v128g, 3>; ++ ++ // Element shift left (with scalar shift amount). ++ def VESLB : BinaryVRSa<"veslb", 0xE730, z_vshl_by_scalar, v128b, v128b, 0>; ++ def VESLH : BinaryVRSa<"veslh", 0xE730, z_vshl_by_scalar, v128h, v128h, 1>; ++ def VESLF : BinaryVRSa<"veslf", 0xE730, z_vshl_by_scalar, v128f, v128f, 2>; ++ def VESLG : BinaryVRSa<"veslg", 0xE730, z_vshl_by_scalar, v128g, v128g, 3>; ++ ++ // Element shift right arithmetic (with vector shift amount). ++ def VESRAVB : BinaryVRRc<"vesravb", 0xE77A, z_vsra, v128b, v128b, 0>; ++ def VESRAVH : BinaryVRRc<"vesravh", 0xE77A, z_vsra, v128h, v128h, 1>; ++ def VESRAVF : BinaryVRRc<"vesravf", 0xE77A, z_vsra, v128f, v128f, 2>; ++ def VESRAVG : BinaryVRRc<"vesravg", 0xE77A, z_vsra, v128g, v128g, 3>; ++ ++ // Element shift right arithmetic (with scalar shift amount). ++ def VESRAB : BinaryVRSa<"vesrab", 0xE73A, z_vsra_by_scalar, v128b, v128b, 0>; ++ def VESRAH : BinaryVRSa<"vesrah", 0xE73A, z_vsra_by_scalar, v128h, v128h, 1>; ++ def VESRAF : BinaryVRSa<"vesraf", 0xE73A, z_vsra_by_scalar, v128f, v128f, 2>; ++ def VESRAG : BinaryVRSa<"vesrag", 0xE73A, z_vsra_by_scalar, v128g, v128g, 3>; ++ ++ // Element shift right logical (with vector shift amount). ++ def VESRLVB : BinaryVRRc<"vesrlvb", 0xE778, z_vsrl, v128b, v128b, 0>; ++ def VESRLVH : BinaryVRRc<"vesrlvh", 0xE778, z_vsrl, v128h, v128h, 1>; ++ def VESRLVF : BinaryVRRc<"vesrlvf", 0xE778, z_vsrl, v128f, v128f, 2>; ++ def VESRLVG : BinaryVRRc<"vesrlvg", 0xE778, z_vsrl, v128g, v128g, 3>; ++ ++ // Element shift right logical (with scalar shift amount). ++ def VESRLB : BinaryVRSa<"vesrlb", 0xE738, z_vsrl_by_scalar, v128b, v128b, 0>; ++ def VESRLH : BinaryVRSa<"vesrlh", 0xE738, z_vsrl_by_scalar, v128h, v128h, 1>; ++ def VESRLF : BinaryVRSa<"vesrlf", 0xE738, z_vsrl_by_scalar, v128f, v128f, 2>; ++ def VESRLG : BinaryVRSa<"vesrlg", 0xE738, z_vsrl_by_scalar, v128g, v128g, 3>; ++ ++ // Shift left. ++ def VSL : BinaryVRRc<"vsl", 0xE774, int_s390_vsl, v128b, v128b>; ++ ++ // Shift left by byte. ++ def VSLB : BinaryVRRc<"vslb", 0xE775, int_s390_vslb, v128b, v128b>; ++ ++ // Shift left double by byte. 
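The shift-left-double definition that follows treats its two inputs as one 32-byte value and extracts a 16-byte window; a C++ sketch of the semantics assumed here (names invented, Shift in 0..15):

#include <array>
#include <cstdint>

// VSLDB model: bytes Shift..Shift+15 of the concatenation A:B.
static std::array<uint8_t, 16> vsldbModel(const std::array<uint8_t, 16> &A,
                                          const std::array<uint8_t, 16> &B,
                                          unsigned Shift) {
  std::array<uint8_t, 16> R{};
  for (unsigned I = 0; I != 16; ++I)
    R[I] = Shift + I < 16 ? A[Shift + I] : B[Shift + I - 16];
  return R;
}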
++ def VSLDB : TernaryVRId<"vsldb", 0xE777, z_shl_double, v128b, v128b, 0>; ++ def : Pat<(int_s390_vsldb VR128:$x, VR128:$y, imm32zx8:$z), ++ (VSLDB VR128:$x, VR128:$y, imm32zx8:$z)>; ++ ++ // Shift right arithmetic. ++ def VSRA : BinaryVRRc<"vsra", 0xE77E, int_s390_vsra, v128b, v128b>; ++ ++ // Shift right arithmetic by byte. ++ def VSRAB : BinaryVRRc<"vsrab", 0xE77F, int_s390_vsrab, v128b, v128b>; ++ ++ // Shift right logical. ++ def VSRL : BinaryVRRc<"vsrl", 0xE77C, int_s390_vsrl, v128b, v128b>; ++ ++ // Shift right logical by byte. ++ def VSRLB : BinaryVRRc<"vsrlb", 0xE77D, int_s390_vsrlb, v128b, v128b>; ++ ++ // Subtract. ++ def VSB : BinaryVRRc<"vsb", 0xE7F7, sub, v128b, v128b, 0>; ++ def VSH : BinaryVRRc<"vsh", 0xE7F7, sub, v128h, v128h, 1>; ++ def VSF : BinaryVRRc<"vsf", 0xE7F7, sub, v128f, v128f, 2>; ++ def VSG : BinaryVRRc<"vsg", 0xE7F7, sub, v128g, v128g, 3>; ++ def VSQ : BinaryVRRc<"vsq", 0xE7F7, int_s390_vsq, v128q, v128q, 4>; ++ ++ // Subtract compute borrow indication. ++ def VSCBIB : BinaryVRRc<"vscbib", 0xE7F5, int_s390_vscbib, v128b, v128b, 0>; ++ def VSCBIH : BinaryVRRc<"vscbih", 0xE7F5, int_s390_vscbih, v128h, v128h, 1>; ++ def VSCBIF : BinaryVRRc<"vscbif", 0xE7F5, int_s390_vscbif, v128f, v128f, 2>; ++ def VSCBIG : BinaryVRRc<"vscbig", 0xE7F5, int_s390_vscbig, v128g, v128g, 3>; ++ def VSCBIQ : BinaryVRRc<"vscbiq", 0xE7F5, int_s390_vscbiq, v128q, v128q, 4>; ++ ++ // Subtract with borrow indication. ++ def VSBIQ : TernaryVRRd<"vsbiq", 0xE7BF, int_s390_vsbiq, v128q, v128q, 4>; ++ ++ // Subtract with borrow compute borrow indication. ++ def VSBCBIQ : TernaryVRRd<"vsbcbiq", 0xE7BD, int_s390_vsbcbiq, ++ v128q, v128q, 4>; ++ ++ // Sum across doubleword. ++ def VSUMGH : BinaryVRRc<"vsumgh", 0xE765, z_vsum, v128g, v128h, 1>; ++ def VSUMGF : BinaryVRRc<"vsumgf", 0xE765, z_vsum, v128g, v128f, 2>; ++ ++ // Sum across quadword. ++ def VSUMQF : BinaryVRRc<"vsumqf", 0xE767, z_vsum, v128q, v128f, 2>; ++ def VSUMQG : BinaryVRRc<"vsumqg", 0xE767, z_vsum, v128q, v128g, 3>; ++ ++ // Sum across word. ++ def VSUMB : BinaryVRRc<"vsumb", 0xE764, z_vsum, v128f, v128b, 0>; ++ def VSUMH : BinaryVRRc<"vsumh", 0xE764, z_vsum, v128f, v128h, 1>; ++} ++ ++// Instantiate the bitwise ops for type TYPE. ++multiclass BitwiseVectorOps { ++ let Predicates = [FeatureVector] in { ++ def : Pat<(type (and VR128:$x, VR128:$y)), (VN VR128:$x, VR128:$y)>; ++ def : Pat<(type (and VR128:$x, (z_vnot VR128:$y))), ++ (VNC VR128:$x, VR128:$y)>; ++ def : Pat<(type (or VR128:$x, VR128:$y)), (VO VR128:$x, VR128:$y)>; ++ def : Pat<(type (xor VR128:$x, VR128:$y)), (VX VR128:$x, VR128:$y)>; ++ def : Pat<(type (or (and VR128:$x, VR128:$z), ++ (and VR128:$y, (z_vnot VR128:$z)))), ++ (VSEL VR128:$x, VR128:$y, VR128:$z)>; ++ def : Pat<(type (z_vnot (or VR128:$x, VR128:$y))), ++ (VNO VR128:$x, VR128:$y)>; ++ def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>; ++ } ++} ++ ++defm : BitwiseVectorOps; ++defm : BitwiseVectorOps; ++defm : BitwiseVectorOps; ++defm : BitwiseVectorOps; ++ ++// Instantiate additional patterns for absolute-related expressions on ++// type TYPE. LC is the negate instruction for TYPE and LP is the absolute ++// instruction. 
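The multiclass below matches, among other forms, the classic branch-free absolute-value idiom built from an arithmetic shift. A scalar C++ sketch of that idiom (hypothetical helper; lane arithmetic wraps on INT_MIN, so unsigned math keeps the model well defined):

#include <cstdint>

// SignMask is all ones when X < 0 and all zeros otherwise, so the OR picks
// either -X or X, the same shape as the (or (and (z_vsra_by_scalar ...),
// ...) ...) patterns matched below.
static int32_t absIdiom(int32_t X) {
  uint32_t U = static_cast<uint32_t>(X);
  uint32_t SignMask = static_cast<uint32_t>(X >> 31); // arithmetic shift
  return static_cast<int32_t>((SignMask & (0u - U)) | (~SignMask & U));
}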
++multiclass IntegerAbsoluteVectorOps { ++ let Predicates = [FeatureVector] in { ++ def : Pat<(type (vselect (type (z_vicmph_zero VR128:$x)), ++ (z_vneg VR128:$x), VR128:$x)), ++ (lc (lp VR128:$x))>; ++ def : Pat<(type (vselect (type (z_vnot (z_vicmph_zero VR128:$x))), ++ VR128:$x, (z_vneg VR128:$x))), ++ (lc (lp VR128:$x))>; ++ def : Pat<(type (vselect (type (z_vicmpl_zero VR128:$x)), ++ VR128:$x, (z_vneg VR128:$x))), ++ (lc (lp VR128:$x))>; ++ def : Pat<(type (vselect (type (z_vnot (z_vicmpl_zero VR128:$x))), ++ (z_vneg VR128:$x), VR128:$x)), ++ (lc (lp VR128:$x))>; ++ def : Pat<(type (or (and (z_vsra_by_scalar VR128:$x, (i32 shift)), ++ (z_vneg VR128:$x)), ++ (and (z_vnot (z_vsra_by_scalar VR128:$x, (i32 shift))), ++ VR128:$x))), ++ (lp VR128:$x)>; ++ def : Pat<(type (or (and (z_vsra_by_scalar VR128:$x, (i32 shift)), ++ VR128:$x), ++ (and (z_vnot (z_vsra_by_scalar VR128:$x, (i32 shift))), ++ (z_vneg VR128:$x)))), ++ (lc (lp VR128:$x))>; ++ } ++} ++ ++defm : IntegerAbsoluteVectorOps; ++defm : IntegerAbsoluteVectorOps; ++defm : IntegerAbsoluteVectorOps; ++defm : IntegerAbsoluteVectorOps; ++ ++// Instantiate minimum- and maximum-related patterns for TYPE. CMPH is the ++// signed or unsigned "set if greater than" comparison instruction and ++// MIN and MAX are the associated minimum and maximum instructions. ++multiclass IntegerMinMaxVectorOps { ++ let Predicates = [FeatureVector] in { ++ def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$x, VR128:$y)), ++ (max VR128:$x, VR128:$y)>; ++ def : Pat<(type (vselect (cmph VR128:$x, VR128:$y), VR128:$y, VR128:$x)), ++ (min VR128:$x, VR128:$y)>; ++ def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)), ++ VR128:$x, VR128:$y)), ++ (min VR128:$x, VR128:$y)>; ++ def : Pat<(type (vselect (z_vnot (cmph VR128:$x, VR128:$y)), ++ VR128:$y, VR128:$x)), ++ (max VR128:$x, VR128:$y)>; ++ } ++} ++ ++// Signed min/max. ++defm : IntegerMinMaxVectorOps; ++defm : IntegerMinMaxVectorOps; ++defm : IntegerMinMaxVectorOps; ++defm : IntegerMinMaxVectorOps; ++ ++// Unsigned min/max. ++defm : IntegerMinMaxVectorOps; ++defm : IntegerMinMaxVectorOps; ++defm : IntegerMinMaxVectorOps; ++defm : IntegerMinMaxVectorOps; ++ ++//===----------------------------------------------------------------------===// ++// Integer comparison ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureVector] in { ++ // Element compare. ++ let Defs = [CC] in { ++ def VECB : CompareVRRa<"vecb", 0xE7DB, null_frag, v128b, 0>; ++ def VECH : CompareVRRa<"vech", 0xE7DB, null_frag, v128h, 1>; ++ def VECF : CompareVRRa<"vecf", 0xE7DB, null_frag, v128f, 2>; ++ def VECG : CompareVRRa<"vecg", 0xE7DB, null_frag, v128g, 3>; ++ } ++ ++ // Element compare logical. ++ let Defs = [CC] in { ++ def VECLB : CompareVRRa<"veclb", 0xE7D9, null_frag, v128b, 0>; ++ def VECLH : CompareVRRa<"veclh", 0xE7D9, null_frag, v128h, 1>; ++ def VECLF : CompareVRRa<"veclf", 0xE7D9, null_frag, v128f, 2>; ++ def VECLG : CompareVRRa<"veclg", 0xE7D9, null_frag, v128g, 3>; ++ } ++ ++ // Compare equal. ++ defm VCEQB : BinaryVRRbSPair<"vceqb", 0xE7F8, z_vicmpe, z_vicmpes, ++ v128b, v128b, 0>; ++ defm VCEQH : BinaryVRRbSPair<"vceqh", 0xE7F8, z_vicmpe, z_vicmpes, ++ v128h, v128h, 1>; ++ defm VCEQF : BinaryVRRbSPair<"vceqf", 0xE7F8, z_vicmpe, z_vicmpes, ++ v128f, v128f, 2>; ++ defm VCEQG : BinaryVRRbSPair<"vceqg", 0xE7F8, z_vicmpe, z_vicmpes, ++ v128g, v128g, 3>; ++ ++ // Compare high. 
++ defm VCHB : BinaryVRRbSPair<"vchb", 0xE7FB, z_vicmph, z_vicmphs, ++ v128b, v128b, 0>; ++ defm VCHH : BinaryVRRbSPair<"vchh", 0xE7FB, z_vicmph, z_vicmphs, ++ v128h, v128h, 1>; ++ defm VCHF : BinaryVRRbSPair<"vchf", 0xE7FB, z_vicmph, z_vicmphs, ++ v128f, v128f, 2>; ++ defm VCHG : BinaryVRRbSPair<"vchg", 0xE7FB, z_vicmph, z_vicmphs, ++ v128g, v128g, 3>; ++ ++ // Compare high logical. ++ defm VCHLB : BinaryVRRbSPair<"vchlb", 0xE7F9, z_vicmphl, z_vicmphls, ++ v128b, v128b, 0>; ++ defm VCHLH : BinaryVRRbSPair<"vchlh", 0xE7F9, z_vicmphl, z_vicmphls, ++ v128h, v128h, 1>; ++ defm VCHLF : BinaryVRRbSPair<"vchlf", 0xE7F9, z_vicmphl, z_vicmphls, ++ v128f, v128f, 2>; ++ defm VCHLG : BinaryVRRbSPair<"vchlg", 0xE7F9, z_vicmphl, z_vicmphls, ++ v128g, v128g, 3>; ++ ++ // Test under mask. ++ let Defs = [CC] in ++ def VTM : CompareVRRa<"vtm", 0xE7D8, z_vtm, v128b, 0>; ++} ++ ++//===----------------------------------------------------------------------===// ++// Floating-point arithmetic ++//===----------------------------------------------------------------------===// ++ ++// See comments in SystemZInstrFP.td for the suppression flags and ++// rounding modes. ++multiclass VectorRounding { ++ def : FPConversion; ++ def : FPConversion; ++ def : FPConversion; ++ def : FPConversion; ++ def : FPConversion; ++ def : FPConversion; ++} ++ ++let Predicates = [FeatureVector] in { ++ // Add. ++ def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>; ++ def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>; ++ ++ // Convert from fixed 64-bit. ++ def VCDGB : TernaryVRRa<"vcdgb", 0xE7C3, null_frag, v128db, v128g, 3, 0>; ++ def WCDGB : TernaryVRRa<"wcdgb", 0xE7C3, null_frag, v64db, v64g, 3, 8>; ++ def : FPConversion; ++ ++ // Convert from logical 64-bit. ++ def VCDLGB : TernaryVRRa<"vcdlgb", 0xE7C1, null_frag, v128db, v128g, 3, 0>; ++ def WCDLGB : TernaryVRRa<"wcdlgb", 0xE7C1, null_frag, v64db, v64g, 3, 8>; ++ def : FPConversion; ++ ++ // Convert to fixed 64-bit. ++ def VCGDB : TernaryVRRa<"vcgdb", 0xE7C2, null_frag, v128g, v128db, 3, 0>; ++ def WCGDB : TernaryVRRa<"wcgdb", 0xE7C2, null_frag, v64g, v64db, 3, 8>; ++ // Rounding mode should agree with SystemZInstrFP.td. ++ def : FPConversion; ++ ++ // Convert to logical 64-bit. ++ def VCLGDB : TernaryVRRa<"vclgdb", 0xE7C0, null_frag, v128g, v128db, 3, 0>; ++ def WCLGDB : TernaryVRRa<"wclgdb", 0xE7C0, null_frag, v64g, v64db, 3, 8>; ++ // Rounding mode should agree with SystemZInstrFP.td. ++ def : FPConversion; ++ ++ // Divide. ++ def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>; ++ def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>; ++ ++ // Load FP integer. ++ def VFIDB : TernaryVRRa<"vfidb", 0xE7C7, int_s390_vfidb, v128db, v128db, 3, 0>; ++ def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>; ++ defm : VectorRounding; ++ defm : VectorRounding; ++ ++ // Load lengthened. ++ def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>; ++ def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fextend, v64db, v32eb, 2, 8>; ++ ++ // Load rounded, ++ def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>; ++ def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>; ++ def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>; ++ def : FPConversion; ++ ++ // Multiply. ++ def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>; ++ def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>; ++ ++ // Multiply and add. 
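Before the fused multiply definitions that follow: VFMADB/WFMADB select LLVM's fma node, a multiply-add with a single rounding step, and the subtract forms simply negate the addend. A scalar C++ analogue using the standard library:

#include <cmath>

// One rounding for A*B+C, unlike the two roundings of (A*B)+C.
double fmaModel(double A, double B, double C) { return std::fma(A, B, C); }

// "Multiply and subtract" is the same operation with the addend negated.
double fmsModel(double A, double B, double C) { return std::fma(A, B, -C); }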
++ def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>; ++ def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>; ++ ++ // Multiply and subtract. ++ def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>; ++ def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>; ++ ++ // Load complement, ++ def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>; ++ def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>; ++ ++ // Load negative. ++ def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>; ++ def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>; ++ ++ // Load positive. ++ def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>; ++ def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>; ++ ++ // Square root. ++ def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>; ++ def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>; ++ ++ // Subtract. ++ def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>; ++ def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>; ++ ++ // Test data class immediate. ++ let Defs = [CC] in { ++ def VFTCIDB : BinaryVRIe<"vftcidb", 0xE74A, z_vftci, v128g, v128db, 3, 0>; ++ def WFTCIDB : BinaryVRIe<"wftcidb", 0xE74A, null_frag, v64g, v64db, 3, 8>; ++ } ++} ++ ++//===----------------------------------------------------------------------===// ++// Floating-point comparison ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureVector] in { ++ // Compare scalar. ++ let Defs = [CC] in ++ def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>; ++ ++ // Compare and signal scalar. ++ let Defs = [CC] in ++ def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, null_frag, v64db, 3>; ++ ++ // Compare equal. ++ defm VFCEDB : BinaryVRRcSPair<"vfcedb", 0xE7E8, z_vfcmpe, z_vfcmpes, ++ v128g, v128db, 3, 0>; ++ defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag, ++ v64g, v64db, 3, 8>; ++ ++ // Compare high. ++ defm VFCHDB : BinaryVRRcSPair<"vfchdb", 0xE7EB, z_vfcmph, z_vfcmphs, ++ v128g, v128db, 3, 0>; ++ defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag, ++ v64g, v64db, 3, 8>; ++ ++ // Compare high or equal. 
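An aside on the scalar compares defined at the top of this block, before the compare-high-or-equal pair that closes it: WFCDB is the quiet comparison, while WFKDB ("compare and signal") also raises the IEEE invalid exception for quiet NaN operands. Standard C++ exposes the same distinction (a sketch, assuming the default floating-point environment):

#include <cmath>

// Quiet predicate: no FE_INVALID is raised for quiet NaNs (like WFCDB).
bool quietLess(double A, double B) { return std::isless(A, B); }

// Built-in relationals are signaling predicates: a quiet NaN operand
// raises FE_INVALID (the behaviour WFKDB provides).
bool signalingLess(double A, double B) { return A < B; }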
++  defm VFCHEDB : BinaryVRRcSPair<"vfchedb", 0xE7EA, z_vfcmphe, z_vfcmphes,
++                                 v128g, v128db, 3, 0>;
++  defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag,
++                                 v64g, v64db, 3, 8>;
++}
++
++//===----------------------------------------------------------------------===//
++// Conversions
++//===----------------------------------------------------------------------===//
++
++def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
++def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
++def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
++def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
++def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
++
++def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
++def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
++def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
++def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
++def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
++
++def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
++def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
++def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
++def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
++def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
++
++def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
++def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
++def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
++def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
++def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
++
++def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
++def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
++def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
++def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
++def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
++
++def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
++def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
++def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
++def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
++def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
++
++//===----------------------------------------------------------------------===//
++// Replicating scalars
++//===----------------------------------------------------------------------===//
++
++// Define patterns for replicating a scalar GR32 into a vector of type TYPE.
++// INDEX is the element that holds the scalar after VLVGP32, i.e. 8 divided
++// by the element size in bytes, minus 1.
++class VectorReplicateScalar<ValueType type, Instruction insn, bits<16> index>
++  : Pat<(type (z_replicate GR32:$scalar)),
++        (insn (VLVGP32 GR32:$scalar, GR32:$scalar), index)>;
++
++def : VectorReplicateScalar<v16i8, VREPB, 7>;
++def : VectorReplicateScalar<v8i16, VREPH, 3>;
++def : VectorReplicateScalar<v4i32, VREPF, 1>;
++
++// i64 replications are just a single instruction.
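All the bitconvert patterns above intentionally select to the unmodified input register: a 128-bit vector bitcast changes only the type, never the bits, so it costs nothing. The scalar C++ equivalent of such a reinterpretation, for intuition (hypothetical helper; the single-instruction v2i64 replication mentioned above follows right after this aside):

#include <cstdint>
#include <cstring>

// Reinterpret the bits of a double as an integer with no value conversion,
// the scalar analogue of a no-op vector bitconvert.
static uint64_t reinterpretBits(double D) {
  uint64_t U;
  std::memcpy(&U, &D, sizeof U);
  return U;
}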
++def : Pat<(v2i64 (z_replicate GR64:$scalar)), ++ (VLVGP GR64:$scalar, GR64:$scalar)>; ++ ++//===----------------------------------------------------------------------===// ++// Floating-point insertion and extraction ++//===----------------------------------------------------------------------===// ++ ++// Moving 32-bit values between GPRs and FPRs can be done using VLVGF ++// and VLGVF. ++def LEFR : UnaryAliasVRS; ++def LFER : UnaryAliasVRS; ++def : Pat<(f32 (bitconvert (i32 GR32:$src))), (LEFR GR32:$src)>; ++def : Pat<(i32 (bitconvert (f32 VR32:$src))), ++ (EXTRACT_SUBREG (LFER VR32:$src), subreg_l32)>; ++ ++// Floating-point values are stored in element 0 of the corresponding ++// vector register. Scalar to vector conversion is just a subreg and ++// scalar replication can just replicate element 0 of the vector register. ++multiclass ScalarToVectorFP { ++ def : Pat<(vt (scalar_to_vector cls:$scalar)), ++ (INSERT_SUBREG (vt (IMPLICIT_DEF)), cls:$scalar, subreg)>; ++ def : Pat<(vt (z_replicate cls:$scalar)), ++ (vrep (INSERT_SUBREG (vt (IMPLICIT_DEF)), cls:$scalar, ++ subreg), 0)>; ++} ++defm : ScalarToVectorFP; ++defm : ScalarToVectorFP; ++ ++// Match v2f64 insertions. The AddedComplexity counters the 3 added by ++// TableGen for the base register operand in VLVG-based integer insertions ++// and ensures that this version is strictly better. ++let AddedComplexity = 4 in { ++ def : Pat<(z_vector_insert (v2f64 VR128:$vec), FP64:$elt, 0), ++ (VPDI (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FP64:$elt, ++ subreg_r64), VR128:$vec, 1)>; ++ def : Pat<(z_vector_insert (v2f64 VR128:$vec), FP64:$elt, 1), ++ (VPDI VR128:$vec, (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FP64:$elt, ++ subreg_r64), 0)>; ++} ++ ++// We extract floating-point element X by replicating (for elements other ++// than 0) and then taking a high subreg. The AddedComplexity counters the ++// 3 added by TableGen for the base register operand in VLGV-based integer ++// extractions and ensures that this version is strictly better. 
++let AddedComplexity = 4 in { ++ def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), 0)), ++ (EXTRACT_SUBREG VR128:$vec, subreg_r32)>; ++ def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), imm32zx2:$index)), ++ (EXTRACT_SUBREG (VREPF VR128:$vec, imm32zx2:$index), subreg_r32)>; ++ ++ def : Pat<(f64 (z_vector_extract (v2f64 VR128:$vec), 0)), ++ (EXTRACT_SUBREG VR128:$vec, subreg_r64)>; ++ def : Pat<(f64 (z_vector_extract (v2f64 VR128:$vec), imm32zx1:$index)), ++ (EXTRACT_SUBREG (VREPG VR128:$vec, imm32zx1:$index), subreg_r64)>; ++} ++ ++//===----------------------------------------------------------------------===// ++// String instructions ++//===----------------------------------------------------------------------===// ++ ++let Predicates = [FeatureVector] in { ++ defm VFAEB : TernaryVRRbSPair<"vfaeb", 0xE782, int_s390_vfaeb, z_vfae_cc, ++ v128b, v128b, 0, 0>; ++ defm VFAEH : TernaryVRRbSPair<"vfaeh", 0xE782, int_s390_vfaeh, z_vfae_cc, ++ v128h, v128h, 1, 0>; ++ defm VFAEF : TernaryVRRbSPair<"vfaef", 0xE782, int_s390_vfaef, z_vfae_cc, ++ v128f, v128f, 2, 0>; ++ defm VFAEZB : TernaryVRRbSPair<"vfaezb", 0xE782, int_s390_vfaezb, z_vfaez_cc, ++ v128b, v128b, 0, 2>; ++ defm VFAEZH : TernaryVRRbSPair<"vfaezh", 0xE782, int_s390_vfaezh, z_vfaez_cc, ++ v128h, v128h, 1, 2>; ++ defm VFAEZF : TernaryVRRbSPair<"vfaezf", 0xE782, int_s390_vfaezf, z_vfaez_cc, ++ v128f, v128f, 2, 2>; ++ ++ defm VFEEB : BinaryVRRbSPair<"vfeeb", 0xE780, int_s390_vfeeb, z_vfee_cc, ++ v128b, v128b, 0, 0, 1>; ++ defm VFEEH : BinaryVRRbSPair<"vfeeh", 0xE780, int_s390_vfeeh, z_vfee_cc, ++ v128h, v128h, 1, 0, 1>; ++ defm VFEEF : BinaryVRRbSPair<"vfeef", 0xE780, int_s390_vfeef, z_vfee_cc, ++ v128f, v128f, 2, 0, 1>; ++ defm VFEEZB : BinaryVRRbSPair<"vfeezb", 0xE780, int_s390_vfeezb, z_vfeez_cc, ++ v128b, v128b, 0, 2, 3>; ++ defm VFEEZH : BinaryVRRbSPair<"vfeezh", 0xE780, int_s390_vfeezh, z_vfeez_cc, ++ v128h, v128h, 1, 2, 3>; ++ defm VFEEZF : BinaryVRRbSPair<"vfeezf", 0xE780, int_s390_vfeezf, z_vfeez_cc, ++ v128f, v128f, 2, 2, 3>; ++ ++ defm VFENEB : BinaryVRRbSPair<"vfeneb", 0xE781, int_s390_vfeneb, z_vfene_cc, ++ v128b, v128b, 0, 0, 1>; ++ defm VFENEH : BinaryVRRbSPair<"vfeneh", 0xE781, int_s390_vfeneh, z_vfene_cc, ++ v128h, v128h, 1, 0, 1>; ++ defm VFENEF : BinaryVRRbSPair<"vfenef", 0xE781, int_s390_vfenef, z_vfene_cc, ++ v128f, v128f, 2, 0, 1>; ++ defm VFENEZB : BinaryVRRbSPair<"vfenezb", 0xE781, int_s390_vfenezb, ++ z_vfenez_cc, v128b, v128b, 0, 2, 3>; ++ defm VFENEZH : BinaryVRRbSPair<"vfenezh", 0xE781, int_s390_vfenezh, ++ z_vfenez_cc, v128h, v128h, 1, 2, 3>; ++ defm VFENEZF : BinaryVRRbSPair<"vfenezf", 0xE781, int_s390_vfenezf, ++ z_vfenez_cc, v128f, v128f, 2, 2, 3>; ++ ++ defm VISTRB : UnaryVRRaSPair<"vistrb", 0xE75C, int_s390_vistrb, z_vistr_cc, ++ v128b, v128b, 0>; ++ defm VISTRH : UnaryVRRaSPair<"vistrh", 0xE75C, int_s390_vistrh, z_vistr_cc, ++ v128h, v128h, 1>; ++ defm VISTRF : UnaryVRRaSPair<"vistrf", 0xE75C, int_s390_vistrf, z_vistr_cc, ++ v128f, v128f, 2>; ++ ++ defm VSTRCB : QuaternaryVRRdSPair<"vstrcb", 0xE78A, int_s390_vstrcb, ++ z_vstrc_cc, v128b, v128b, 0, 0>; ++ defm VSTRCH : QuaternaryVRRdSPair<"vstrch", 0xE78A, int_s390_vstrch, ++ z_vstrc_cc, v128h, v128h, 1, 0>; ++ defm VSTRCF : QuaternaryVRRdSPair<"vstrcf", 0xE78A, int_s390_vstrcf, ++ z_vstrc_cc, v128f, v128f, 2, 0>; ++ defm VSTRCZB : QuaternaryVRRdSPair<"vstrczb", 0xE78A, int_s390_vstrczb, ++ z_vstrcz_cc, v128b, v128b, 0, 2>; ++ defm VSTRCZH : QuaternaryVRRdSPair<"vstrczh", 0xE78A, int_s390_vstrczh, ++ z_vstrcz_cc, v128h, v128h, 1, 2>; ++ defm VSTRCZF 
: QuaternaryVRRdSPair<"vstrczf", 0xE78A, int_s390_vstrczf,
++                                    z_vstrcz_cc, v128f, v128f, 2, 2>;
++}
+Index: llvm-36/lib/Target/SystemZ/SystemZLDCleanup.cpp
+===================================================================
+--- /dev/null
++++ llvm-36/lib/Target/SystemZ/SystemZLDCleanup.cpp
+@@ -0,0 +1,143 @@
++//===-- SystemZLDCleanup.cpp - Clean up local-dynamic TLS accesses --------===//
++//
++// The LLVM Compiler Infrastructure
++//
++// This file is distributed under the University of Illinois Open Source
++// License. See LICENSE.TXT for details.
++//
++//===----------------------------------------------------------------------===//
++//
++// This pass combines multiple accesses to local-dynamic TLS variables so that
++// the TLS base address for the module is only fetched once per execution path
++// through the function.
++//
++//===----------------------------------------------------------------------===//
++
++#include "SystemZTargetMachine.h"
++#include "SystemZMachineFunctionInfo.h"
++#include "llvm/CodeGen/MachineDominators.h"
++#include "llvm/CodeGen/MachineFunctionPass.h"
++#include "llvm/CodeGen/MachineInstrBuilder.h"
++#include "llvm/CodeGen/MachineRegisterInfo.h"
++#include "llvm/Target/TargetInstrInfo.h"
++#include "llvm/Target/TargetMachine.h"
++#include "llvm/Target/TargetRegisterInfo.h"
++
++using namespace llvm;
++
++namespace {
++
++class SystemZLDCleanup : public MachineFunctionPass {
++public:
++  static char ID;
++  SystemZLDCleanup(const SystemZTargetMachine &tm)
++    : MachineFunctionPass(ID), TII(nullptr), MF(nullptr) {}
++
++  const char *getPassName() const override {
++    return "SystemZ Local Dynamic TLS Access Clean-up";
++  }
++
++  bool runOnMachineFunction(MachineFunction &MF) override;
++  void getAnalysisUsage(AnalysisUsage &AU) const override;
++
++private:
++  bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg);
++  MachineInstr *ReplaceTLSCall(MachineInstr *I, unsigned TLSBaseAddrReg);
++  MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg);
++
++  const SystemZInstrInfo *TII;
++  MachineFunction *MF;
++};
++
++char SystemZLDCleanup::ID = 0;
++
++} // end anonymous namespace
++
++FunctionPass *llvm::createSystemZLDCleanupPass(SystemZTargetMachine &TM) {
++  return new SystemZLDCleanup(TM);
++}
++
++void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const {
++  AU.setPreservesCFG();
++  AU.addRequired<MachineDominatorTree>();
++  MachineFunctionPass::getAnalysisUsage(AU);
++}
++
++bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) {
++  TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
++  MF = &F;
++
++  SystemZMachineFunctionInfo *MFI = F.getInfo<SystemZMachineFunctionInfo>();
++  if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
++    // No point folding accesses if there aren't at least two.
++    return false;
++  }
++
++  MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
++  return VisitNode(DT->getRootNode(), 0);
++}
++
++// Visit the dominator subtree rooted at Node in pre-order.
++// If TLSBaseAddrReg is non-null, then use that to replace any
++// TLS_LDCALL instructions. Otherwise, create the register
++// when the first such instruction is seen, and then use it
++// as we encounter more instructions.
++bool SystemZLDCleanup::VisitNode(MachineDomTreeNode *Node,
++                                 unsigned TLSBaseAddrReg) {
++  MachineBasicBlock *BB = Node->getBlock();
++  bool Changed = false;
++
++  // Traverse the current block.
++  for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
++    switch (I->getOpcode()) {
++    case SystemZ::TLS_LDCALL:
++      if (TLSBaseAddrReg)
++        I = ReplaceTLSCall(I, TLSBaseAddrReg);
++      else
++        I = SetRegister(I, &TLSBaseAddrReg);
++      Changed = true;
++      break;
++    default:
++      break;
++    }
++  }
++
++  // Visit the children of this block in the dominator tree.
++  for (auto I = Node->begin(), E = Node->end(); I != E; ++I)
++    Changed |= VisitNode(*I, TLSBaseAddrReg);
++
++  return Changed;
++}
++
++// Replace the TLS_LDCALL instruction I with a copy from TLSBaseAddrReg,
++// returning the new instruction.
++MachineInstr *SystemZLDCleanup::ReplaceTLSCall(MachineInstr *I,
++                                               unsigned TLSBaseAddrReg) {
++  // Insert a Copy from TLSBaseAddrReg to R2.
++  MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
++                               TII->get(TargetOpcode::COPY), SystemZ::R2D)
++                           .addReg(TLSBaseAddrReg);
++
++  // Erase the TLS_LDCALL instruction.
++  I->eraseFromParent();
++
++  return Copy;
++}
++
++// Create a virtual register in *TLSBaseAddrReg, and populate it by
++// inserting a copy instruction after I. Returns the new instruction.
++MachineInstr *SystemZLDCleanup::SetRegister(MachineInstr *I,
++                                            unsigned *TLSBaseAddrReg) {
++  // Create a virtual register for the TLS base address.
++  MachineRegisterInfo &RegInfo = MF->getRegInfo();
++  *TLSBaseAddrReg = RegInfo.createVirtualRegister(&SystemZ::GR64BitRegClass);
++
++  // Insert a copy from R2 to TLSBaseAddrReg.
++  MachineInstr *Next = I->getNextNode();
++  MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
++                               TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
++                           .addReg(SystemZ::R2D);
++
++  return Copy;
++}
++
+Index: llvm-36/lib/Target/SystemZ/SystemZMCInstLower.cpp
+===================================================================
+--- llvm-36.orig/lib/Target/SystemZ/SystemZMCInstLower.cpp
++++ llvm-36/lib/Target/SystemZ/SystemZMCInstLower.cpp
+@@ -22,6 +22,8 @@ static MCSymbolRefExpr::VariantKind getV
+     return MCSymbolRefExpr::VK_None;
+   case SystemZII::MO_GOT:
+     return MCSymbolRefExpr::VK_GOT;
++  case SystemZII::MO_INDNTPOFF:
++    return MCSymbolRefExpr::VK_INDNTPOFF;
+   }
+   llvm_unreachable("Unrecognised MO_ACCESS_MODEL");
+ }
+Index: llvm-36/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+===================================================================
+--- llvm-36.orig/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
++++ llvm-36/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+@@ -23,11 +23,13 @@ class SystemZMachineFunctionInfo : publi
+   unsigned VarArgsFrameIndex;
+   unsigned RegSaveFrameIndex;
+   bool ManipulatesSP;
++  unsigned NumLocalDynamics;
+
+ public:
+   explicit SystemZMachineFunctionInfo(MachineFunction &MF)
+     : LowSavedGPR(0), HighSavedGPR(0), VarArgsFirstGPR(0), VarArgsFirstFPR(0),
+-      VarArgsFrameIndex(0), RegSaveFrameIndex(0), ManipulatesSP(false) {}
++      VarArgsFrameIndex(0), RegSaveFrameIndex(0), ManipulatesSP(false),
++      NumLocalDynamics(0) {}
+
+   // Get and set the first call-saved GPR that should be saved and restored
+   // by this function. This is 0 if no GPRs need to be saved or restored.
+@@ -61,6 +63,10 @@ public:
+   // e.g. through STACKSAVE or STACKRESTORE.
+   bool getManipulatesSP() const { return ManipulatesSP; }
+   void setManipulatesSP(bool MSP) { ManipulatesSP = MSP; }
++
++  // Count number of local-dynamic TLS symbols used.
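The NumLocalDynamics counter above is what gates this pass: lowering bumps it once per local-dynamic access, and SystemZLDCleanup only does work once it reaches two. For illustration, a minimal C++ translation unit that produces two such accesses when built with -fPIC (a hypothetical example, not taken from the patch):

// Hypothetical: with -fPIC, file-local thread_local variables use the
// local-dynamic TLS model, so each access below lowers to a TLS_LDCALL
// (a call to __tls_get_offset on SystemZ).
static thread_local int Counter = 0;
static thread_local int Limit = 100;

int remaining() { return Limit - Counter; }

Before the pass runs, remaining() fetches the TLS base twice; afterwards the dominating TLS_LDCALL survives and the second access becomes a plain COPY from the cached virtual register.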
++ unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } ++ void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + }; + + } // end namespace llvm +Index: llvm-36/lib/Target/SystemZ/SystemZOperands.td +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZOperands.td ++++ llvm-36/lib/Target/SystemZ/SystemZOperands.td +@@ -16,6 +16,11 @@ class ImmediateAsmOperand + let Name = name; + let RenderMethod = "addImmOperands"; + } ++class ImmediateTLSAsmOperand ++ : AsmOperandClass { ++ let Name = name; ++ let RenderMethod = "addImmTLSOperands"; ++} + + // Constructs both a DAG pattern and instruction operand for an immediate + // of type VT. PRED returns true if a node is acceptable and XFORM returns +@@ -34,6 +39,11 @@ class PCRelAsmOperand : Imm + let PredicateMethod = "isImm"; + let ParserMethod = "parsePCRel"##size; + } ++class PCRelTLSAsmOperand ++ : ImmediateTLSAsmOperand<"PCRelTLS"##size> { ++ let PredicateMethod = "isImmTLS"; ++ let ParserMethod = "parsePCRelTLS"##size; ++} + + // Constructs an operand for a PC-relative address with address type VT. + // ASMOP is the associated asm operand. +@@ -41,6 +51,10 @@ class PCRelOperand : Operand { ++ let PrintMethod = "printPCRelTLSOperand"; ++ let ParserMatchClass = asmop; ++} + + // Constructs both a DAG pattern and instruction operand for a PC-relative + // address with address size VT. SELF is the name of the operand and +@@ -64,6 +78,22 @@ class AddressAsmOperand ++ : Operand("i"##bitsize)> { ++ let PrintMethod = "print"##format##"Operand"; ++ let EncoderMethod = "get"##format##dispsize##length##"Encoding"; ++ let DecoderMethod = ++ "decode"##format##bitsize##"Disp"##dispsize##length##"Operand"; ++ let MIOperandInfo = operands; ++ let ParserMatchClass = ++ !cast(format##bitsize##"Disp"##dispsize##length); ++} ++ + // Constructs both a DAG pattern and instruction operand for an addressing mode. + // FORMAT, BITSIZE, DISPSIZE and LENGTH are the parameters to an associated + // AddressAsmOperand. OPERANDS is a list of NUMOPS individual operands +@@ -79,15 +109,7 @@ class AddressingMode("i"##bitsize), numops, + "select"##seltype##dispsize##suffix##length, + [add, sub, or, frameindex, z_adjdynalloc]>, +- Operand("i"##bitsize)> { +- let PrintMethod = "print"##format##"Operand"; +- let EncoderMethod = "get"##format##dispsize##length##"Encoding"; +- let DecoderMethod = +- "decode"##format##bitsize##"Disp"##dispsize##length##"Operand"; +- let MIOperandInfo = operands; +- let ParserMatchClass = +- !cast(format##bitsize##"Disp"##dispsize##length); +-} ++ AddressOperand; + + // An addressing mode with a base and displacement but no index. + class BDMode +@@ -111,6 +133,13 @@ class BDLMode("disp"##dispsize##"imm"##bitsize), + !cast("imm"##bitsize))>; + ++// An addressing mode with a base, displacement and a vector index. ++class BDVMode ++ : AddressOperand("ADDR"##bitsize), ++ !cast("disp"##dispsize##"imm"##bitsize), ++ !cast("VR128"))>; ++ + //===----------------------------------------------------------------------===// + // Extracting immediate operands from nodes + // These all create MVT::i64 nodes to ensure the value is not sign-extended +@@ -163,6 +192,16 @@ def UIMM8 : SDNodeXFormgetTargetConstant(uint8_t(N->getZExtValue()), MVT::i64); + }]>; + ++// Truncate an immediate to a 8-bit unsigned quantity and mask off low bit. 
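In plain C++ terms, the UIMM8EVEN transform defined next does the following (an illustrative sketch, not part of the patch):

#include <cstdint>

// Keep the low 8 bits of the value and clear bit 0 so the result is even.
constexpr std::uint64_t uimm8even(std::uint64_t V) { return V & 0xfe; }
static_assert(uimm8even(0x1ff) == 0xfe, "truncated to 8 bits, low bit cleared");

The even result matters for the imm32zx4even operand further down, whose comment notes that evenness is enforced during code generation only.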
++def UIMM8EVEN : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0xfe, MVT::i64); ++}]>; ++ ++// Truncate an immediate to a 12-bit unsigned quantity. ++def UIMM12 : SDNodeXFormgetTargetConstant(N->getZExtValue() & 0xfff, MVT::i64); ++}]>; ++ + // Truncate an immediate to a 16-bit signed quantity. + def SIMM16 : SDNodeXFormgetTargetConstant(int16_t(N->getZExtValue()), MVT::i64); +@@ -192,10 +231,14 @@ def NEGIMM32 : SDNodeXForm; ++def U2Imm : ImmediateAsmOperand<"U2Imm">; ++def U3Imm : ImmediateAsmOperand<"U3Imm">; + def U4Imm : ImmediateAsmOperand<"U4Imm">; + def U6Imm : ImmediateAsmOperand<"U6Imm">; + def S8Imm : ImmediateAsmOperand<"S8Imm">; + def U8Imm : ImmediateAsmOperand<"U8Imm">; ++def U12Imm : ImmediateAsmOperand<"U12Imm">; + def S16Imm : ImmediateAsmOperand<"S16Imm">; + def U16Imm : ImmediateAsmOperand<"U16Imm">; + def S32Imm : ImmediateAsmOperand<"S32Imm">; +@@ -226,10 +269,28 @@ def imm32lh16c : Immediate; + + // Short immediates ++def imm32zx1 : Immediate(N->getZExtValue()); ++}], NOOP_SDNodeXForm, "U1Imm">; ++ ++def imm32zx2 : Immediate(N->getZExtValue()); ++}], NOOP_SDNodeXForm, "U2Imm">; ++ ++def imm32zx3 : Immediate(N->getZExtValue()); ++}], NOOP_SDNodeXForm, "U3Imm">; ++ + def imm32zx4 : Immediate(N->getZExtValue()); + }], NOOP_SDNodeXForm, "U4Imm">; + ++// Note: this enforces an even value during code generation only. ++// When used from the assembler, any 4-bit value is allowed. ++def imm32zx4even : Immediate(N->getZExtValue()); ++}], UIMM8EVEN, "U4Imm">; ++ + def imm32zx6 : Immediate(N->getZExtValue()); + }], NOOP_SDNodeXForm, "U6Imm">; +@@ -244,6 +305,10 @@ def imm32zx8 : Immediate; + ++def imm32zx12 : Immediate(N->getZExtValue()); ++}], UIMM12, "U12Imm">; ++ + def imm32sx16 : Immediate(N->getSExtValue()); + }], SIMM16, "S16Imm">; +@@ -370,6 +435,8 @@ def fpimmneg0 : PatLeaf<(fpimm), [{ retu + // PC-relative asm operands. + def PCRel16 : PCRelAsmOperand<"16">; + def PCRel32 : PCRelAsmOperand<"32">; ++def PCRelTLS16 : PCRelTLSAsmOperand<"16">; ++def PCRelTLS32 : PCRelTLSAsmOperand<"32">; + + // PC-relative offsets of a basic block. The offset is sign-extended + // and multiplied by 2. +@@ -382,6 +449,20 @@ def brtarget32 : PCRelOperand { } ++def brtarget16tls : PCRelTLSOperand { ++ let MIOperandInfo = (ops brtarget16:$func, tlssym:$sym); ++ let EncoderMethod = "getPC16DBLTLSEncoding"; ++ let DecoderMethod = "decodePC16DBLOperand"; ++} ++def brtarget32tls : PCRelTLSOperand { ++ let MIOperandInfo = (ops brtarget32:$func, tlssym:$sym); ++ let EncoderMethod = "getPC32DBLTLSEncoding"; ++ let DecoderMethod = "decodePC32DBLOperand"; ++} ++ + // A PC-relative offset of a global value. The offset is sign-extended + // and multiplied by 2. + def pcrel32 : PCRelAddress { +@@ -408,6 +489,7 @@ def BDAddr64Disp20 : AddressAsmOper + def BDXAddr64Disp12 : AddressAsmOperand<"BDXAddr", "64", "12">; + def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">; + def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr", "64", "12", "Len8">; ++def BDVAddr64Disp12 : AddressAsmOperand<"BDVAddr", "64", "12">; + + // DAG patterns and operands for addressing modes. 
Each mode has + // the form [] where: +@@ -420,6 +502,7 @@ def BDLAddr64Disp12Len8 : AddressAsmOper + // laaddr : like bdxaddr, but used for Load Address operations + // dynalloc : base + displacement + index + ADJDYNALLOC + // bdladdr : base + displacement with a length field ++// bdvaddr : base + displacement with a vector index + // + // is one of: + // 12 : the displacement is an unsigned 12-bit value +@@ -452,6 +535,7 @@ def dynalloc12only : BDXMode<"DynAllo + def laaddr12pair : BDXMode<"LAAddr", "64", "12", "Pair">; + def laaddr20pair : BDXMode<"LAAddr", "64", "20", "Pair">; + def bdladdr12onlylen8 : BDLMode<"BDLAddr", "64", "12", "Only", "8">; ++def bdvaddr12only : BDVMode< "64", "12">; + + //===----------------------------------------------------------------------===// + // Miscellaneous +Index: llvm-36/lib/Target/SystemZ/SystemZOperators.td +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZOperators.td ++++ llvm-36/lib/Target/SystemZ/SystemZOperators.td +@@ -79,6 +79,64 @@ def SDT_ZI32Intrinsic : SDTypeProf + def SDT_ZPrefetch : SDTypeProfile<0, 2, + [SDTCisVT<0, i32>, + SDTCisPtrTy<1>]>; ++def SDT_ZTBegin : SDTypeProfile<0, 2, ++ [SDTCisPtrTy<0>, ++ SDTCisVT<1, i32>]>; ++def SDT_ZInsertVectorElt : SDTypeProfile<1, 3, ++ [SDTCisVec<0>, ++ SDTCisSameAs<0, 1>, ++ SDTCisVT<3, i32>]>; ++def SDT_ZExtractVectorElt : SDTypeProfile<1, 2, ++ [SDTCisVec<1>, ++ SDTCisVT<2, i32>]>; ++def SDT_ZReplicate : SDTypeProfile<1, 1, ++ [SDTCisVec<0>]>; ++def SDT_ZVecUnaryConv : SDTypeProfile<1, 1, ++ [SDTCisVec<0>, ++ SDTCisVec<1>]>; ++def SDT_ZVecUnary : SDTypeProfile<1, 1, ++ [SDTCisVec<0>, ++ SDTCisSameAs<0, 1>]>; ++def SDT_ZVecBinary : SDTypeProfile<1, 2, ++ [SDTCisVec<0>, ++ SDTCisSameAs<0, 1>, ++ SDTCisSameAs<0, 2>]>; ++def SDT_ZVecBinaryInt : SDTypeProfile<1, 2, ++ [SDTCisVec<0>, ++ SDTCisSameAs<0, 1>, ++ SDTCisVT<2, i32>]>; ++def SDT_ZVecBinaryConv : SDTypeProfile<1, 2, ++ [SDTCisVec<0>, ++ SDTCisVec<1>, ++ SDTCisSameAs<1, 2>]>; ++def SDT_ZVecBinaryConvInt : SDTypeProfile<1, 2, ++ [SDTCisVec<0>, ++ SDTCisVec<1>, ++ SDTCisVT<2, i32>]>; ++def SDT_ZRotateMask : SDTypeProfile<1, 2, ++ [SDTCisVec<0>, ++ SDTCisVT<1, i32>, ++ SDTCisVT<2, i32>]>; ++def SDT_ZJoinDwords : SDTypeProfile<1, 2, ++ [SDTCisVT<0, v2i64>, ++ SDTCisVT<1, i64>, ++ SDTCisVT<2, i64>]>; ++def SDT_ZVecTernary : SDTypeProfile<1, 3, ++ [SDTCisVec<0>, ++ SDTCisSameAs<0, 1>, ++ SDTCisSameAs<0, 2>, ++ SDTCisSameAs<0, 3>]>; ++def SDT_ZVecTernaryInt : SDTypeProfile<1, 3, ++ [SDTCisVec<0>, ++ SDTCisSameAs<0, 1>, ++ SDTCisSameAs<0, 2>, ++ SDTCisVT<3, i32>]>; ++def SDT_ZVecQuaternaryInt : SDTypeProfile<1, 4, ++ [SDTCisVec<0>, ++ SDTCisSameAs<0, 1>, ++ SDTCisSameAs<0, 2>, ++ SDTCisSameAs<0, 3>, ++ SDTCisVT<4, i32>]>; + + //===----------------------------------------------------------------------===// + // Node definitions +@@ -90,6 +148,7 @@ def callseq_start : SDNode<"ISD::C + def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd, + [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, + SDNPOutGlue]>; ++def global_offset_table : SDNode<"ISD::GLOBAL_OFFSET_TABLE", SDTPtrLeaf>; + + // Nodes for SystemZISD::*. See SystemZISelLowering.h for more details. 
+ def z_retflag : SDNode<"SystemZISD::RET_FLAG", SDTNone, +@@ -100,6 +159,12 @@ def z_call : SDNode<"System + def z_sibcall : SDNode<"SystemZISD::SIBCALL", SDT_ZCall, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, + SDNPVariadic]>; ++def z_tls_gdcall : SDNode<"SystemZISD::TLS_GDCALL", SDT_ZCall, ++ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, ++ SDNPVariadic]>; ++def z_tls_ldcall : SDNode<"SystemZISD::TLS_LDCALL", SDT_ZCall, ++ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, ++ SDNPVariadic]>; + def z_pcrel_wrapper : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>; + def z_pcrel_offset : SDNode<"SystemZISD::PCREL_OFFSET", + SDT_ZWrapOffset, []>; +@@ -114,6 +179,7 @@ def z_select_ccmask : SDNode<"System + def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>; + def z_extract_access : SDNode<"SystemZISD::EXTRACT_ACCESS", + SDT_ZExtractAccess>; ++def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>; + def z_umul_lohi64 : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>; + def z_sdivrem32 : SDNode<"SystemZISD::SDIVREM32", SDT_ZGR128Binary32>; + def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>; +@@ -123,6 +189,80 @@ def z_udivrem64 : SDNode<"System + def z_serialize : SDNode<"SystemZISD::SERIALIZE", SDTNone, + [SDNPHasChain, SDNPMayStore]>; + ++// Defined because the index is an i32 rather than a pointer. ++def z_vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT", ++ SDT_ZInsertVectorElt>; ++def z_vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT", ++ SDT_ZExtractVectorElt>; ++def z_byte_mask : SDNode<"SystemZISD::BYTE_MASK", SDT_ZReplicate>; ++def z_rotate_mask : SDNode<"SystemZISD::ROTATE_MASK", SDT_ZRotateMask>; ++def z_replicate : SDNode<"SystemZISD::REPLICATE", SDT_ZReplicate>; ++def z_join_dwords : SDNode<"SystemZISD::JOIN_DWORDS", SDT_ZJoinDwords>; ++def z_splat : SDNode<"SystemZISD::SPLAT", SDT_ZVecBinaryInt>; ++def z_merge_high : SDNode<"SystemZISD::MERGE_HIGH", SDT_ZVecBinary>; ++def z_merge_low : SDNode<"SystemZISD::MERGE_LOW", SDT_ZVecBinary>; ++def z_shl_double : SDNode<"SystemZISD::SHL_DOUBLE", SDT_ZVecTernaryInt>; ++def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS", ++ SDT_ZVecTernaryInt>; ++def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>; ++def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>; ++def z_packs_cc : SDNode<"SystemZISD::PACKS_CC", SDT_ZVecBinaryConv, ++ [SDNPOutGlue]>; ++def z_packls_cc : SDNode<"SystemZISD::PACKLS_CC", SDT_ZVecBinaryConv, ++ [SDNPOutGlue]>; ++def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>; ++def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>; ++def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>; ++def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnaryConv>; ++def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR", ++ SDT_ZVecBinaryInt>; ++def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR", ++ SDT_ZVecBinaryInt>; ++def z_vsra_by_scalar : SDNode<"SystemZISD::VSRA_BY_SCALAR", ++ SDT_ZVecBinaryInt>; ++def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZVecBinaryConv>; ++def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecBinary>; ++def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecBinary>; ++def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecBinary>; ++def z_vicmpes : SDNode<"SystemZISD::VICMPES", SDT_ZVecBinary, ++ [SDNPOutGlue]>; ++def z_vicmphs : SDNode<"SystemZISD::VICMPHS", SDT_ZVecBinary, ++ [SDNPOutGlue]>; ++def z_vicmphls : SDNode<"SystemZISD::VICMPHLS", SDT_ZVecBinary, ++ 
[SDNPOutGlue]>; ++def z_vfcmpe : SDNode<"SystemZISD::VFCMPE", SDT_ZVecBinaryConv>; ++def z_vfcmph : SDNode<"SystemZISD::VFCMPH", SDT_ZVecBinaryConv>; ++def z_vfcmphe : SDNode<"SystemZISD::VFCMPHE", SDT_ZVecBinaryConv>; ++def z_vfcmpes : SDNode<"SystemZISD::VFCMPES", SDT_ZVecBinaryConv, ++ [SDNPOutGlue]>; ++def z_vfcmphs : SDNode<"SystemZISD::VFCMPHS", SDT_ZVecBinaryConv, ++ [SDNPOutGlue]>; ++def z_vfcmphes : SDNode<"SystemZISD::VFCMPHES", SDT_ZVecBinaryConv, ++ [SDNPOutGlue]>; ++def z_vextend : SDNode<"SystemZISD::VEXTEND", SDT_ZVecUnaryConv>; ++def z_vround : SDNode<"SystemZISD::VROUND", SDT_ZVecUnaryConv>; ++def z_vtm : SDNode<"SystemZISD::VTM", SDT_ZCmp, [SDNPOutGlue]>; ++def z_vfae_cc : SDNode<"SystemZISD::VFAE_CC", SDT_ZVecTernaryInt, ++ [SDNPOutGlue]>; ++def z_vfaez_cc : SDNode<"SystemZISD::VFAEZ_CC", SDT_ZVecTernaryInt, ++ [SDNPOutGlue]>; ++def z_vfee_cc : SDNode<"SystemZISD::VFEE_CC", SDT_ZVecBinary, ++ [SDNPOutGlue]>; ++def z_vfeez_cc : SDNode<"SystemZISD::VFEEZ_CC", SDT_ZVecBinary, ++ [SDNPOutGlue]>; ++def z_vfene_cc : SDNode<"SystemZISD::VFENE_CC", SDT_ZVecBinary, ++ [SDNPOutGlue]>; ++def z_vfenez_cc : SDNode<"SystemZISD::VFENEZ_CC", SDT_ZVecBinary, ++ [SDNPOutGlue]>; ++def z_vistr_cc : SDNode<"SystemZISD::VISTR_CC", SDT_ZVecUnary, ++ [SDNPOutGlue]>; ++def z_vstrc_cc : SDNode<"SystemZISD::VSTRC_CC", SDT_ZVecQuaternaryInt, ++ [SDNPOutGlue]>; ++def z_vstrcz_cc : SDNode<"SystemZISD::VSTRCZ_CC", ++ SDT_ZVecQuaternaryInt, [SDNPOutGlue]>; ++def z_vftci : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvInt, ++ [SDNPOutGlue]>; ++ + class AtomicWOp + : SDNode<"SystemZISD::"##name, profile, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +@@ -172,6 +312,19 @@ def z_prefetch : SDNode<"System + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, + SDNPMemOperand]>; + ++def z_tbegin : SDNode<"SystemZISD::TBEGIN", SDT_ZTBegin, ++ [SDNPHasChain, SDNPOutGlue, SDNPMayStore, ++ SDNPSideEffect]>; ++def z_tbegin_nofloat : SDNode<"SystemZISD::TBEGIN_NOFLOAT", SDT_ZTBegin, ++ [SDNPHasChain, SDNPOutGlue, SDNPMayStore, ++ SDNPSideEffect]>; ++def z_tend : SDNode<"SystemZISD::TEND", SDTNone, ++ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; ++ ++def z_vshl : SDNode<"ISD::SHL", SDT_ZVecBinary>; ++def z_vsra : SDNode<"ISD::SRA", SDT_ZVecBinary>; ++def z_vsrl : SDNode<"ISD::SRL", SDT_ZVecBinary>; ++ + //===----------------------------------------------------------------------===// + // Pattern fragments + //===----------------------------------------------------------------------===// +@@ -195,11 +348,21 @@ def sext8 : PatFrag<(ops node:$src), (s + def sext16 : PatFrag<(ops node:$src), (sext_inreg node:$src, i16)>; + def sext32 : PatFrag<(ops node:$src), (sext (i32 node:$src))>; + ++// Match extensions of an i32 to an i64, followed by an in-register sign ++// extension from a sub-i32 value. ++def sext8dbl : PatFrag<(ops node:$src), (sext8 (anyext node:$src))>; ++def sext16dbl : PatFrag<(ops node:$src), (sext16 (anyext node:$src))>; ++ + // Register zero-extend operations. Sub-32-bit values are represented as i32s. + def zext8 : PatFrag<(ops node:$src), (and node:$src, 0xff)>; + def zext16 : PatFrag<(ops node:$src), (and node:$src, 0xffff)>; + def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>; + ++// Match extensions of an i32 to an i64, followed by an AND of the low ++// i8 or i16 part. ++def zext8dbl : PatFrag<(ops node:$src), (zext8 (anyext node:$src))>; ++def zext16dbl : PatFrag<(ops node:$src), (zext16 (anyext node:$src))>; ++ + // Typed floating-point loads. 
+ def loadf32 : PatFrag<(ops node:$src), (f32 (load node:$src))>; + def loadf64 : PatFrag<(ops node:$src), (f64 (load node:$src))>; +@@ -363,6 +526,14 @@ def z_iabs64 : PatFrag<(ops node:$src), + def z_inegabs32 : PatFrag<(ops node:$src), (ineg (z_iabs32 node:$src))>; + def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>; + ++// Integer multiply-and-add ++def z_muladd : PatFrag<(ops node:$src1, node:$src2, node:$src3), ++ (add (mul node:$src1, node:$src2), node:$src3)>; ++ ++// Fused multiply-subtract, using the natural operand order. ++def fms : PatFrag<(ops node:$src1, node:$src2, node:$src3), ++ (fma node:$src1, node:$src2, (fneg node:$src3))>; ++ + // Fused multiply-add and multiply-subtract, but with the order of the + // operands matching SystemZ's MA and MS instructions. + def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3), +@@ -383,3 +554,110 @@ class loadu + : PatFrag<(ops node:$value, node:$addr), + (store (operator node:$value), node:$addr)>; ++ ++// Vector representation of all-zeros and all-ones. ++def z_vzero : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 0))))>; ++def z_vones : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 65535))))>; ++ ++// Load a scalar and replicate it in all elements of a vector. ++class z_replicate_load ++ : PatFrag<(ops node:$addr), ++ (z_replicate (scalartype (load node:$addr)))>; ++def z_replicate_loadi8 : z_replicate_load; ++def z_replicate_loadi16 : z_replicate_load; ++def z_replicate_loadi32 : z_replicate_load; ++def z_replicate_loadi64 : z_replicate_load; ++def z_replicate_loadf32 : z_replicate_load; ++def z_replicate_loadf64 : z_replicate_load; ++ ++// Load a scalar and insert it into a single element of a vector. ++class z_vle ++ : PatFrag<(ops node:$vec, node:$addr, node:$index), ++ (z_vector_insert node:$vec, (scalartype (load node:$addr)), ++ node:$index)>; ++def z_vlei8 : z_vle; ++def z_vlei16 : z_vle; ++def z_vlei32 : z_vle; ++def z_vlei64 : z_vle; ++def z_vlef32 : z_vle; ++def z_vlef64 : z_vle; ++ ++// Load a scalar and insert it into the low element of the high i64 of a ++// zeroed vector. ++class z_vllez ++ : PatFrag<(ops node:$addr), ++ (z_vector_insert (z_vzero), ++ (scalartype (load node:$addr)), (i32 index))>; ++def z_vllezi8 : z_vllez; ++def z_vllezi16 : z_vllez; ++def z_vllezi32 : z_vllez; ++def z_vllezi64 : PatFrag<(ops node:$addr), ++ (z_join_dwords (i64 (load node:$addr)), (i64 0))>; ++// We use high merges to form a v4f32 from four f32s. Propagating zero ++// into all elements but index 1 gives this expression. ++def z_vllezf32 : PatFrag<(ops node:$addr), ++ (bitconvert ++ (z_merge_high ++ (v2i64 ++ (z_unpackl_high ++ (v4i32 ++ (bitconvert ++ (v4f32 (scalar_to_vector ++ (f32 (load node:$addr)))))))), ++ (v2i64 (z_vzero))))>; ++def z_vllezf64 : PatFrag<(ops node:$addr), ++ (z_merge_high ++ (scalar_to_vector (f64 (load node:$addr))), ++ (z_vzero))>; ++ ++// Store one element of a vector. ++class z_vste ++ : PatFrag<(ops node:$vec, node:$addr, node:$index), ++ (store (scalartype (z_vector_extract node:$vec, node:$index)), ++ node:$addr)>; ++def z_vstei8 : z_vste; ++def z_vstei16 : z_vste; ++def z_vstei32 : z_vste; ++def z_vstei64 : z_vste; ++def z_vstef32 : z_vste; ++def z_vstef64 : z_vste; ++ ++// Arithmetic negation on vectors. ++def z_vneg : PatFrag<(ops node:$x), (sub (z_vzero), node:$x)>; ++ ++// Bitwise negation on vectors. ++def z_vnot : PatFrag<(ops node:$x), (xor node:$x, (z_vones))>; ++ ++// Signed "integer greater than zero" on vectors. 
++def z_vicmph_zero : PatFrag<(ops node:$x), (z_vicmph node:$x, (z_vzero))>; ++ ++// Signed "integer less than zero" on vectors. ++def z_vicmpl_zero : PatFrag<(ops node:$x), (z_vicmph (z_vzero), node:$x)>; ++ ++// Integer absolute on vectors. ++class z_viabs ++ : PatFrag<(ops node:$src), ++ (xor (add node:$src, (z_vsra_by_scalar node:$src, (i32 shift))), ++ (z_vsra_by_scalar node:$src, (i32 shift)))>; ++def z_viabs8 : z_viabs<7>; ++def z_viabs16 : z_viabs<15>; ++def z_viabs32 : z_viabs<31>; ++def z_viabs64 : z_viabs<63>; ++ ++// Sign-extend the i64 elements of a vector. ++class z_vse ++ : PatFrag<(ops node:$src), ++ (z_vsra_by_scalar (z_vshl_by_scalar node:$src, shift), shift)>; ++def z_vsei8 : z_vse<56>; ++def z_vsei16 : z_vse<48>; ++def z_vsei32 : z_vse<32>; ++ ++// ...and again with the extensions being done on individual i64 scalars. ++class z_vse_by_parts ++ : PatFrag<(ops node:$src), ++ (z_join_dwords ++ (operator (z_vector_extract node:$src, index1)), ++ (operator (z_vector_extract node:$src, index2)))>; ++def z_vsei8_by_parts : z_vse_by_parts; ++def z_vsei16_by_parts : z_vse_by_parts; ++def z_vsei32_by_parts : z_vse_by_parts; +Index: llvm-36/lib/Target/SystemZ/SystemZPatterns.td +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZPatterns.td ++++ llvm-36/lib/Target/SystemZ/SystemZPatterns.td +@@ -153,3 +153,17 @@ multiclass CompareZeroFP; + } ++ ++// Use INSN for performing binary operation OPERATION of type VT ++// on registers of class CLS. ++class BinaryRRWithType ++ : Pat<(vt (operator cls:$x, cls:$y)), (insn cls:$x, cls:$y)>; ++ ++// Use INSN to perform conversion operation OPERATOR, with the input being ++// TR2 and the output being TR1. SUPPRESS is 4 to suppress inexact conditions ++// and 0 to allow them. MODE is the rounding mode to use. 
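A brief aside on the z_viabs patterns above: they spell out the classic branchless absolute value, (x + (x >> n)) ^ (x >> n), where the arithmetic shift smears the sign bit across the word. A hedged C++ rendering of the 64-bit case (not from the patch):

// S is all-ones when X is negative, all-zeros otherwise; adding S and then
// XOR-ing with S negates X exactly when it was negative.
inline long long viabs64(long long X) {
  long long S = X >> 63;  // arithmetic shift replicates the sign bit
  return (X + S) ^ S;
}

The vector classes do the same per element via z_vsra_by_scalar with shift amounts 7, 15, 31, and 63.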
++class FPConversion<Instruction insn, SDPatternOperator operator, TypedReg tr1,
++                   TypedReg tr2, bits<3> suppress, bits<4> mode>
++  : Pat<(tr1.vt (operator (tr2.vt tr2.op:$vec))),
++        (insn tr2.op:$vec, suppress, mode)>;
+Index: llvm-36/lib/Target/SystemZ/SystemZProcessors.td
+===================================================================
+--- llvm-36.orig/lib/Target/SystemZ/SystemZProcessors.td
++++ llvm-36/lib/Target/SystemZ/SystemZProcessors.td
+@@ -39,6 +39,11 @@ def FeatureFPExtension : SystemZFeature<
+   "Assume that the floating-point extension facility is installed"
+ >;
+
++def FeaturePopulationCount : SystemZFeature<
++  "population-count", "PopulationCount",
++  "Assume that the population-count facility is installed"
++>;
++
+ def FeatureFastSerialization : SystemZFeature<
+   "fast-serialization", "FastSerialization",
+   "Assume that the fast-serialization facility is installed"
+@@ -50,13 +55,42 @@ def FeatureInterlockedAccess1 : SystemZF
+ >;
+ def FeatureNoInterlockedAccess1 : SystemZMissingFeature<"InterlockedAccess1">;
+
++def FeatureMiscellaneousExtensions : SystemZFeature<
++  "miscellaneous-extensions", "MiscellaneousExtensions",
++  "Assume that the miscellaneous-extensions facility is installed"
++>;
++
++def FeatureTransactionalExecution : SystemZFeature<
++  "transactional-execution", "TransactionalExecution",
++  "Assume that the transactional-execution facility is installed"
++>;
++
++def FeatureProcessorAssist : SystemZFeature<
++  "processor-assist", "ProcessorAssist",
++  "Assume that the processor-assist facility is installed"
++>;
++
++def FeatureVector : SystemZFeature<
++  "vector", "Vector",
++  "Assume that the vector facility is installed"
++>;
++def FeatureNoVector : SystemZMissingFeature<"Vector">;
++
+ def : Processor<"generic", NoItineraries, []>;
+ def : Processor<"z10", NoItineraries, []>;
+ def : Processor<"z196", NoItineraries,
+                 [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
+-                 FeatureFPExtension, FeatureFastSerialization,
+-                 FeatureInterlockedAccess1]>;
++                 FeatureFPExtension, FeaturePopulationCount,
++                 FeatureFastSerialization, FeatureInterlockedAccess1]>;
+ def : Processor<"zEC12", NoItineraries,
+                 [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
+-                 FeatureFPExtension, FeatureFastSerialization,
+-                 FeatureInterlockedAccess1]>;
++                 FeatureFPExtension, FeaturePopulationCount,
++                 FeatureFastSerialization, FeatureInterlockedAccess1,
++                 FeatureMiscellaneousExtensions,
++                 FeatureTransactionalExecution, FeatureProcessorAssist]>;
++def : Processor<"z13", NoItineraries,
++                [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
++                 FeatureFPExtension, FeaturePopulationCount,
++                 FeatureFastSerialization, FeatureInterlockedAccess1,
++                 FeatureTransactionalExecution, FeatureProcessorAssist,
++                 FeatureVector]>;
+Index: llvm-36/lib/Target/SystemZ/SystemZRegisterInfo.td
+===================================================================
+--- llvm-36.orig/lib/Target/SystemZ/SystemZRegisterInfo.td
++++ llvm-36/lib/Target/SystemZ/SystemZRegisterInfo.td
+@@ -25,20 +25,24 @@ def subreg_l32 : SubRegIndex<32, 0>;
+ def subreg_h32 : SubRegIndex<32, 32>; // Also acts as subreg_lh32.
+ def subreg_l64 : SubRegIndex<64, 0>;
+ def subreg_h64 : SubRegIndex<64, 64>;
++def subreg_r32 : SubRegIndex<32, 32>; // Reinterpret a wider reg as 32 bits.
++def subreg_r64 : SubRegIndex<64, 64>; // Reinterpret a wider reg as 64 bits.
+ def subreg_hh32 : ComposedSubRegIndex; + def subreg_hl32 : ComposedSubRegIndex; ++def subreg_hr32 : ComposedSubRegIndex; + } + +-// Define a register class that contains values of type TYPE and an ++// Define a register class that contains values of types TYPES and an + // associated operand called NAME. SIZE is the size and alignment + // of the registers and REGLIST is the list of individual registers. +-multiclass SystemZRegClass { ++multiclass SystemZRegClass types, int size, ++ dag regList> { + def AsmOperand : AsmOperandClass { + let Name = name; + let ParserMethod = "parse"##name; + let RenderMethod = "addRegOperands"; + } +- def Bit : RegisterClass<"SystemZ", [type], size, regList> { ++ def Bit : RegisterClass<"SystemZ", types, size, regList> { + let Size = size; + } + def "" : RegisterOperand(name##"Bit")> { +@@ -84,16 +88,19 @@ foreach I = [0, 2, 4, 6, 8, 10, 12, 14] + + /// Allocate the callee-saved R6-R13 backwards. That way they can be saved + /// together with R14 and R15 in one prolog instruction. +-defm GR32 : SystemZRegClass<"GR32", i32, 32, (add (sequence "R%uL", 0, 5), +- (sequence "R%uL", 15, 6))>; +-defm GRH32 : SystemZRegClass<"GRH32", i32, 32, (add (sequence "R%uH", 0, 5), +- (sequence "R%uH", 15, 6))>; +-defm GR64 : SystemZRegClass<"GR64", i64, 64, (add (sequence "R%uD", 0, 5), +- (sequence "R%uD", 15, 6))>; ++defm GR32 : SystemZRegClass<"GR32", [i32], 32, ++ (add (sequence "R%uL", 0, 5), ++ (sequence "R%uL", 15, 6))>; ++defm GRH32 : SystemZRegClass<"GRH32", [i32], 32, ++ (add (sequence "R%uH", 0, 5), ++ (sequence "R%uH", 15, 6))>; ++defm GR64 : SystemZRegClass<"GR64", [i64], 64, ++ (add (sequence "R%uD", 0, 5), ++ (sequence "R%uD", 15, 6))>; + + // Combine the low and high GR32s into a single class. This can only be + // used for virtual registers if the high-word facility is available. +-defm GRX32 : SystemZRegClass<"GRX32", i32, 32, ++defm GRX32 : SystemZRegClass<"GRX32", [i32], 32, + (add (sequence "R%uL", 0, 5), + (sequence "R%uH", 0, 5), + R15L, R15H, R14L, R14H, R13L, R13H, +@@ -102,18 +109,17 @@ defm GRX32 : SystemZRegClass<"GRX32", i3 + + // The architecture doesn't really have any i128 support, so model the + // register pairs as untyped instead. +-defm GR128 : SystemZRegClass<"GR128", untyped, 128, (add R0Q, R2Q, R4Q, +- R12Q, R10Q, R8Q, R6Q, +- R14Q)>; ++defm GR128 : SystemZRegClass<"GR128", [untyped], 128, ++ (add R0Q, R2Q, R4Q, R12Q, R10Q, R8Q, R6Q, R14Q)>; + + // Base and index registers. Everything except R0, which in an address + // context evaluates as 0. +-defm ADDR32 : SystemZRegClass<"ADDR32", i32, 32, (sub GR32Bit, R0L)>; +-defm ADDR64 : SystemZRegClass<"ADDR64", i64, 64, (sub GR64Bit, R0D)>; ++defm ADDR32 : SystemZRegClass<"ADDR32", [i32], 32, (sub GR32Bit, R0L)>; ++defm ADDR64 : SystemZRegClass<"ADDR64", [i64], 64, (sub GR64Bit, R0D)>; + + // Not used directly, but needs to exist for ADDR32 and ADDR64 subregs + // of a GR128. 
+-defm ADDR128 : SystemZRegClass<"ADDR128", untyped, 128, (sub GR128Bit, R0Q)>; ++defm ADDR128 : SystemZRegClass<"ADDR128", [untyped], 128, (sub GR128Bit, R0Q)>; + + //===----------------------------------------------------------------------===// + // Floating-point registers +@@ -142,16 +148,36 @@ def F11Dwarf : DwarfMapping<29>; + def F13Dwarf : DwarfMapping<30>; + def F15Dwarf : DwarfMapping<31>; + +-// Lower 32 bits of one of the 16 64-bit floating-point registers ++def F16Dwarf : DwarfMapping<68>; ++def F18Dwarf : DwarfMapping<69>; ++def F20Dwarf : DwarfMapping<70>; ++def F22Dwarf : DwarfMapping<71>; ++ ++def F17Dwarf : DwarfMapping<72>; ++def F19Dwarf : DwarfMapping<73>; ++def F21Dwarf : DwarfMapping<74>; ++def F23Dwarf : DwarfMapping<75>; ++ ++def F24Dwarf : DwarfMapping<76>; ++def F26Dwarf : DwarfMapping<77>; ++def F28Dwarf : DwarfMapping<78>; ++def F30Dwarf : DwarfMapping<79>; ++ ++def F25Dwarf : DwarfMapping<80>; ++def F27Dwarf : DwarfMapping<81>; ++def F29Dwarf : DwarfMapping<82>; ++def F31Dwarf : DwarfMapping<83>; ++ ++// Upper 32 bits of one of the floating-point registers + class FPR32 num, string n> : SystemZReg { + let HWEncoding = num; + } + +-// One of the 16 64-bit floating-point registers +-class FPR64 num, string n, FPR32 low> +- : SystemZRegWithSubregs { ++// One of the floating-point registers. ++class FPR64 num, string n, FPR32 high> ++ : SystemZRegWithSubregs { + let HWEncoding = num; +- let SubRegIndices = [subreg_h32]; ++ let SubRegIndices = [subreg_r32]; + } + + // 8 pairs of FPR64s, with a one-register gap inbetween. +@@ -161,12 +187,17 @@ class FPR128 num, string n, FPR + let SubRegIndices = [subreg_l64, subreg_h64]; + } + +-// Floating-point registers ++// Floating-point registers. Registers 16-31 require the vector facility. + foreach I = 0-15 in { + def F#I#S : FPR32; + def F#I#D : FPR64("F"#I#"S")>, + DwarfRegNum<[!cast("F"#I#"Dwarf").Id]>; + } ++foreach I = 16-31 in { ++ def F#I#S : FPR32; ++ def F#I#D : FPR64("F"#I#"S")>, ++ DwarfRegNum<[!cast("F"#I#"Dwarf").Id]>; ++} + + foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in { + def F#I#Q : FPR128("F"#!add(I, 2)#"D"), +@@ -175,10 +206,74 @@ foreach I = [0, 1, 4, 5, 8, 9, 12, 13] i + + // There's no store-multiple instruction for FPRs, so we're not fussy + // about the order in which call-saved registers are allocated. +-defm FP32 : SystemZRegClass<"FP32", f32, 32, (sequence "F%uS", 0, 15)>; +-defm FP64 : SystemZRegClass<"FP64", f64, 64, (sequence "F%uD", 0, 15)>; +-defm FP128 : SystemZRegClass<"FP128", f128, 128, (add F0Q, F1Q, F4Q, F5Q, +- F8Q, F9Q, F12Q, F13Q)>; ++defm FP32 : SystemZRegClass<"FP32", [f32], 32, (sequence "F%uS", 0, 15)>; ++defm FP64 : SystemZRegClass<"FP64", [f64], 64, (sequence "F%uD", 0, 15)>; ++defm FP128 : SystemZRegClass<"FP128", [f128], 128, ++ (add F0Q, F1Q, F4Q, F5Q, F8Q, F9Q, F12Q, F13Q)>; ++ ++//===----------------------------------------------------------------------===// ++// Vector registers ++//===----------------------------------------------------------------------===// ++ ++// A full 128-bit vector register, with an FPR64 as its high part. ++class VR128 num, string n, FPR64 high> ++ : SystemZRegWithSubregs { ++ let HWEncoding = num; ++ let SubRegIndices = [subreg_r64]; ++} ++ ++// Full vector registers. ++foreach I = 0-31 in { ++ def V#I : VR128("F"#I#"D")>, ++ DwarfRegNum<[!cast("F"#I#"Dwarf").Id]>; ++} ++ ++// Class used to store 32-bit values in the first element of a vector ++// register. f32 scalars are used for the WLEDB and WLDEB instructions. 
++defm VR32 : SystemZRegClass<"VR32", [f32, v4i8, v2i16], 32, ++ (add (sequence "F%uS", 0, 7), ++ (sequence "F%uS", 16, 31), ++ (sequence "F%uS", 8, 15))>; ++ ++// Class used to store 64-bit values in the upper half of a vector register. ++// The vector facility also includes scalar f64 instructions that operate ++// on the full vector register set. ++defm VR64 : SystemZRegClass<"VR64", [f64, v8i8, v4i16, v2i32, v2f32], 64, ++ (add (sequence "F%uD", 0, 7), ++ (sequence "F%uD", 16, 31), ++ (sequence "F%uD", 8, 15))>; ++ ++// The subset of vector registers that can be used for floating-point ++// operations too. ++defm VF128 : SystemZRegClass<"VF128", ++ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, ++ (sequence "V%u", 0, 15)>; ++ ++// All vector registers. ++defm VR128 : SystemZRegClass<"VR128", ++ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, ++ (add (sequence "V%u", 0, 7), ++ (sequence "V%u", 16, 31), ++ (sequence "V%u", 8, 15))>; ++ ++// Attaches a ValueType to a register operand, to make the instruction ++// definitions easier. ++class TypedReg { ++ ValueType vt = vtin; ++ RegisterOperand op = opin; ++} ++ ++def v32eb : TypedReg; ++def v64g : TypedReg; ++def v64db : TypedReg; ++def v128b : TypedReg; ++def v128h : TypedReg; ++def v128f : TypedReg; ++def v128g : TypedReg; ++def v128q : TypedReg; ++def v128eb : TypedReg; ++def v128db : TypedReg; ++def v128any : TypedReg; + + //===----------------------------------------------------------------------===// + // Other registers +Index: llvm-36/lib/Target/SystemZ/SystemZShortenInst.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZShortenInst.cpp ++++ llvm-36/lib/Target/SystemZ/SystemZShortenInst.cpp +@@ -15,6 +15,7 @@ + + #include "SystemZTargetMachine.h" + #include "llvm/CodeGen/MachineFunctionPass.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" + + using namespace llvm; + +@@ -36,6 +37,10 @@ public: + private: + bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther, + unsigned LLIxL, unsigned LLIxH); ++ bool shortenOn0(MachineInstr &MI, unsigned Opcode); ++ bool shortenOn01(MachineInstr &MI, unsigned Opcode); ++ bool shortenOn001(MachineInstr &MI, unsigned Opcode); ++ bool shortenFPConv(MachineInstr &MI, unsigned Opcode); + + const SystemZInstrInfo *TII; + +@@ -97,6 +102,64 @@ bool SystemZShortenInst::shortenIIF(Mach + return false; + } + ++// Change MI's opcode to Opcode if register operand 0 has a 4-bit encoding. ++bool SystemZShortenInst::shortenOn0(MachineInstr &MI, unsigned Opcode) { ++ if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16) { ++ MI.setDesc(TII->get(Opcode)); ++ return true; ++ } ++ return false; ++} ++ ++// Change MI's opcode to Opcode if register operands 0 and 1 have a ++// 4-bit encoding. ++bool SystemZShortenInst::shortenOn01(MachineInstr &MI, unsigned Opcode) { ++ if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 && ++ SystemZMC::getFirstReg(MI.getOperand(1).getReg()) < 16) { ++ MI.setDesc(TII->get(Opcode)); ++ return true; ++ } ++ return false; ++} ++ ++// Change MI's opcode to Opcode if register operands 0, 1 and 2 have a ++// 4-bit encoding and if operands 0 and 1 are tied. 
++bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) {
++  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
++      MI.getOperand(1).getReg() == MI.getOperand(0).getReg() &&
++      SystemZMC::getFirstReg(MI.getOperand(2).getReg()) < 16) {
++    MI.setDesc(TII->get(Opcode));
++    return true;
++  }
++  return false;
++}
++
++// MI is a vector-style conversion instruction with the operand order:
++// destination, source, exact-suppress, rounding-mode. If both registers
++// have a 4-bit encoding then change it to Opcode, which has operand order:
++// destination, rounding-mode, source, exact-suppress.
++bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
++  if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 &&
++      SystemZMC::getFirstReg(MI.getOperand(1).getReg()) < 16) {
++    MachineOperand Dest(MI.getOperand(0));
++    MachineOperand Src(MI.getOperand(1));
++    MachineOperand Suppress(MI.getOperand(2));
++    MachineOperand Mode(MI.getOperand(3));
++    MI.RemoveOperand(3);
++    MI.RemoveOperand(2);
++    MI.RemoveOperand(1);
++    MI.RemoveOperand(0);
++    MI.setDesc(TII->get(Opcode));
++    MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
++        .addOperand(Dest)
++        .addOperand(Mode)
++        .addOperand(Src)
++        .addOperand(Suppress);
++    return true;
++  }
++  return false;
++}
++
+ // Process all instructions in MBB. Return true if something changed.
+ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
+   bool Changed = false;
+@@ -117,13 +180,83 @@ bool SystemZShortenInst::processBlock(Ma
+   // Iterate backwards through the block looking for instructions to change.
+   for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) {
+     MachineInstr &MI = *MBBI;
+-    unsigned Opcode = MI.getOpcode();
+-    if (Opcode == SystemZ::IILF)
++    switch (MI.getOpcode()) {
++    case SystemZ::IILF:
+       Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL,
+                             SystemZ::LLILH);
+-    else if (Opcode == SystemZ::IIHF)
++      break;
++
++    case SystemZ::IIHF:
+       Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL,
+                             SystemZ::LLIHH);
++      break;
++
++    case SystemZ::WFADB:
++      Changed |= shortenOn001(MI, SystemZ::ADBR);
++      break;
++
++    case SystemZ::WFDDB:
++      Changed |= shortenOn001(MI, SystemZ::DDBR);
++      break;
++
++    case SystemZ::WFIDB:
++      Changed |= shortenFPConv(MI, SystemZ::FIDBRA);
++      break;
++
++    case SystemZ::WLDEB:
++      Changed |= shortenOn01(MI, SystemZ::LDEBR);
++      break;
++
++    case SystemZ::WLEDB:
++      Changed |= shortenFPConv(MI, SystemZ::LEDBRA);
++      break;
++
++    case SystemZ::WFMDB:
++      Changed |= shortenOn001(MI, SystemZ::MDBR);
++      break;
++
++    case SystemZ::WFLCDB:
++      Changed |= shortenOn01(MI, SystemZ::LCDBR);
++      break;
++
++    case SystemZ::WFLNDB:
++      Changed |= shortenOn01(MI, SystemZ::LNDBR);
++      break;
++
++    case SystemZ::WFLPDB:
++      Changed |= shortenOn01(MI, SystemZ::LPDBR);
++      break;
++
++    case SystemZ::WFSQDB:
++      Changed |= shortenOn01(MI, SystemZ::SQDBR);
++      break;
++
++    case SystemZ::WFSDB:
++      Changed |= shortenOn001(MI, SystemZ::SDBR);
++      break;
++
++    case SystemZ::WFCDB:
++      Changed |= shortenOn01(MI, SystemZ::CDBR);
++      break;
++
++    case SystemZ::VL32:
++      // For z13 we prefer LDE over LE to avoid partial register dependencies.
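// Illustrative aside (not part of the patch): the reordering shortenFPConv
// performs, e.g. for WFIDB when both registers have 4-bit encodings:
//   before: WFIDB  %f1, %f2, suppress, mode   (dest, src, suppress, mode)
//   after:  FIDBRA %f1, mode, %f2, suppress   (dest, mode, src, suppress)
// The getFirstReg(...) < 16 checks above are what guarantee that the short
// non-vector form can encode the registers at all.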
++      Changed |= shortenOn0(MI, SystemZ::LDE32);
++      break;
++
++    case SystemZ::VST32:
++      Changed |= shortenOn0(MI, SystemZ::STE);
++      break;
++
++    case SystemZ::VL64:
++      Changed |= shortenOn0(MI, SystemZ::LD);
++      break;
++
++    case SystemZ::VST64:
++      Changed |= shortenOn0(MI, SystemZ::STD);
++      break;
++    }
++
+     unsigned UsedLow = 0;
+     unsigned UsedHigh = 0;
+     for (auto MOI = MI.operands_begin(), MOE = MI.operands_end();
+Index: llvm-36/lib/Target/SystemZ/SystemZSubtarget.cpp
+===================================================================
+--- llvm-36.orig/lib/Target/SystemZ/SystemZSubtarget.cpp
++++ llvm-36/lib/Target/SystemZ/SystemZSubtarget.cpp
+@@ -10,7 +10,6 @@
+ #include "SystemZSubtarget.h"
+ #include "MCTargetDesc/SystemZMCTargetDesc.h"
+ #include "llvm/IR/GlobalValue.h"
+-#include "llvm/Support/Host.h"
+
+ using namespace llvm;
+
+@@ -23,15 +22,69 @@ using namespace llvm;
+
+ // Pin the vtable to this file.
+ void SystemZSubtarget::anchor() {}
+
++// Determine whether we use the vector ABI.
++static bool UsesVectorABI(StringRef CPU, StringRef FS) {
++  // We use the vector ABI whenever the vector facility is available.
++  // This is the case by default if CPU is z13 or later, and can be
++  // overridden via "[+-]vector" feature string elements.
++  bool VectorABI = true;
++  if (CPU.empty() || CPU == "generic" ||
++      CPU == "z10" || CPU == "z196" || CPU == "zEC12")
++    VectorABI = false;
++
++  SmallVector<StringRef, 8> Features;
++  FS.split(Features, ",", -1, false /* KeepEmpty */);
++  for (auto &Feature : Features) {
++    if (Feature == "vector" || Feature == "+vector")
++      VectorABI = true;
++    if (Feature == "-vector")
++      VectorABI = false;
++  }
++
++  return VectorABI;
++}
++
++static std::string computeDataLayout(StringRef TT, StringRef CPU,
++                                     StringRef FS) {
++  const Triple Triple(TT);
++  bool VectorABI = UsesVectorABI(CPU, FS);
++  std::string Ret = "";
++
++  // Big endian.
++  Ret += "E";
++
++  // Data mangling.
++  Ret += DataLayout::getManglingComponent(Triple);
++
++  // Make sure that global data has at least 16 bits of alignment by
++  // default, so that we can refer to it using LARL. We don't have any
++  // special requirements for stack variables though.
++  Ret += "-i1:8:16-i8:8:16";
++
++  // 64-bit integers are naturally aligned.
++  Ret += "-i64:64";
++
++  // 128-bit floats are aligned only to 64 bits.
++  Ret += "-f128:64";
++
++  // When using the vector ABI, 128-bit vectors are also aligned to 64 bits.
++  if (VectorABI)
++    Ret += "-v128:64";
++
++  // We prefer 16 bits of alignment for all globals; see above.
++  Ret += "-a:8:16";
++
++  // Integer registers are 32 or 64 bits.
++  Ret += "-n32:64";
++
++  return Ret;
++}
++
+ SystemZSubtarget &
+ SystemZSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
+   std::string CPUName = CPU;
+   if (CPUName.empty())
+     CPUName = "generic";
+-#if defined(__linux__) && defined(__s390x__)
+-  if (CPUName == "generic")
+-    CPUName = sys::getHostCPUName();
+-#endif
+   // Parse features string.
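Following computeDataLayout above step by step, the string it builds for an s390x ELF triple with the vector ABI in effect is, for illustration:

// Derived by hand from the concatenation above (a sketch, not in the patch):
const char *VectorABIDataLayout =
    "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64";

Without the vector facility the "-v128:64" component is simply absent, reproducing the hard-coded layout string that the next hunk removes from the constructor.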
+ ParseSubtargetFeatures(CPUName, FS); + return *this; +@@ -43,12 +96,12 @@ SystemZSubtarget::SystemZSubtarget(const + const TargetMachine &TM) + : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), + HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false), +- HasFastSerialization(false), HasInterlockedAccess1(false), ++ HasPopulationCount(false), HasFastSerialization(false), ++ HasInterlockedAccess1(false), HasMiscellaneousExtensions(false), ++ HasTransactionalExecution(false), HasProcessorAssist(false), ++ HasVector(false), + TargetTriple(TT), +- // Make sure that global data has at least 16 bits of alignment by +- // default, so that we can refer to it using LARL. We don't have any +- // special requirements for stack variables though. +- DL("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64"), ++ DL(computeDataLayout(TT, CPU, FS)), + InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM), + TSInfo(DL), FrameLowering() {} + +Index: llvm-36/lib/Target/SystemZ/SystemZSubtarget.h +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZSubtarget.h ++++ llvm-36/lib/Target/SystemZ/SystemZSubtarget.h +@@ -38,8 +38,13 @@ protected: + bool HasLoadStoreOnCond; + bool HasHighWord; + bool HasFPExtension; ++ bool HasPopulationCount; + bool HasFastSerialization; + bool HasInterlockedAccess1; ++ bool HasMiscellaneousExtensions; ++ bool HasTransactionalExecution; ++ bool HasProcessorAssist; ++ bool HasVector; + + private: + Triple TargetTriple; +@@ -88,12 +93,29 @@ public: + // Return true if the target has the floating-point extension facility. + bool hasFPExtension() const { return HasFPExtension; } + ++ // Return true if the target has the population-count facility. ++ bool hasPopulationCount() const { return HasPopulationCount; } ++ + // Return true if the target has the fast-serialization facility. + bool hasFastSerialization() const { return HasFastSerialization; } + + // Return true if the target has interlocked-access facility 1. + bool hasInterlockedAccess1() const { return HasInterlockedAccess1; } + ++ // Return true if the target has the miscellaneous-extensions facility. ++ bool hasMiscellaneousExtensions() const { ++ return HasMiscellaneousExtensions; ++ } ++ ++ // Return true if the target has the transactional-execution facility. ++ bool hasTransactionalExecution() const { return HasTransactionalExecution; } ++ ++ // Return true if the target has the processor-assist facility. ++ bool hasProcessorAssist() const { return HasProcessorAssist; } ++ ++ // Return true if the target has the vector facility. ++ bool hasVector() const { return HasVector; } ++ + // Return true if GV can be accessed using LARL for reloc model RM + // and code model CM. 
+ bool isPC32DBLSymbol(const GlobalValue *GV, Reloc::Model RM, +Index: llvm-36/lib/Target/SystemZ/SystemZTargetMachine.cpp +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZTargetMachine.cpp ++++ llvm-36/lib/Target/SystemZ/SystemZTargetMachine.cpp +@@ -9,6 +9,7 @@ + + #include "SystemZTargetMachine.h" + #include "llvm/CodeGen/Passes.h" ++#include "llvm/PassManager.h" + #include "llvm/Support/TargetRegistry.h" + #include "llvm/Transforms/Scalar.h" + #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +@@ -57,6 +58,10 @@ void SystemZPassConfig::addIRPasses() { + + bool SystemZPassConfig::addInstSelector() { + addPass(createSystemZISelDag(getSystemZTargetMachine(), getOptLevel())); ++ ++ if (getOptLevel() != CodeGenOpt::None) ++ addPass(createSystemZLDCleanupPass(getSystemZTargetMachine())); ++ + return false; + } + +@@ -100,3 +105,12 @@ void SystemZPassConfig::addPreEmitPass() + TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { + return new SystemZPassConfig(this, PM); + } ++ ++void SystemZTargetMachine::addAnalysisPasses(PassManagerBase &PM) { ++ // Add first the target-independent BasicTTI pass, then our SystemZ pass. ++ // This allows the SystemZ pass to delegate to the target independent layer ++ // when appropriate. ++ PM.add(createBasicTargetTransformInfoPass(this)); ++ PM.add(createSystemZTargetTransformInfoPass(this)); ++} ++ +Index: llvm-36/lib/Target/SystemZ/SystemZTargetMachine.h +=================================================================== +--- llvm-36.orig/lib/Target/SystemZ/SystemZTargetMachine.h ++++ llvm-36/lib/Target/SystemZ/SystemZTargetMachine.h +@@ -39,6 +39,7 @@ public: + } + // Override LLVMTargetMachine + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; ++ void addAnalysisPasses(PassManagerBase &PM) override; + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF.get(); + } +Index: llvm-36/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +=================================================================== +--- /dev/null ++++ llvm-36/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +@@ -0,0 +1,334 @@ ++//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file implements a TargetTransformInfo analysis pass specific to the ++// SystemZ target machine. It uses the target's detailed information to provide ++// more precise answers to certain TTI queries, while letting the target ++// independent and default TTI implementations handle the rest. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "SystemZTargetMachine.h" ++#include "llvm/Analysis/TargetTransformInfo.h" ++#include "llvm/IR/IntrinsicInst.h" ++#include "llvm/Support/Debug.h" ++#include "llvm/Target/CostTable.h" ++#include "llvm/Target/TargetLowering.h" ++using namespace llvm; ++ ++#define DEBUG_TYPE "systemztti" ++ ++// Declare the pass initialization routine locally as target-specific passes ++// don't have a target-wide initialization entry point, and so we rely on the ++// pass constructor initialization. 
++namespace llvm { ++void initializeSystemZTTIPass(PassRegistry &); ++} ++ ++namespace { ++ ++class SystemZTTI : public ImmutablePass, public TargetTransformInfo { ++ const SystemZSubtarget *ST; ++ const SystemZTargetLowering *TLI; ++ ++public: ++ SystemZTTI() : ImmutablePass(ID), ST(0), TLI(0) { ++ llvm_unreachable("This pass cannot be directly constructed"); ++ } ++ ++ SystemZTTI(const SystemZTargetMachine *TM) ++ : ImmutablePass(ID), ST(TM->getSubtargetImpl()), ++ TLI(TM->getSubtargetImpl()->getTargetLowering()) { ++ initializeSystemZTTIPass(*PassRegistry::getPassRegistry()); ++ } ++ ++ void initializePass() override { ++ pushTTIStack(this); ++ } ++ ++ void getAnalysisUsage(AnalysisUsage &AU) const override { ++ TargetTransformInfo::getAnalysisUsage(AU); ++ } ++ ++ // Pass identification. ++ static char ID; ++ ++ // Provide necessary pointer adjustments for the two base classes. ++ void *getAdjustedAnalysisPointer(const void *ID) override { ++ if (ID == &TargetTransformInfo::ID) ++ return (TargetTransformInfo*)this; ++ return this; ++ } ++ ++ /// \name Scalar TTI Implementations ++ /// @{ ++ ++ unsigned getIntImmCost(const APInt &Imm, Type *Ty); ++ ++ unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, ++ Type *Ty); ++ unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, ++ Type *Ty); ++ ++ PopcntSupportKind getPopcntSupport(unsigned TyWidth); ++ ++ /// @} ++ ++ /// \name Vector TTI Implementations ++ /// @{ ++ ++ unsigned getNumberOfRegisters(bool Vector); ++ unsigned getRegisterBitWidth(bool Vector); ++ ++ /// @} ++}; ++ ++} // end anonymous namespace ++ ++INITIALIZE_AG_PASS(SystemZTTI, TargetTransformInfo, "systemztti", ++ "SystemZ Target Transform Info", true, true, false) ++char SystemZTTI::ID = 0; ++ ++ImmutablePass * ++llvm::createSystemZTargetTransformInfoPass(const SystemZTargetMachine *TM) { ++ return new SystemZTTI(TM); ++} ++ ++ ++//===----------------------------------------------------------------------===// ++// ++// SystemZ cost model. ++// ++//===----------------------------------------------------------------------===// ++ ++unsigned SystemZTTI::getIntImmCost(const APInt &Imm, Type *Ty) { ++ assert(Ty->isIntegerTy()); ++ ++ unsigned BitSize = Ty->getPrimitiveSizeInBits(); ++ // There is no cost model for constants with a bit size of 0. Return TCC_Free ++ // here, so that constant hoisting will ignore this constant. ++ if (BitSize == 0) ++ return TCC_Free; ++ // No cost model for operations on integers larger than 64 bit implemented yet. ++ if (BitSize > 64) ++ return TCC_Free; ++ ++ if (Imm == 0) ++ return TCC_Free; ++ ++ if (Imm.getBitWidth() <= 64) { ++ // Constants loaded via lgfi. ++ if (isInt<32>(Imm.getSExtValue())) ++ return TCC_Basic; ++ // Constants loaded via llilf. ++ if (isUInt<32>(Imm.getZExtValue())) ++ return TCC_Basic; ++ // Constants loaded via llihf: ++ if ((Imm.getZExtValue() & 0xffffffff) == 0) ++ return TCC_Basic; ++ ++ return 2 * TCC_Basic; ++ } ++ ++ return 4 * TCC_Basic; ++} ++ ++unsigned SystemZTTI::getIntImmCost(unsigned Opcode, unsigned Idx, ++ const APInt &Imm, Type *Ty) { ++ assert(Ty->isIntegerTy()); ++ ++ unsigned BitSize = Ty->getPrimitiveSizeInBits(); ++ // There is no cost model for constants with a bit size of 0. Return TCC_Free ++ // here, so that constant hoisting will ignore this constant. ++ if (BitSize == 0) ++ return TCC_Free; ++ // No cost model for operations on integers larger than 64 bit implemented yet. 
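To make the tiers above concrete, a few classifications that follow from the branches of getIntImmCost (worked examples, not from the patch):

// 0x000000007fffffff -> TCC_Basic      (fits lgfi: signed 32-bit)
// 0x00000000fffffffe -> TCC_Basic      (fits llilf: unsigned 32-bit)
// 0x7fffffff00000000 -> TCC_Basic      (fits llihf: low 32 bits all zero)
// 0x0000000123456789 -> 2 * TCC_Basic  (needs a two-instruction sequence)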
++ if (BitSize > 64) ++ return TCC_Free; ++ ++ switch (Opcode) { ++ default: ++ return TCC_Free; ++ case Instruction::GetElementPtr: ++ // Always hoist the base address of a GetElementPtr. This prevents the ++ // creation of new constants for every base constant that gets constant ++ // folded with the offset. ++ if (Idx == 0) ++ return 2 * TCC_Basic; ++ return TCC_Free; ++ case Instruction::Store: ++ if (Idx == 0 && Imm.getBitWidth() <= 64) { ++ // Any 8-bit immediate store can by implemented via mvi. ++ if (BitSize == 8) ++ return TCC_Free; ++ // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi. ++ if (isInt<16>(Imm.getSExtValue())) ++ return TCC_Free; ++ } ++ break; ++ case Instruction::ICmp: ++ if (Idx == 1 && Imm.getBitWidth() <= 64) { ++ // Comparisons against signed 32-bit immediates implemented via cgfi. ++ if (isInt<32>(Imm.getSExtValue())) ++ return TCC_Free; ++ // Comparisons against unsigned 32-bit immediates implemented via clgfi. ++ if (isUInt<32>(Imm.getZExtValue())) ++ return TCC_Free; ++ } ++ break; ++ case Instruction::Add: ++ case Instruction::Sub: ++ if (Idx == 1 && Imm.getBitWidth() <= 64) { ++ // We use algfi/slgfi to add/subtract 32-bit unsigned immediates. ++ if (isUInt<32>(Imm.getZExtValue())) ++ return TCC_Free; ++ // Or their negation, by swapping addition vs. subtraction. ++ if (isUInt<32>(-Imm.getSExtValue())) ++ return TCC_Free; ++ } ++ break; ++ case Instruction::Mul: ++ if (Idx == 1 && Imm.getBitWidth() <= 64) { ++ // We use msgfi to multiply by 32-bit signed immediates. ++ if (isInt<32>(Imm.getSExtValue())) ++ return TCC_Free; ++ } ++ break; ++ case Instruction::Or: ++ case Instruction::Xor: ++ if (Idx == 1 && Imm.getBitWidth() <= 64) { ++ // Masks supported by oilf/xilf. ++ if (isUInt<32>(Imm.getZExtValue())) ++ return TCC_Free; ++ // Masks supported by oihf/xihf. ++ if ((Imm.getZExtValue() & 0xffffffff) == 0) ++ return TCC_Free; ++ } ++ break; ++ case Instruction::And: ++ if (Idx == 1 && Imm.getBitWidth() <= 64) { ++ // Any 32-bit AND operation can by implemented via nilf. ++ if (BitSize <= 32) ++ return TCC_Free; ++ // 64-bit masks supported by nilf. ++ if (isUInt<32>(~Imm.getZExtValue())) ++ return TCC_Free; ++ // 64-bit masks supported by nilh. ++ if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff) ++ return TCC_Free; ++ // Some 64-bit AND operations can be implemented via risbg. ++ const SystemZInstrInfo *TII = ST->getInstrInfo(); ++ unsigned Start, End; ++ if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End)) ++ return TCC_Free; ++ } ++ break; ++ case Instruction::Shl: ++ case Instruction::LShr: ++ case Instruction::AShr: ++ // Always return TCC_Free for the shift value of a shift instruction. ++ if (Idx == 1) ++ return TCC_Free; ++ break; ++ case Instruction::UDiv: ++ case Instruction::SDiv: ++ case Instruction::URem: ++ case Instruction::SRem: ++ case Instruction::Trunc: ++ case Instruction::ZExt: ++ case Instruction::SExt: ++ case Instruction::IntToPtr: ++ case Instruction::PtrToInt: ++ case Instruction::BitCast: ++ case Instruction::PHI: ++ case Instruction::Call: ++ case Instruction::Select: ++ case Instruction::Ret: ++ case Instruction::Load: ++ break; ++ } ++ ++ return SystemZTTI::getIntImmCost(Imm, Ty); ++} ++ ++unsigned SystemZTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, ++ const APInt &Imm, Type *Ty) { ++ assert(Ty->isIntegerTy()); ++ ++ unsigned BitSize = Ty->getPrimitiveSizeInBits(); ++ // There is no cost model for constants with a bit size of 0. 
Return TCC_Free ++ // here, so that constant hoisting will ignore this constant. ++ if (BitSize == 0) ++ return TCC_Free; ++ // No cost model for operations on integers larger than 64 bit implemented yet. ++ if (BitSize > 64) ++ return TCC_Free; ++ ++ switch (IID) { ++ default: ++ return TCC_Free; ++ case Intrinsic::sadd_with_overflow: ++ case Intrinsic::uadd_with_overflow: ++ case Intrinsic::ssub_with_overflow: ++ case Intrinsic::usub_with_overflow: ++ // These get expanded to include a normal addition/subtraction. ++ if (Idx == 1 && Imm.getBitWidth() <= 64) { ++ if (isUInt<32>(Imm.getZExtValue())) ++ return TCC_Free; ++ if (isUInt<32>(-Imm.getSExtValue())) ++ return TCC_Free; ++ } ++ break; ++ case Intrinsic::smul_with_overflow: ++ case Intrinsic::umul_with_overflow: ++ // These get expanded to include a normal multiplication. ++ if (Idx == 1 && Imm.getBitWidth() <= 64) { ++ if (isInt<32>(Imm.getSExtValue())) ++ return TCC_Free; ++ } ++ break; ++ case Intrinsic::experimental_stackmap: ++ if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) ++ return TCC_Free; ++ break; ++ case Intrinsic::experimental_patchpoint_void: ++ case Intrinsic::experimental_patchpoint_i64: ++ if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) ++ return TCC_Free; ++ break; ++ } ++ return SystemZTTI::getIntImmCost(Imm, Ty); ++} ++ ++SystemZTTI::PopcntSupportKind ++SystemZTTI::getPopcntSupport(unsigned TyWidth) { ++ assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2"); ++ if (ST->hasPopulationCount() && TyWidth <= 64) ++ return PSK_FastHardware; ++ return PSK_Software; ++} ++ ++unsigned SystemZTTI::getNumberOfRegisters(bool Vector) { ++ if (!Vector) ++ // Discount the stack pointer. Also leave out %r0, since it can't ++ // be used in an address. 
++ return 14; ++ if (ST->hasVector()) ++ return 32; ++ return 0; ++} ++ ++unsigned SystemZTTI::getRegisterBitWidth(bool Vector) { ++ if (!Vector) ++ return 64; ++ if (ST->hasVector()) ++ return 128; ++ return 0; ++} ++ +Index: llvm-36/test/CodeGen/SystemZ/ctpop-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/ctpop-01.ll +@@ -0,0 +1,96 @@ ++; Test population-count instruction ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s ++ ++declare i32 @llvm.ctpop.i32(i32 %a) ++declare i64 @llvm.ctpop.i64(i64 %a) ++ ++define i32 @f1(i32 %a) { ++; CHECK-LABEL: f1: ++; CHECK: popcnt %r0, %r2 ++; CHECK: sllk %r1, %r0, 16 ++; CHECK: ar %r1, %r0 ++; CHECK: sllk %r2, %r1, 8 ++; CHECK: ar %r2, %r1 ++; CHECK: srl %r2, 24 ++; CHECK: br %r14 ++ ++ %popcnt = call i32 @llvm.ctpop.i32(i32 %a) ++ ret i32 %popcnt ++} ++ ++define i32 @f2(i32 %a) { ++; CHECK-LABEL: f2: ++; CHECK: llhr %r0, %r2 ++; CHECK: popcnt %r0, %r0 ++; CHECK: risblg %r2, %r0, 16, 151, 8 ++; CHECK: ar %r2, %r0 ++; CHECK: srl %r2, 8 ++; CHECK: br %r14 ++ %and = and i32 %a, 65535 ++ %popcnt = call i32 @llvm.ctpop.i32(i32 %and) ++ ret i32 %popcnt ++} ++ ++define i32 @f3(i32 %a) { ++; CHECK-LABEL: f3: ++; CHECK: llcr %r0, %r2 ++; CHECK: popcnt %r2, %r0 ++; CHECK: br %r14 ++ %and = and i32 %a, 255 ++ %popcnt = call i32 @llvm.ctpop.i32(i32 %and) ++ ret i32 %popcnt ++} ++ ++define i64 @f4(i64 %a) { ++; CHECK-LABEL: f4: ++; CHECK: popcnt %r0, %r2 ++; CHECK: sllg %r1, %r0, 32 ++; CHECK: agr %r1, %r0 ++; CHECK: sllg %r0, %r1, 16 ++; CHECK: agr %r0, %r1 ++; CHECK: sllg %r1, %r0, 8 ++; CHECK: agr %r1, %r0 ++; CHECK: srlg %r2, %r1, 56 ++; CHECK: br %r14 ++ %popcnt = call i64 @llvm.ctpop.i64(i64 %a) ++ ret i64 %popcnt ++} ++ ++define i64 @f5(i64 %a) { ++; CHECK-LABEL: f5: ++; CHECK: llgfr %r0, %r2 ++; CHECK: popcnt %r0, %r0 ++; CHECK: sllg %r1, %r0, 16 ++; CHECK: algfr %r0, %r1 ++; CHECK: sllg %r1, %r0, 8 ++; CHECK: algfr %r0, %r1 ++; CHECK: srlg %r2, %r0, 24 ++ %and = and i64 %a, 4294967295 ++ %popcnt = call i64 @llvm.ctpop.i64(i64 %and) ++ ret i64 %popcnt ++} ++ ++define i64 @f6(i64 %a) { ++; CHECK-LABEL: f6: ++; CHECK: llghr %r0, %r2 ++; CHECK: popcnt %r0, %r0 ++; CHECK: risbg %r1, %r0, 48, 183, 8 ++; CHECK: agr %r1, %r0 ++; CHECK: srlg %r2, %r1, 8 ++; CHECK: br %r14 ++ %and = and i64 %a, 65535 ++ %popcnt = call i64 @llvm.ctpop.i64(i64 %and) ++ ret i64 %popcnt ++} ++ ++define i64 @f7(i64 %a) { ++; CHECK-LABEL: f7: ++; CHECK: llgcr %r0, %r2 ++; CHECK: popcnt %r2, %r0 ++; CHECK: br %r14 ++ %and = and i64 %a, 255 ++ %popcnt = call i64 @llvm.ctpop.i64(i64 %and) ++ ret i64 %popcnt ++} ++ +Index: llvm-36/test/CodeGen/SystemZ/fp-abs-01.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-abs-01.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-abs-01.ll +@@ -1,6 +1,7 @@ + ; Test floating-point absolute. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + + ; Test f32. + declare float @llvm.fabs.f32(float %f) +Index: llvm-36/test/CodeGen/SystemZ/fp-abs-02.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-abs-02.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-abs-02.ll +@@ -1,6 +1,7 @@ + ; Test negated floating-point absolute. 
+ ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + + ; Test f32. + declare float @llvm.fabs.f32(float %f) +Index: llvm-36/test/CodeGen/SystemZ/fp-add-02.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-add-02.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-add-02.ll +@@ -1,7 +1,8 @@ + ; Test 64-bit floating-point addition. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +- ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + declare double @foo() + + ; Check register addition. +@@ -76,7 +77,7 @@ define double @f6(double %f1, double *%b + define double @f7(double *%ptr0) { + ; CHECK-LABEL: f7: + ; CHECK: brasl %r14, foo@PLT +-; CHECK: adb %f0, 160(%r15) ++; CHECK-SCALAR: adb %f0, 160(%r15) + ; CHECK: br %r14 + %ptr1 = getelementptr double *%ptr0, i64 2 + %ptr2 = getelementptr double *%ptr0, i64 4 +Index: llvm-36/test/CodeGen/SystemZ/fp-cmp-02.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-cmp-02.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-cmp-02.ll +@@ -1,7 +1,10 @@ + ; Test 64-bit floating-point comparison. The tests assume a z10 implementation + ; of select, using conditional branches rather than LOCGR. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s + + declare double @foo() + +@@ -9,8 +12,9 @@ declare double @foo() + define i64 @f1(i64 %a, i64 %b, double %f1, double %f2) { + ; CHECK-LABEL: f1: + ; CHECK: cdbr %f0, %f2 +-; CHECK-NEXT: je +-; CHECK: lgr %r2, %r3 ++; CHECK-SCALAR-NEXT: je ++; CHECK-SCALAR: lgr %r2, %r3 ++; CHECK-VECTOR-NEXT: locgrne %r2, %r3 + ; CHECK: br %r14 + %cond = fcmp oeq double %f1, %f2 + %res = select i1 %cond, i64 %a, i64 %b +@@ -21,8 +25,9 @@ define i64 @f1(i64 %a, i64 %b, double %f + define i64 @f2(i64 %a, i64 %b, double %f1, double *%ptr) { + ; CHECK-LABEL: f2: + ; CHECK: cdb %f0, 0(%r4) +-; CHECK-NEXT: je +-; CHECK: lgr %r2, %r3 ++; CHECK-SCALAR-NEXT: je ++; CHECK-SCALAR: lgr %r2, %r3 ++; CHECK-VECTOR-NEXT: locgrne %r2, %r3 + ; CHECK: br %r14 + %f2 = load double *%ptr + %cond = fcmp oeq double %f1, %f2 +@@ -34,8 +39,9 @@ define i64 @f2(i64 %a, i64 %b, double %f + define i64 @f3(i64 %a, i64 %b, double %f1, double *%base) { + ; CHECK-LABEL: f3: + ; CHECK: cdb %f0, 4088(%r4) +-; CHECK-NEXT: je +-; CHECK: lgr %r2, %r3 ++; CHECK-SCALAR-NEXT: je ++; CHECK-SCALAR: lgr %r2, %r3 ++; CHECK-VECTOR-NEXT: locgrne %r2, %r3 + ; CHECK: br %r14 + %ptr = getelementptr double *%base, i64 511 + %f2 = load double *%ptr +@@ -50,8 +56,9 @@ define i64 @f4(i64 %a, i64 %b, double %f + ; CHECK-LABEL: f4: + ; CHECK: aghi %r4, 4096 + ; CHECK: cdb %f0, 0(%r4) +-; CHECK-NEXT: je +-; CHECK: lgr %r2, %r3 ++; CHECK-SCALAR-NEXT: je ++; CHECK-SCALAR: lgr %r2, %r3 ++; CHECK-VECTOR-NEXT: locgrne %r2, %r3 + ; CHECK: br %r14 + %ptr = getelementptr double *%base, i64 512 + %f2 = load double *%ptr +@@ -65,8 +72,9 @@ define i64 @f5(i64 %a, i64 %b, double %f + ; CHECK-LABEL: f5: + ; CHECK: aghi %r4, -8 + ; CHECK: 
cdb %f0, 0(%r4) +-; CHECK-NEXT: je +-; CHECK: lgr %r2, %r3 ++; CHECK-SCALAR-NEXT: je ++; CHECK-SCALAR: lgr %r2, %r3 ++; CHECK-VECTOR-NEXT: locgrne %r2, %r3 + ; CHECK: br %r14 + %ptr = getelementptr double *%base, i64 -1 + %f2 = load double *%ptr +@@ -80,8 +88,9 @@ define i64 @f6(i64 %a, i64 %b, double %f + ; CHECK-LABEL: f6: + ; CHECK: sllg %r1, %r5, 3 + ; CHECK: cdb %f0, 800(%r1,%r4) +-; CHECK-NEXT: je +-; CHECK: lgr %r2, %r3 ++; CHECK-SCALAR-NEXT: je ++; CHECK-SCALAR: lgr %r2, %r3 ++; CHECK-VECTOR-NEXT: locgrne %r2, %r3 + ; CHECK: br %r14 + %ptr1 = getelementptr double *%base, i64 %index + %ptr2 = getelementptr double *%ptr1, i64 100 +@@ -95,7 +104,7 @@ define i64 @f6(i64 %a, i64 %b, double %f + define double @f7(double *%ptr0) { + ; CHECK-LABEL: f7: + ; CHECK: brasl %r14, foo@PLT +-; CHECK: cdb {{%f[0-9]+}}, 160(%r15) ++; CHECK-SCALAR: cdb {{%f[0-9]+}}, 160(%r15) + ; CHECK: br %r14 + %ptr1 = getelementptr double *%ptr0, i64 2 + %ptr2 = getelementptr double *%ptr0, i64 4 +@@ -152,9 +161,12 @@ define double @f7(double *%ptr0) { + ; Check comparison with zero. + define i64 @f8(i64 %a, i64 %b, double %f) { + ; CHECK-LABEL: f8: +-; CHECK: ltdbr %f0, %f0 +-; CHECK-NEXT: je +-; CHECK: lgr %r2, %r3 ++; CHECK-SCALAR: ltdbr %f0, %f0 ++; CHECK-SCALAR-NEXT: je ++; CHECK-SCALAR: lgr %r2, %r3 ++; CHECK-VECTOR: lzdr %f1 ++; CHECK-VECTOR-NEXT: cdbr %f0, %f1 ++; CHECK-VECTOR-NEXT: locgrne %r2, %r3 + ; CHECK: br %r14 + %cond = fcmp oeq double %f, 0.0 + %res = select i1 %cond, i64 %a, i64 %b +@@ -165,8 +177,9 @@ define i64 @f8(i64 %a, i64 %b, double %f + define i64 @f9(i64 %a, i64 %b, double %f2, double *%ptr) { + ; CHECK-LABEL: f9: + ; CHECK: cdb %f0, 0(%r4) +-; CHECK-NEXT: jl {{\.L.*}} +-; CHECK: lgr %r2, %r3 ++; CHECK-SCALAR-NEXT: jl ++; CHECK-SCALAR: lgr %r2, %r3 ++; CHECK-VECTOR-NEXT: locgrnl %r2, %r3 + ; CHECK: br %r14 + %f1 = load double *%ptr + %cond = fcmp ogt double %f1, %f2 +Index: llvm-36/test/CodeGen/SystemZ/fp-conv-01.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-conv-01.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-conv-01.ll +@@ -1,11 +1,15 @@ + ; Test floating-point truncations. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s + + ; Test f64->f32. + define float @f1(double %d1, double %d2) { + ; CHECK-LABEL: f1: +-; CHECK: ledbr %f0, %f2 ++; CHECK-SCALAR: ledbr %f0, %f2 ++; CHECK-VECTOR: ledbra %f0, 0, %f2, 0 + ; CHECK: br %r14 + %res = fptrunc double %d2 to float + ret float %res +@@ -50,8 +54,10 @@ define double @f4(fp128 *%ptr) { + define void @f5(double *%dst, fp128 *%ptr, double %d1, double %d2) { + ; CHECK-LABEL: f5: + ; CHECK: ldxbr %f1, %f1 +-; CHECK: adbr %f1, %f2 +-; CHECK: std %f1, 0(%r2) ++; CHECK-SCALAR: adbr %f1, %f2 ++; CHECK-SCALAR: std %f1, 0(%r2) ++; CHECK-VECTOR: wfadb [[REG:%f[0-9]+]], %f1, %f2 ++; CHECK-VECTOR: std [[REG]], 0(%r2) + ; CHECK: br %r14 + %val = load fp128 *%ptr + %conv = fptrunc fp128 %val to double +Index: llvm-36/test/CodeGen/SystemZ/fp-conv-02.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-conv-02.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-conv-02.ll +@@ -1,6 +1,8 @@ + ; Test extensions of f32 to f64. 
+ ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + + ; Check register extension. + define double @f1(float %val) { +@@ -74,7 +76,7 @@ define double @f6(float *%base, i64 %ind + ; to use LDEB if possible. + define void @f7(double *%ptr1, float *%ptr2) { + ; CHECK-LABEL: f7: +-; CHECK: ldeb {{%f[0-9]+}}, 16{{[04]}}(%r15) ++; CHECK-SCALAR: ldeb {{%f[0-9]+}}, 16{{[04]}}(%r15) + ; CHECK: br %r14 + %val0 = load volatile float *%ptr2 + %val1 = load volatile float *%ptr2 +Index: llvm-36/test/CodeGen/SystemZ/fp-div-02.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-div-02.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-div-02.ll +@@ -1,6 +1,8 @@ + ; Test 64-bit floating-point division. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + + declare double @foo() + +@@ -76,7 +78,7 @@ define double @f6(double %f1, double *%b + define double @f7(double *%ptr0) { + ; CHECK-LABEL: f7: + ; CHECK: brasl %r14, foo@PLT +-; CHECK: ddb %f0, 160(%r15) ++; CHECK-SCALAR: ddb %f0, 160(%r15) + ; CHECK: br %r14 + %ptr1 = getelementptr double *%ptr0, i64 2 + %ptr2 = getelementptr double *%ptr0, i64 4 +Index: llvm-36/test/CodeGen/SystemZ/fp-move-01.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-move-01.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-move-01.ll +@@ -1,11 +1,13 @@ + ; Test moves between FPRs. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + + ; Test f32 moves. + define float @f1(float %a, float %b) { + ; CHECK-LABEL: f1: + ; CHECK: ler %f0, %f2 ++; CHECK: br %r14 + ret float %b + } + +@@ -13,6 +15,7 @@ define float @f1(float %a, float %b) { + define double @f2(double %a, double %b) { + ; CHECK-LABEL: f2: + ; CHECK: ldr %f0, %f2 ++; CHECK: br %r14 + ret double %b + } + +@@ -22,6 +25,7 @@ define void @f3(fp128 *%x) { + ; CHECK-LABEL: f3: + ; CHECK: lxr + ; CHECK: axbr ++; CHECK: br %r14 + %val = load volatile fp128 *%x + %sum = fadd fp128 %val, %val + store volatile fp128 %sum, fp128 *%x +Index: llvm-36/test/CodeGen/SystemZ/fp-move-04.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-move-04.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-move-04.ll +@@ -1,6 +1,7 @@ + ; Test 64-bit floating-point loads. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + + ; Test the low end of the LD range. + define double @f1(double *%src) { +Index: llvm-36/test/CodeGen/SystemZ/fp-move-07.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-move-07.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-move-07.ll +@@ -1,6 +1,7 @@ + ; Test 64-bit floating-point stores. 
+ ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + + ; Test the low end of the STD range. + define void @f1(double *%src, double %val) { +Index: llvm-36/test/CodeGen/SystemZ/fp-move-09.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-move-09.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-move-09.ll +@@ -1,4 +1,4 @@ +-; Test moves between FPRs and GPRs for z196 and above. ++; Test moves between FPRs and GPRs for z196 and zEC12. + ; + ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s + +Index: llvm-36/test/CodeGen/SystemZ/fp-move-10.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/fp-move-10.ll +@@ -0,0 +1,61 @@ ++; Test moves between FPRs and GPRs for z13 and above. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Check that moves from i32s to floats use a low GR32 and vector operation. ++define float @f1(i16 *%ptr) { ++; CHECK-LABEL: f1: ++; CHECK: llh [[REG:%r[0-5]]], 0(%r2) ++; CHECK: oilh [[REG]], 16256 ++; CHECK: vlvgf %v0, [[REG]], 0 ++; CHECK: br %r14 ++ %base = load i16 *%ptr ++ %ext = zext i16 %base to i32 ++ %full = or i32 %ext, 1065353216 ++ %res = bitcast i32 %full to float ++ ret float %res ++} ++ ++; Check that moves from floats to i32s use a low GR32 and vector operation. ++define void @f2(float %val, i8 *%ptr) { ++; CHECK-LABEL: f2: ++; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0 ++; CHECK: stc [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %res = bitcast float %val to i32 ++ %trunc = trunc i32 %res to i8 ++ store i8 %trunc, i8 *%ptr ++ ret void ++} ++ ++; Like f2, but with a conditional store. ++define void @f3(float %val, i8 *%ptr, i32 %which) { ++; CHECK-LABEL: f3: ++; CHECK-DAG: cijlh %r3, 0, ++; CHECK-DAG: vlgvf [[REG:%r[0-5]]], %v0, 0 ++; CHECK: stc [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %int = bitcast float %val to i32 ++ %trunc = trunc i32 %int to i8 ++ %old = load i8 *%ptr ++ %cmp = icmp eq i32 %which, 0 ++ %res = select i1 %cmp, i8 %trunc, i8 %old ++ store i8 %res, i8 *%ptr ++ ret void ++} ++ ++; ...and again with 16-bit memory. ++define void @f4(float %val, i16 *%ptr, i32 %which) { ++; CHECK-LABEL: f4: ++; CHECK-DAG: cijlh %r3, 0, ++; CHECK-DAG: vlgvf [[REG:%r[0-5]]], %v0, 0 ++; CHECK: sth [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %int = bitcast float %val to i32 ++ %trunc = trunc i32 %int to i16 ++ %old = load i16 *%ptr ++ %cmp = icmp eq i32 %which, 0 ++ %res = select i1 %cmp, i16 %trunc, i16 %old ++ store i16 %res, i16 *%ptr ++ ret void ++} +Index: llvm-36/test/CodeGen/SystemZ/fp-move-11.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/fp-move-11.ll +@@ -0,0 +1,110 @@ ++; Test 32-bit floating-point loads for z13. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test that we use LDE instead of LE - low end of the LE range. ++define float @f1(float *%src) { ++; CHECK-LABEL: f1: ++; CHECK: lde %f0, 0(%r2) ++; CHECK: br %r14 ++ %val = load float *%src ++ ret float %val ++} ++ ++; Test that we use LDE instead of LE - high end of the LE range. 
++define float @f2(float *%src) { ++; CHECK-LABEL: f2: ++; CHECK: lde %f0, 4092(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr float *%src, i64 1023 ++ %val = load float *%ptr ++ ret float %val ++} ++ ++; Check the next word up, which should use LEY instead of LDE. ++define float @f3(float *%src) { ++; CHECK-LABEL: f3: ++; CHECK: ley %f0, 4096(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr float *%src, i64 1024 ++ %val = load float *%ptr ++ ret float %val ++} ++ ++; Check the high end of the aligned LEY range. ++define float @f4(float *%src) { ++; CHECK-LABEL: f4: ++; CHECK: ley %f0, 524284(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr float *%src, i64 131071 ++ %val = load float *%ptr ++ ret float %val ++} ++ ++; Check the next word up, which needs separate address logic. ++; Other sequences besides this one would be OK. ++define float @f5(float *%src) { ++; CHECK-LABEL: f5: ++; CHECK: agfi %r2, 524288 ++; CHECK: lde %f0, 0(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr float *%src, i64 131072 ++ %val = load float *%ptr ++ ret float %val ++} ++ ++; Check the high end of the negative aligned LEY range. ++define float @f6(float *%src) { ++; CHECK-LABEL: f6: ++; CHECK: ley %f0, -4(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr float *%src, i64 -1 ++ %val = load float *%ptr ++ ret float %val ++} ++ ++; Check the low end of the LEY range. ++define float @f7(float *%src) { ++; CHECK-LABEL: f7: ++; CHECK: ley %f0, -524288(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr float *%src, i64 -131072 ++ %val = load float *%ptr ++ ret float %val ++} ++ ++; Check the next word down, which needs separate address logic. ++; Other sequences besides this one would be OK. ++define float @f8(float *%src) { ++; CHECK-LABEL: f8: ++; CHECK: agfi %r2, -524292 ++; CHECK: lde %f0, 0(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr float *%src, i64 -131073 ++ %val = load float *%ptr ++ ret float %val ++} ++ ++; Check that LDE allows an index. ++define float @f9(i64 %src, i64 %index) { ++; CHECK-LABEL: f9: ++; CHECK: lde %f0, 4092({{%r3,%r2|%r2,%r3}}) ++; CHECK: br %r14 ++ %add1 = add i64 %src, %index ++ %add2 = add i64 %add1, 4092 ++ %ptr = inttoptr i64 %add2 to float * ++ %val = load float *%ptr ++ ret float %val ++} ++ ++; Check that LEY allows an index. ++define float @f10(i64 %src, i64 %index) { ++; CHECK-LABEL: f10: ++; CHECK: ley %f0, 4096({{%r3,%r2|%r2,%r3}}) ++; CHECK: br %r14 ++ %add1 = add i64 %src, %index ++ %add2 = add i64 %add1, 4096 ++ %ptr = inttoptr i64 %add2 to float * ++ %val = load float *%ptr ++ ret float %val ++} +Index: llvm-36/test/CodeGen/SystemZ/fp-mul-03.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-mul-03.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-mul-03.ll +@@ -1,6 +1,8 @@ + ; Test multiplication of two f64s, producing an f64 result. 
+ ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + + declare double @foo() + +@@ -76,7 +78,7 @@ define double @f6(double %f1, double *%b + define double @f7(double *%ptr0) { + ; CHECK-LABEL: f7: + ; CHECK: brasl %r14, foo@PLT +-; CHECK: mdb %f0, 160(%r15) ++; CHECK-SCALAR: mdb %f0, 160(%r15) + ; CHECK: br %r14 + %ptr1 = getelementptr double *%ptr0, i64 2 + %ptr2 = getelementptr double *%ptr0, i64 4 +Index: llvm-36/test/CodeGen/SystemZ/fp-mul-07.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-mul-07.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-mul-07.ll +@@ -1,11 +1,15 @@ +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s + + declare double @llvm.fma.f64(double %f1, double %f2, double %f3) + + define double @f1(double %f1, double %f2, double %acc) { + ; CHECK-LABEL: f1: +-; CHECK: madbr %f4, %f0, %f2 +-; CHECK: ldr %f0, %f4 ++; CHECK-SCALAR: madbr %f4, %f0, %f2 ++; CHECK-SCALAR: ldr %f0, %f4 ++; CHECK-VECTOR: wfmadb %f0, %f0, %f2, %f4 + ; CHECK: br %r14 + %res = call double @llvm.fma.f64 (double %f1, double %f2, double %acc) + ret double %res +Index: llvm-36/test/CodeGen/SystemZ/fp-mul-09.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-mul-09.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-mul-09.ll +@@ -1,11 +1,15 @@ +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s + + declare double @llvm.fma.f64(double %f1, double %f2, double %f3) + + define double @f1(double %f1, double %f2, double %acc) { + ; CHECK-LABEL: f1: +-; CHECK: msdbr %f4, %f0, %f2 +-; CHECK: ldr %f0, %f4 ++; CHECK-SCALAR: msdbr %f4, %f0, %f2 ++; CHECK-SCALAR: ldr %f0, %f4 ++; CHECK-VECTOR: wfmsdb %f0, %f0, %f2, %f4 + ; CHECK: br %r14 + %negacc = fsub double -0.0, %acc + %res = call double @llvm.fma.f64 (double %f1, double %f2, double %negacc) +Index: llvm-36/test/CodeGen/SystemZ/fp-neg-01.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-neg-01.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-neg-01.ll +@@ -1,6 +1,7 @@ + ; Test floating-point negation. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + + ; Test f32. + define float @f1(float %f) { +Index: llvm-36/test/CodeGen/SystemZ/fp-round-02.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-round-02.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-round-02.ll +@@ -1,6 +1,9 @@ + ; Test rounding functions for z196 and above. 
+ ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-VECTOR %s + + ; Test rint for f32. + declare float @llvm.rint.f32(float %f) +@@ -16,7 +19,8 @@ define float @f1(float %f) { + declare double @llvm.rint.f64(double %f) + define double @f2(double %f) { + ; CHECK-LABEL: f2: +-; CHECK: fidbr %f0, 0, %f0 ++; CHECK-SCALAR: fidbr %f0, 0, %f0 ++; CHECK-VECTOR: fidbra %f0, 0, %f0, 0 + ; CHECK: br %r14 + %res = call double @llvm.rint.f64(double %f) + ret double %res +Index: llvm-36/test/CodeGen/SystemZ/fp-sqrt-02.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-sqrt-02.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-sqrt-02.ll +@@ -1,6 +1,8 @@ + ; Test 64-bit square root. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + + declare double @llvm.sqrt.f64(double %f) + declare double @sqrt(double) +@@ -77,7 +79,7 @@ define double @f6(double *%base, i64 %in + ; to use SQDB if possible. + define void @f7(double *%ptr) { + ; CHECK-LABEL: f7: +-; CHECK: sqdb {{%f[0-9]+}}, 160(%r15) ++; CHECK-SCALAR: sqdb {{%f[0-9]+}}, 160(%r15) + ; CHECK: br %r14 + %val0 = load volatile double *%ptr + %val1 = load volatile double *%ptr +Index: llvm-36/test/CodeGen/SystemZ/fp-sub-02.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/fp-sub-02.ll ++++ llvm-36/test/CodeGen/SystemZ/fp-sub-02.ll +@@ -1,6 +1,8 @@ + ; Test 64-bit floating-point subtraction. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 \ ++; RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SCALAR %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + + declare double @foo() + +@@ -76,7 +78,7 @@ define double @f6(double %f1, double *%b + define double @f7(double *%ptr0) { + ; CHECK-LABEL: f7: + ; CHECK: brasl %r14, foo@PLT +-; CHECK: sdb %f0, 16{{[04]}}(%r15) ++; CHECK-SCALAR: sdb %f0, 16{{[04]}}(%r15) + ; CHECK: br %r14 + %ptr1 = getelementptr double *%ptr0, i64 2 + %ptr2 = getelementptr double *%ptr0, i64 4 +Index: llvm-36/test/CodeGen/SystemZ/frame-03.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/frame-03.ll ++++ llvm-36/test/CodeGen/SystemZ/frame-03.ll +@@ -2,7 +2,7 @@ + ; uses a different register class, but the set of saved and restored + ; registers should be the same. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s + + ; This function should require all FPRs, but no other spill slots. + ; We need to save and restore 8 of the 16 FPRs, so the frame size +Index: llvm-36/test/CodeGen/SystemZ/frame-07.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/frame-07.ll ++++ llvm-36/test/CodeGen/SystemZ/frame-07.ll +@@ -1,7 +1,7 @@ + ; Test the saving and restoring of FPRs in large frames. 
+ ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s +-; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck -check-prefix=CHECK-NOFP %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s + + ; Test a frame size that requires some FPRs to be saved and loaded using + ; the 20-bit STDY and LDY while others can use the 12-bit STD and LD. +Index: llvm-36/test/CodeGen/SystemZ/frame-17.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/frame-17.ll ++++ llvm-36/test/CodeGen/SystemZ/frame-17.ll +@@ -1,6 +1,6 @@ + ; Test spilling of FPRs. + ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s + + ; We need to save and restore 8 of the 16 FPRs and allocate an additional + ; 4-byte spill slot, rounded to 8 bytes. The frame size should be exactly +Index: llvm-36/test/CodeGen/SystemZ/frame-19.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/frame-19.ll +@@ -0,0 +1,314 @@ ++; Test spilling of vector registers. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; We need to allocate a 16-byte spill slot and save the 8 call-saved FPRs. ++; The frame size should be exactly 160 + 16 + 8 * 8 = 240. ++define void @f1(<16 x i8> *%ptr) { ++; CHECK-LABEL: f1: ++; CHECK: aghi %r15, -240 ++; CHECK-DAG: std %f8, ++; CHECK-DAG: std %f9, ++; CHECK-DAG: std %f10, ++; CHECK-DAG: std %f11, ++; CHECK-DAG: std %f12, ++; CHECK-DAG: std %f13, ++; CHECK-DAG: std %f14, ++; CHECK-DAG: std %f15, ++; CHECK: vst {{%v[0-9]+}}, 160(%r15) ++; CHECK: vl {{%v[0-9]+}}, 160(%r15) ++; CHECK-DAG: ld %f8, ++; CHECK-DAG: ld %f9, ++; CHECK-DAG: ld %f10, ++; CHECK-DAG: ld %f11, ++; CHECK-DAG: ld %f12, ++; CHECK-DAG: ld %f13, ++; CHECK-DAG: ld %f14, ++; CHECK-DAG: ld %f15, ++; CHECK: aghi %r15, 240 ++; CHECK: br %r14 ++ %v0 = load volatile <16 x i8> *%ptr ++ %v1 = load volatile <16 x i8> *%ptr ++ %v2 = load volatile <16 x i8> *%ptr ++ %v3 = load volatile <16 x i8> *%ptr ++ %v4 = load volatile <16 x i8> *%ptr ++ %v5 = load volatile <16 x i8> *%ptr ++ %v6 = load volatile <16 x i8> *%ptr ++ %v7 = load volatile <16 x i8> *%ptr ++ %v8 = load volatile <16 x i8> *%ptr ++ %v9 = load volatile <16 x i8> *%ptr ++ %v10 = load volatile <16 x i8> *%ptr ++ %v11 = load volatile <16 x i8> *%ptr ++ %v12 = load volatile <16 x i8> *%ptr ++ %v13 = load volatile <16 x i8> *%ptr ++ %v14 = load volatile <16 x i8> *%ptr ++ %v15 = load volatile <16 x i8> *%ptr ++ %v16 = load volatile <16 x i8> *%ptr ++ %v17 = load volatile <16 x i8> *%ptr ++ %v18 = load volatile <16 x i8> *%ptr ++ %v19 = load volatile <16 x i8> *%ptr ++ %v20 = load volatile <16 x i8> *%ptr ++ %v21 = load volatile <16 x i8> *%ptr ++ %v22 = load volatile <16 x i8> *%ptr ++ %v23 = load volatile <16 x i8> *%ptr ++ %v24 = load volatile <16 x i8> *%ptr ++ %v25 = load volatile <16 x i8> *%ptr ++ %v26 = load volatile <16 x i8> *%ptr ++ %v27 = load volatile <16 x i8> *%ptr ++ %v28 = load volatile <16 x i8> *%ptr ++ %v29 = load volatile <16 x i8> *%ptr ++ %v30 = load volatile <16 x i8> *%ptr ++ %v31 = load volatile <16 x i8> *%ptr ++ %vx = load volatile <16 x i8> *%ptr ++ store volatile <16 x i8> %vx, <16 x i8> *%ptr ++ store volatile <16 x i8> %v31, <16 x i8> *%ptr ++ store volatile <16 x 
i8> %v30, <16 x i8> *%ptr ++ store volatile <16 x i8> %v29, <16 x i8> *%ptr ++ store volatile <16 x i8> %v28, <16 x i8> *%ptr ++ store volatile <16 x i8> %v27, <16 x i8> *%ptr ++ store volatile <16 x i8> %v26, <16 x i8> *%ptr ++ store volatile <16 x i8> %v25, <16 x i8> *%ptr ++ store volatile <16 x i8> %v24, <16 x i8> *%ptr ++ store volatile <16 x i8> %v23, <16 x i8> *%ptr ++ store volatile <16 x i8> %v22, <16 x i8> *%ptr ++ store volatile <16 x i8> %v21, <16 x i8> *%ptr ++ store volatile <16 x i8> %v20, <16 x i8> *%ptr ++ store volatile <16 x i8> %v19, <16 x i8> *%ptr ++ store volatile <16 x i8> %v18, <16 x i8> *%ptr ++ store volatile <16 x i8> %v17, <16 x i8> *%ptr ++ store volatile <16 x i8> %v16, <16 x i8> *%ptr ++ store volatile <16 x i8> %v15, <16 x i8> *%ptr ++ store volatile <16 x i8> %v14, <16 x i8> *%ptr ++ store volatile <16 x i8> %v13, <16 x i8> *%ptr ++ store volatile <16 x i8> %v12, <16 x i8> *%ptr ++ store volatile <16 x i8> %v11, <16 x i8> *%ptr ++ store volatile <16 x i8> %v10, <16 x i8> *%ptr ++ store volatile <16 x i8> %v9, <16 x i8> *%ptr ++ store volatile <16 x i8> %v8, <16 x i8> *%ptr ++ store volatile <16 x i8> %v7, <16 x i8> *%ptr ++ store volatile <16 x i8> %v6, <16 x i8> *%ptr ++ store volatile <16 x i8> %v5, <16 x i8> *%ptr ++ store volatile <16 x i8> %v4, <16 x i8> *%ptr ++ store volatile <16 x i8> %v3, <16 x i8> *%ptr ++ store volatile <16 x i8> %v2, <16 x i8> *%ptr ++ store volatile <16 x i8> %v1, <16 x i8> *%ptr ++ store volatile <16 x i8> %v0, <16 x i8> *%ptr ++ ret void ++} ++ ++; Like f1, but no 16-byte slot should be needed. ++define void @f2(<16 x i8> *%ptr) { ++; CHECK-LABEL: f2: ++; CHECK: aghi %r15, -224 ++; CHECK-DAG: std %f8, ++; CHECK-DAG: std %f9, ++; CHECK-DAG: std %f10, ++; CHECK-DAG: std %f11, ++; CHECK-DAG: std %f12, ++; CHECK-DAG: std %f13, ++; CHECK-DAG: std %f14, ++; CHECK-DAG: std %f15, ++; CHECK-NOT: vst {{.*}}(%r15) ++; CHECK-NOT: vl {{.*}}(%r15) ++; CHECK-DAG: ld %f8, ++; CHECK-DAG: ld %f9, ++; CHECK-DAG: ld %f10, ++; CHECK-DAG: ld %f11, ++; CHECK-DAG: ld %f12, ++; CHECK-DAG: ld %f13, ++; CHECK-DAG: ld %f14, ++; CHECK-DAG: ld %f15, ++; CHECK: aghi %r15, 224 ++; CHECK: br %r14 ++ %v0 = load volatile <16 x i8> *%ptr ++ %v1 = load volatile <16 x i8> *%ptr ++ %v2 = load volatile <16 x i8> *%ptr ++ %v3 = load volatile <16 x i8> *%ptr ++ %v4 = load volatile <16 x i8> *%ptr ++ %v5 = load volatile <16 x i8> *%ptr ++ %v6 = load volatile <16 x i8> *%ptr ++ %v7 = load volatile <16 x i8> *%ptr ++ %v8 = load volatile <16 x i8> *%ptr ++ %v9 = load volatile <16 x i8> *%ptr ++ %v10 = load volatile <16 x i8> *%ptr ++ %v11 = load volatile <16 x i8> *%ptr ++ %v12 = load volatile <16 x i8> *%ptr ++ %v13 = load volatile <16 x i8> *%ptr ++ %v14 = load volatile <16 x i8> *%ptr ++ %v15 = load volatile <16 x i8> *%ptr ++ %v16 = load volatile <16 x i8> *%ptr ++ %v17 = load volatile <16 x i8> *%ptr ++ %v18 = load volatile <16 x i8> *%ptr ++ %v19 = load volatile <16 x i8> *%ptr ++ %v20 = load volatile <16 x i8> *%ptr ++ %v21 = load volatile <16 x i8> *%ptr ++ %v22 = load volatile <16 x i8> *%ptr ++ %v23 = load volatile <16 x i8> *%ptr ++ %v24 = load volatile <16 x i8> *%ptr ++ %v25 = load volatile <16 x i8> *%ptr ++ %v26 = load volatile <16 x i8> *%ptr ++ %v27 = load volatile <16 x i8> *%ptr ++ %v28 = load volatile <16 x i8> *%ptr ++ %v29 = load volatile <16 x i8> *%ptr ++ %v30 = load volatile <16 x i8> *%ptr ++ %v31 = load volatile <16 x i8> *%ptr ++ store volatile <16 x i8> %v31, <16 x i8> *%ptr ++ store volatile <16 x i8> %v30, <16 x i8> *%ptr ++ store volatile 
<16 x i8> %v29, <16 x i8> *%ptr ++ store volatile <16 x i8> %v28, <16 x i8> *%ptr ++ store volatile <16 x i8> %v27, <16 x i8> *%ptr ++ store volatile <16 x i8> %v26, <16 x i8> *%ptr ++ store volatile <16 x i8> %v25, <16 x i8> *%ptr ++ store volatile <16 x i8> %v24, <16 x i8> *%ptr ++ store volatile <16 x i8> %v23, <16 x i8> *%ptr ++ store volatile <16 x i8> %v22, <16 x i8> *%ptr ++ store volatile <16 x i8> %v21, <16 x i8> *%ptr ++ store volatile <16 x i8> %v20, <16 x i8> *%ptr ++ store volatile <16 x i8> %v19, <16 x i8> *%ptr ++ store volatile <16 x i8> %v18, <16 x i8> *%ptr ++ store volatile <16 x i8> %v17, <16 x i8> *%ptr ++ store volatile <16 x i8> %v16, <16 x i8> *%ptr ++ store volatile <16 x i8> %v15, <16 x i8> *%ptr ++ store volatile <16 x i8> %v14, <16 x i8> *%ptr ++ store volatile <16 x i8> %v13, <16 x i8> *%ptr ++ store volatile <16 x i8> %v12, <16 x i8> *%ptr ++ store volatile <16 x i8> %v11, <16 x i8> *%ptr ++ store volatile <16 x i8> %v10, <16 x i8> *%ptr ++ store volatile <16 x i8> %v9, <16 x i8> *%ptr ++ store volatile <16 x i8> %v8, <16 x i8> *%ptr ++ store volatile <16 x i8> %v7, <16 x i8> *%ptr ++ store volatile <16 x i8> %v6, <16 x i8> *%ptr ++ store volatile <16 x i8> %v5, <16 x i8> *%ptr ++ store volatile <16 x i8> %v4, <16 x i8> *%ptr ++ store volatile <16 x i8> %v3, <16 x i8> *%ptr ++ store volatile <16 x i8> %v2, <16 x i8> *%ptr ++ store volatile <16 x i8> %v1, <16 x i8> *%ptr ++ store volatile <16 x i8> %v0, <16 x i8> *%ptr ++ ret void ++} ++ ++; Like f2, but only %f8 should be saved. ++define void @f3(<16 x i8> *%ptr) { ++; CHECK-LABEL: f3: ++; CHECK: aghi %r15, -168 ++; CHECK-DAG: std %f8, ++; CHECK-NOT: vst {{.*}}(%r15) ++; CHECK-NOT: vl {{.*}}(%r15) ++; CHECK-NOT: %v9 ++; CHECK-NOT: %v10 ++; CHECK-NOT: %v11 ++; CHECK-NOT: %v12 ++; CHECK-NOT: %v13 ++; CHECK-NOT: %v14 ++; CHECK-NOT: %v15 ++; CHECK-DAG: ld %f8, ++; CHECK: aghi %r15, 168 ++; CHECK: br %r14 ++ %v0 = load volatile <16 x i8> *%ptr ++ %v1 = load volatile <16 x i8> *%ptr ++ %v2 = load volatile <16 x i8> *%ptr ++ %v3 = load volatile <16 x i8> *%ptr ++ %v4 = load volatile <16 x i8> *%ptr ++ %v5 = load volatile <16 x i8> *%ptr ++ %v6 = load volatile <16 x i8> *%ptr ++ %v7 = load volatile <16 x i8> *%ptr ++ %v8 = load volatile <16 x i8> *%ptr ++ %v16 = load volatile <16 x i8> *%ptr ++ %v17 = load volatile <16 x i8> *%ptr ++ %v18 = load volatile <16 x i8> *%ptr ++ %v19 = load volatile <16 x i8> *%ptr ++ %v20 = load volatile <16 x i8> *%ptr ++ %v21 = load volatile <16 x i8> *%ptr ++ %v22 = load volatile <16 x i8> *%ptr ++ %v23 = load volatile <16 x i8> *%ptr ++ %v24 = load volatile <16 x i8> *%ptr ++ %v25 = load volatile <16 x i8> *%ptr ++ %v26 = load volatile <16 x i8> *%ptr ++ %v27 = load volatile <16 x i8> *%ptr ++ %v28 = load volatile <16 x i8> *%ptr ++ %v29 = load volatile <16 x i8> *%ptr ++ %v30 = load volatile <16 x i8> *%ptr ++ %v31 = load volatile <16 x i8> *%ptr ++ store volatile <16 x i8> %v31, <16 x i8> *%ptr ++ store volatile <16 x i8> %v30, <16 x i8> *%ptr ++ store volatile <16 x i8> %v29, <16 x i8> *%ptr ++ store volatile <16 x i8> %v28, <16 x i8> *%ptr ++ store volatile <16 x i8> %v27, <16 x i8> *%ptr ++ store volatile <16 x i8> %v26, <16 x i8> *%ptr ++ store volatile <16 x i8> %v25, <16 x i8> *%ptr ++ store volatile <16 x i8> %v24, <16 x i8> *%ptr ++ store volatile <16 x i8> %v23, <16 x i8> *%ptr ++ store volatile <16 x i8> %v22, <16 x i8> *%ptr ++ store volatile <16 x i8> %v21, <16 x i8> *%ptr ++ store volatile <16 x i8> %v20, <16 x i8> *%ptr ++ store volatile <16 x i8> %v19, <16 x i8> *%ptr 
++ store volatile <16 x i8> %v18, <16 x i8> *%ptr ++ store volatile <16 x i8> %v17, <16 x i8> *%ptr ++ store volatile <16 x i8> %v16, <16 x i8> *%ptr ++ store volatile <16 x i8> %v8, <16 x i8> *%ptr ++ store volatile <16 x i8> %v7, <16 x i8> *%ptr ++ store volatile <16 x i8> %v6, <16 x i8> *%ptr ++ store volatile <16 x i8> %v5, <16 x i8> *%ptr ++ store volatile <16 x i8> %v4, <16 x i8> *%ptr ++ store volatile <16 x i8> %v3, <16 x i8> *%ptr ++ store volatile <16 x i8> %v2, <16 x i8> *%ptr ++ store volatile <16 x i8> %v1, <16 x i8> *%ptr ++ store volatile <16 x i8> %v0, <16 x i8> *%ptr ++ ret void ++} ++ ++; Like f2, but no registers should be saved. ++define void @f4(<16 x i8> *%ptr) { ++; CHECK-LABEL: f4: ++; CHECK-NOT: %r15 ++; CHECK: br %r14 ++ %v0 = load volatile <16 x i8> *%ptr ++ %v1 = load volatile <16 x i8> *%ptr ++ %v2 = load volatile <16 x i8> *%ptr ++ %v3 = load volatile <16 x i8> *%ptr ++ %v4 = load volatile <16 x i8> *%ptr ++ %v5 = load volatile <16 x i8> *%ptr ++ %v6 = load volatile <16 x i8> *%ptr ++ %v7 = load volatile <16 x i8> *%ptr ++ %v16 = load volatile <16 x i8> *%ptr ++ %v17 = load volatile <16 x i8> *%ptr ++ %v18 = load volatile <16 x i8> *%ptr ++ %v19 = load volatile <16 x i8> *%ptr ++ %v20 = load volatile <16 x i8> *%ptr ++ %v21 = load volatile <16 x i8> *%ptr ++ %v22 = load volatile <16 x i8> *%ptr ++ %v23 = load volatile <16 x i8> *%ptr ++ %v24 = load volatile <16 x i8> *%ptr ++ %v25 = load volatile <16 x i8> *%ptr ++ %v26 = load volatile <16 x i8> *%ptr ++ %v27 = load volatile <16 x i8> *%ptr ++ %v28 = load volatile <16 x i8> *%ptr ++ %v29 = load volatile <16 x i8> *%ptr ++ %v30 = load volatile <16 x i8> *%ptr ++ %v31 = load volatile <16 x i8> *%ptr ++ store volatile <16 x i8> %v31, <16 x i8> *%ptr ++ store volatile <16 x i8> %v30, <16 x i8> *%ptr ++ store volatile <16 x i8> %v29, <16 x i8> *%ptr ++ store volatile <16 x i8> %v28, <16 x i8> *%ptr ++ store volatile <16 x i8> %v27, <16 x i8> *%ptr ++ store volatile <16 x i8> %v26, <16 x i8> *%ptr ++ store volatile <16 x i8> %v25, <16 x i8> *%ptr ++ store volatile <16 x i8> %v24, <16 x i8> *%ptr ++ store volatile <16 x i8> %v23, <16 x i8> *%ptr ++ store volatile <16 x i8> %v22, <16 x i8> *%ptr ++ store volatile <16 x i8> %v21, <16 x i8> *%ptr ++ store volatile <16 x i8> %v20, <16 x i8> *%ptr ++ store volatile <16 x i8> %v19, <16 x i8> *%ptr ++ store volatile <16 x i8> %v18, <16 x i8> *%ptr ++ store volatile <16 x i8> %v17, <16 x i8> *%ptr ++ store volatile <16 x i8> %v16, <16 x i8> *%ptr ++ store volatile <16 x i8> %v7, <16 x i8> *%ptr ++ store volatile <16 x i8> %v6, <16 x i8> *%ptr ++ store volatile <16 x i8> %v5, <16 x i8> *%ptr ++ store volatile <16 x i8> %v4, <16 x i8> *%ptr ++ store volatile <16 x i8> %v3, <16 x i8> *%ptr ++ store volatile <16 x i8> %v2, <16 x i8> *%ptr ++ store volatile <16 x i8> %v1, <16 x i8> *%ptr ++ store volatile <16 x i8> %v0, <16 x i8> *%ptr ++ ret void ++} +Index: llvm-36/test/CodeGen/SystemZ/frame-20.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/frame-20.ll +@@ -0,0 +1,445 @@ ++; Like frame-03.ll, but for z13. In this case we have 16 more registers ++; available. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; This function should require all FPRs, but no other spill slots. ++; We need to save and restore 8 of the 16 FPRs, so the frame size ++; should be exactly 160 + 8 * 8 = 224. The CFA offset is 160 ++; (the caller-allocated part of the frame) + 224. 
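++;
++; As a worked version of the arithmetic above: the call-saved FPRs are
++; stored top-down from the new %r15, so %f8 lands at 160 + 7 * 8 = 216
++; and %f15 at 160, and the CFA offset checked below is
++; 160 (caller-allocated) + 224 (this frame) = 384.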
++define void @f1(double *%ptr) { ++; CHECK-LABEL: f1: ++; CHECK: aghi %r15, -224 ++; CHECK: .cfi_def_cfa_offset 384 ++; CHECK: std %f8, 216(%r15) ++; CHECK: std %f9, 208(%r15) ++; CHECK: std %f10, 200(%r15) ++; CHECK: std %f11, 192(%r15) ++; CHECK: std %f12, 184(%r15) ++; CHECK: std %f13, 176(%r15) ++; CHECK: std %f14, 168(%r15) ++; CHECK: std %f15, 160(%r15) ++; CHECK: .cfi_offset %f8, -168 ++; CHECK: .cfi_offset %f9, -176 ++; CHECK: .cfi_offset %f10, -184 ++; CHECK: .cfi_offset %f11, -192 ++; CHECK: .cfi_offset %f12, -200 ++; CHECK: .cfi_offset %f13, -208 ++; CHECK: .cfi_offset %f14, -216 ++; CHECK: .cfi_offset %f15, -224 ++; CHECK-DAG: ld %f0, 0(%r2) ++; CHECK-DAG: ld %f7, 0(%r2) ++; CHECK-DAG: ld %f8, 0(%r2) ++; CHECK-DAG: ld %f15, 0(%r2) ++; CHECK-DAG: vlrepg %v16, 0(%r2) ++; CHECK-DAG: vlrepg %v23, 0(%r2) ++; CHECK-DAG: vlrepg %v24, 0(%r2) ++; CHECK-DAG: vlrepg %v31, 0(%r2) ++; CHECK: ld %f8, 216(%r15) ++; CHECK: ld %f9, 208(%r15) ++; CHECK: ld %f10, 200(%r15) ++; CHECK: ld %f11, 192(%r15) ++; CHECK: ld %f12, 184(%r15) ++; CHECK: ld %f13, 176(%r15) ++; CHECK: ld %f14, 168(%r15) ++; CHECK: ld %f15, 160(%r15) ++; CHECK: aghi %r15, 224 ++; CHECK: br %r14 ++ %l0 = load volatile double *%ptr ++ %l1 = load volatile double *%ptr ++ %l2 = load volatile double *%ptr ++ %l3 = load volatile double *%ptr ++ %l4 = load volatile double *%ptr ++ %l5 = load volatile double *%ptr ++ %l6 = load volatile double *%ptr ++ %l7 = load volatile double *%ptr ++ %l8 = load volatile double *%ptr ++ %l9 = load volatile double *%ptr ++ %l10 = load volatile double *%ptr ++ %l11 = load volatile double *%ptr ++ %l12 = load volatile double *%ptr ++ %l13 = load volatile double *%ptr ++ %l14 = load volatile double *%ptr ++ %l15 = load volatile double *%ptr ++ %l16 = load volatile double *%ptr ++ %l17 = load volatile double *%ptr ++ %l18 = load volatile double *%ptr ++ %l19 = load volatile double *%ptr ++ %l20 = load volatile double *%ptr ++ %l21 = load volatile double *%ptr ++ %l22 = load volatile double *%ptr ++ %l23 = load volatile double *%ptr ++ %l24 = load volatile double *%ptr ++ %l25 = load volatile double *%ptr ++ %l26 = load volatile double *%ptr ++ %l27 = load volatile double *%ptr ++ %l28 = load volatile double *%ptr ++ %l29 = load volatile double *%ptr ++ %l30 = load volatile double *%ptr ++ %l31 = load volatile double *%ptr ++ %acc0 = fsub double %l0, %l0 ++ %acc1 = fsub double %l1, %acc0 ++ %acc2 = fsub double %l2, %acc1 ++ %acc3 = fsub double %l3, %acc2 ++ %acc4 = fsub double %l4, %acc3 ++ %acc5 = fsub double %l5, %acc4 ++ %acc6 = fsub double %l6, %acc5 ++ %acc7 = fsub double %l7, %acc6 ++ %acc8 = fsub double %l8, %acc7 ++ %acc9 = fsub double %l9, %acc8 ++ %acc10 = fsub double %l10, %acc9 ++ %acc11 = fsub double %l11, %acc10 ++ %acc12 = fsub double %l12, %acc11 ++ %acc13 = fsub double %l13, %acc12 ++ %acc14 = fsub double %l14, %acc13 ++ %acc15 = fsub double %l15, %acc14 ++ %acc16 = fsub double %l16, %acc15 ++ %acc17 = fsub double %l17, %acc16 ++ %acc18 = fsub double %l18, %acc17 ++ %acc19 = fsub double %l19, %acc18 ++ %acc20 = fsub double %l20, %acc19 ++ %acc21 = fsub double %l21, %acc20 ++ %acc22 = fsub double %l22, %acc21 ++ %acc23 = fsub double %l23, %acc22 ++ %acc24 = fsub double %l24, %acc23 ++ %acc25 = fsub double %l25, %acc24 ++ %acc26 = fsub double %l26, %acc25 ++ %acc27 = fsub double %l27, %acc26 ++ %acc28 = fsub double %l28, %acc27 ++ %acc29 = fsub double %l29, %acc28 ++ %acc30 = fsub double %l30, %acc29 ++ %acc31 = fsub double %l31, %acc30 ++ store volatile double %acc0, double *%ptr ++ 
store volatile double %acc1, double *%ptr ++ store volatile double %acc2, double *%ptr ++ store volatile double %acc3, double *%ptr ++ store volatile double %acc4, double *%ptr ++ store volatile double %acc5, double *%ptr ++ store volatile double %acc6, double *%ptr ++ store volatile double %acc7, double *%ptr ++ store volatile double %acc8, double *%ptr ++ store volatile double %acc9, double *%ptr ++ store volatile double %acc10, double *%ptr ++ store volatile double %acc11, double *%ptr ++ store volatile double %acc12, double *%ptr ++ store volatile double %acc13, double *%ptr ++ store volatile double %acc14, double *%ptr ++ store volatile double %acc15, double *%ptr ++ store volatile double %acc16, double *%ptr ++ store volatile double %acc17, double *%ptr ++ store volatile double %acc18, double *%ptr ++ store volatile double %acc19, double *%ptr ++ store volatile double %acc20, double *%ptr ++ store volatile double %acc21, double *%ptr ++ store volatile double %acc22, double *%ptr ++ store volatile double %acc23, double *%ptr ++ store volatile double %acc24, double *%ptr ++ store volatile double %acc25, double *%ptr ++ store volatile double %acc26, double *%ptr ++ store volatile double %acc27, double *%ptr ++ store volatile double %acc28, double *%ptr ++ store volatile double %acc29, double *%ptr ++ store volatile double %acc30, double *%ptr ++ store volatile double %acc31, double *%ptr ++ ret void ++} ++ ++; Like f1, but requires one fewer FPR. We allocate in numerical order, ++; so %f15 is the one that gets dropped. ++define void @f2(double *%ptr) { ++; CHECK-LABEL: f2: ++; CHECK: aghi %r15, -216 ++; CHECK: .cfi_def_cfa_offset 376 ++; CHECK: std %f8, 208(%r15) ++; CHECK: std %f9, 200(%r15) ++; CHECK: std %f10, 192(%r15) ++; CHECK: std %f11, 184(%r15) ++; CHECK: std %f12, 176(%r15) ++; CHECK: std %f13, 168(%r15) ++; CHECK: std %f14, 160(%r15) ++; CHECK: .cfi_offset %f8, -168 ++; CHECK: .cfi_offset %f9, -176 ++; CHECK: .cfi_offset %f10, -184 ++; CHECK: .cfi_offset %f11, -192 ++; CHECK: .cfi_offset %f12, -200 ++; CHECK: .cfi_offset %f13, -208 ++; CHECK: .cfi_offset %f14, -216 ++; CHECK-NOT: %v15 ++; CHECK-NOT: %f15 ++; CHECK: ld %f8, 208(%r15) ++; CHECK: ld %f9, 200(%r15) ++; CHECK: ld %f10, 192(%r15) ++; CHECK: ld %f11, 184(%r15) ++; CHECK: ld %f12, 176(%r15) ++; CHECK: ld %f13, 168(%r15) ++; CHECK: ld %f14, 160(%r15) ++; CHECK: aghi %r15, 216 ++; CHECK: br %r14 ++ %l0 = load volatile double *%ptr ++ %l1 = load volatile double *%ptr ++ %l2 = load volatile double *%ptr ++ %l3 = load volatile double *%ptr ++ %l4 = load volatile double *%ptr ++ %l5 = load volatile double *%ptr ++ %l6 = load volatile double *%ptr ++ %l7 = load volatile double *%ptr ++ %l8 = load volatile double *%ptr ++ %l9 = load volatile double *%ptr ++ %l10 = load volatile double *%ptr ++ %l11 = load volatile double *%ptr ++ %l12 = load volatile double *%ptr ++ %l13 = load volatile double *%ptr ++ %l14 = load volatile double *%ptr ++ %l16 = load volatile double *%ptr ++ %l17 = load volatile double *%ptr ++ %l18 = load volatile double *%ptr ++ %l19 = load volatile double *%ptr ++ %l20 = load volatile double *%ptr ++ %l21 = load volatile double *%ptr ++ %l22 = load volatile double *%ptr ++ %l23 = load volatile double *%ptr ++ %l24 = load volatile double *%ptr ++ %l25 = load volatile double *%ptr ++ %l26 = load volatile double *%ptr ++ %l27 = load volatile double *%ptr ++ %l28 = load volatile double *%ptr ++ %l29 = load volatile double *%ptr ++ %l30 = load volatile double *%ptr ++ %l31 = load volatile double *%ptr ++ %acc0 
= fsub double %l0, %l0 ++ %acc1 = fsub double %l1, %acc0 ++ %acc2 = fsub double %l2, %acc1 ++ %acc3 = fsub double %l3, %acc2 ++ %acc4 = fsub double %l4, %acc3 ++ %acc5 = fsub double %l5, %acc4 ++ %acc6 = fsub double %l6, %acc5 ++ %acc7 = fsub double %l7, %acc6 ++ %acc8 = fsub double %l8, %acc7 ++ %acc9 = fsub double %l9, %acc8 ++ %acc10 = fsub double %l10, %acc9 ++ %acc11 = fsub double %l11, %acc10 ++ %acc12 = fsub double %l12, %acc11 ++ %acc13 = fsub double %l13, %acc12 ++ %acc14 = fsub double %l14, %acc13 ++ %acc16 = fsub double %l16, %acc14 ++ %acc17 = fsub double %l17, %acc16 ++ %acc18 = fsub double %l18, %acc17 ++ %acc19 = fsub double %l19, %acc18 ++ %acc20 = fsub double %l20, %acc19 ++ %acc21 = fsub double %l21, %acc20 ++ %acc22 = fsub double %l22, %acc21 ++ %acc23 = fsub double %l23, %acc22 ++ %acc24 = fsub double %l24, %acc23 ++ %acc25 = fsub double %l25, %acc24 ++ %acc26 = fsub double %l26, %acc25 ++ %acc27 = fsub double %l27, %acc26 ++ %acc28 = fsub double %l28, %acc27 ++ %acc29 = fsub double %l29, %acc28 ++ %acc30 = fsub double %l30, %acc29 ++ %acc31 = fsub double %l31, %acc30 ++ store volatile double %acc0, double *%ptr ++ store volatile double %acc1, double *%ptr ++ store volatile double %acc2, double *%ptr ++ store volatile double %acc3, double *%ptr ++ store volatile double %acc4, double *%ptr ++ store volatile double %acc5, double *%ptr ++ store volatile double %acc6, double *%ptr ++ store volatile double %acc7, double *%ptr ++ store volatile double %acc8, double *%ptr ++ store volatile double %acc9, double *%ptr ++ store volatile double %acc10, double *%ptr ++ store volatile double %acc11, double *%ptr ++ store volatile double %acc12, double *%ptr ++ store volatile double %acc13, double *%ptr ++ store volatile double %acc14, double *%ptr ++ store volatile double %acc16, double *%ptr ++ store volatile double %acc17, double *%ptr ++ store volatile double %acc18, double *%ptr ++ store volatile double %acc19, double *%ptr ++ store volatile double %acc20, double *%ptr ++ store volatile double %acc21, double *%ptr ++ store volatile double %acc22, double *%ptr ++ store volatile double %acc23, double *%ptr ++ store volatile double %acc24, double *%ptr ++ store volatile double %acc25, double *%ptr ++ store volatile double %acc26, double *%ptr ++ store volatile double %acc27, double *%ptr ++ store volatile double %acc28, double *%ptr ++ store volatile double %acc29, double *%ptr ++ store volatile double %acc30, double *%ptr ++ store volatile double %acc31, double *%ptr ++ ret void ++} ++ ++; Like f1, but should require only one call-saved FPR. 
++define void @f3(double *%ptr) { ++; CHECK-LABEL: f3: ++; CHECK: aghi %r15, -168 ++; CHECK: .cfi_def_cfa_offset 328 ++; CHECK: std %f8, 160(%r15) ++; CHECK: .cfi_offset %f8, -168 ++; CHECK-NOT: {{%[fv]9}} ++; CHECK-NOT: {{%[fv]1[0-5]}} ++; CHECK: ld %f8, 160(%r15) ++; CHECK: aghi %r15, 168 ++; CHECK: br %r14 ++ %l0 = load volatile double *%ptr ++ %l1 = load volatile double *%ptr ++ %l2 = load volatile double *%ptr ++ %l3 = load volatile double *%ptr ++ %l4 = load volatile double *%ptr ++ %l5 = load volatile double *%ptr ++ %l6 = load volatile double *%ptr ++ %l7 = load volatile double *%ptr ++ %l8 = load volatile double *%ptr ++ %l16 = load volatile double *%ptr ++ %l17 = load volatile double *%ptr ++ %l18 = load volatile double *%ptr ++ %l19 = load volatile double *%ptr ++ %l20 = load volatile double *%ptr ++ %l21 = load volatile double *%ptr ++ %l22 = load volatile double *%ptr ++ %l23 = load volatile double *%ptr ++ %l24 = load volatile double *%ptr ++ %l25 = load volatile double *%ptr ++ %l26 = load volatile double *%ptr ++ %l27 = load volatile double *%ptr ++ %l28 = load volatile double *%ptr ++ %l29 = load volatile double *%ptr ++ %l30 = load volatile double *%ptr ++ %l31 = load volatile double *%ptr ++ %acc0 = fsub double %l0, %l0 ++ %acc1 = fsub double %l1, %acc0 ++ %acc2 = fsub double %l2, %acc1 ++ %acc3 = fsub double %l3, %acc2 ++ %acc4 = fsub double %l4, %acc3 ++ %acc5 = fsub double %l5, %acc4 ++ %acc6 = fsub double %l6, %acc5 ++ %acc7 = fsub double %l7, %acc6 ++ %acc8 = fsub double %l8, %acc7 ++ %acc16 = fsub double %l16, %acc8 ++ %acc17 = fsub double %l17, %acc16 ++ %acc18 = fsub double %l18, %acc17 ++ %acc19 = fsub double %l19, %acc18 ++ %acc20 = fsub double %l20, %acc19 ++ %acc21 = fsub double %l21, %acc20 ++ %acc22 = fsub double %l22, %acc21 ++ %acc23 = fsub double %l23, %acc22 ++ %acc24 = fsub double %l24, %acc23 ++ %acc25 = fsub double %l25, %acc24 ++ %acc26 = fsub double %l26, %acc25 ++ %acc27 = fsub double %l27, %acc26 ++ %acc28 = fsub double %l28, %acc27 ++ %acc29 = fsub double %l29, %acc28 ++ %acc30 = fsub double %l30, %acc29 ++ %acc31 = fsub double %l31, %acc30 ++ store volatile double %acc0, double *%ptr ++ store volatile double %acc1, double *%ptr ++ store volatile double %acc2, double *%ptr ++ store volatile double %acc3, double *%ptr ++ store volatile double %acc4, double *%ptr ++ store volatile double %acc5, double *%ptr ++ store volatile double %acc6, double *%ptr ++ store volatile double %acc7, double *%ptr ++ store volatile double %acc8, double *%ptr ++ store volatile double %acc16, double *%ptr ++ store volatile double %acc17, double *%ptr ++ store volatile double %acc18, double *%ptr ++ store volatile double %acc19, double *%ptr ++ store volatile double %acc20, double *%ptr ++ store volatile double %acc21, double *%ptr ++ store volatile double %acc22, double *%ptr ++ store volatile double %acc23, double *%ptr ++ store volatile double %acc24, double *%ptr ++ store volatile double %acc25, double *%ptr ++ store volatile double %acc26, double *%ptr ++ store volatile double %acc27, double *%ptr ++ store volatile double %acc28, double *%ptr ++ store volatile double %acc29, double *%ptr ++ store volatile double %acc30, double *%ptr ++ store volatile double %acc31, double *%ptr ++ ret void ++} ++ ++; This function should use all call-clobbered FPRs and vector registers ++; but no call-saved ones. It shouldn't need to create a frame. 
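++;
++; Spelling out the register budget (the standard s390x ABI split these
++; tests assume): %f0-%f7 and, on z13, %v16-%v31 are call-clobbered, while
++; %f8-%f15 are call-saved. f4 keeps only the 24 values %l0-%l7 and
++; %l16-%l31 live, which fit exactly in the 24 call-clobbered registers,
++; so nothing needs saving and no frame is required.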
++define void @f4(double *%ptr) { ++; CHECK-LABEL: f4: ++; CHECK-NOT: %r15 ++; CHECK-NOT: {{%[fv][89]}} ++; CHECK-NOT: {{%[fv]1[0-5]}} ++; CHECK: br %r14 ++ %l0 = load volatile double *%ptr ++ %l1 = load volatile double *%ptr ++ %l2 = load volatile double *%ptr ++ %l3 = load volatile double *%ptr ++ %l4 = load volatile double *%ptr ++ %l5 = load volatile double *%ptr ++ %l6 = load volatile double *%ptr ++ %l7 = load volatile double *%ptr ++ %l16 = load volatile double *%ptr ++ %l17 = load volatile double *%ptr ++ %l18 = load volatile double *%ptr ++ %l19 = load volatile double *%ptr ++ %l20 = load volatile double *%ptr ++ %l21 = load volatile double *%ptr ++ %l22 = load volatile double *%ptr ++ %l23 = load volatile double *%ptr ++ %l24 = load volatile double *%ptr ++ %l25 = load volatile double *%ptr ++ %l26 = load volatile double *%ptr ++ %l27 = load volatile double *%ptr ++ %l28 = load volatile double *%ptr ++ %l29 = load volatile double *%ptr ++ %l30 = load volatile double *%ptr ++ %l31 = load volatile double *%ptr ++ %acc0 = fsub double %l0, %l0 ++ %acc1 = fsub double %l1, %acc0 ++ %acc2 = fsub double %l2, %acc1 ++ %acc3 = fsub double %l3, %acc2 ++ %acc4 = fsub double %l4, %acc3 ++ %acc5 = fsub double %l5, %acc4 ++ %acc6 = fsub double %l6, %acc5 ++ %acc7 = fsub double %l7, %acc6 ++ %acc16 = fsub double %l16, %acc7 ++ %acc17 = fsub double %l17, %acc16 ++ %acc18 = fsub double %l18, %acc17 ++ %acc19 = fsub double %l19, %acc18 ++ %acc20 = fsub double %l20, %acc19 ++ %acc21 = fsub double %l21, %acc20 ++ %acc22 = fsub double %l22, %acc21 ++ %acc23 = fsub double %l23, %acc22 ++ %acc24 = fsub double %l24, %acc23 ++ %acc25 = fsub double %l25, %acc24 ++ %acc26 = fsub double %l26, %acc25 ++ %acc27 = fsub double %l27, %acc26 ++ %acc28 = fsub double %l28, %acc27 ++ %acc29 = fsub double %l29, %acc28 ++ %acc30 = fsub double %l30, %acc29 ++ %acc31 = fsub double %l31, %acc30 ++ store volatile double %acc0, double *%ptr ++ store volatile double %acc1, double *%ptr ++ store volatile double %acc2, double *%ptr ++ store volatile double %acc3, double *%ptr ++ store volatile double %acc4, double *%ptr ++ store volatile double %acc5, double *%ptr ++ store volatile double %acc6, double *%ptr ++ store volatile double %acc7, double *%ptr ++ store volatile double %acc16, double *%ptr ++ store volatile double %acc17, double *%ptr ++ store volatile double %acc18, double *%ptr ++ store volatile double %acc19, double *%ptr ++ store volatile double %acc20, double *%ptr ++ store volatile double %acc21, double *%ptr ++ store volatile double %acc22, double *%ptr ++ store volatile double %acc23, double *%ptr ++ store volatile double %acc24, double *%ptr ++ store volatile double %acc25, double *%ptr ++ store volatile double %acc26, double *%ptr ++ store volatile double %acc27, double *%ptr ++ store volatile double %acc28, double *%ptr ++ store volatile double %acc29, double *%ptr ++ store volatile double %acc30, double *%ptr ++ store volatile double %acc31, double *%ptr ++ ret void ++} +Index: llvm-36/test/CodeGen/SystemZ/htm-intrinsics.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/htm-intrinsics.ll +@@ -0,0 +1,352 @@ ++; Test transactional-execution intrinsics. 
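++;
++; A rough usage sketch (illustrative only, not itself exercised by the
++; checks below): transactional code brackets its body as
++;   %cc = call i32 @llvm.s390.tbegin(i8 *null, i32 65292)
++;   ...transactional work...
++;   %cc2 = call i32 @llvm.s390.tend()
++; and branches on the returned condition code, where 0 means the
++; transaction started or ended successfully and 2 is the abort value
++; that the CC-handling tests below compare against.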
++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | FileCheck %s ++ ++declare i32 @llvm.s390.tbegin(i8 *, i32) ++declare i32 @llvm.s390.tbegin.nofloat(i8 *, i32) ++declare void @llvm.s390.tbeginc(i8 *, i32) ++declare i32 @llvm.s390.tend() ++declare void @llvm.s390.tabort(i64) ++declare void @llvm.s390.ntstg(i64, i64 *) ++declare i32 @llvm.s390.etnd() ++declare void @llvm.s390.ppa.txassist(i32) ++ ++; TBEGIN. ++define void @test_tbegin() { ++; CHECK-LABEL: test_tbegin: ++; CHECK-NOT: stmg ++; CHECK: std %f8, ++; CHECK: std %f9, ++; CHECK: std %f10, ++; CHECK: std %f11, ++; CHECK: std %f12, ++; CHECK: std %f13, ++; CHECK: std %f14, ++; CHECK: std %f15, ++; CHECK: tbegin 0, 65292 ++; CHECK: ld %f8, ++; CHECK: ld %f9, ++; CHECK: ld %f10, ++; CHECK: ld %f11, ++; CHECK: ld %f12, ++; CHECK: ld %f13, ++; CHECK: ld %f14, ++; CHECK: ld %f15, ++; CHECK: br %r14 ++ call i32 @llvm.s390.tbegin(i8 *null, i32 65292) ++ ret void ++} ++ ++; TBEGIN (nofloat). ++define void @test_tbegin_nofloat1() { ++; CHECK-LABEL: test_tbegin_nofloat1: ++; CHECK-NOT: stmg ++; CHECK-NOT: std ++; CHECK: tbegin 0, 65292 ++; CHECK: br %r14 ++ call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65292) ++ ret void ++} ++ ++; TBEGIN (nofloat) with integer CC return value. ++define i32 @test_tbegin_nofloat2() { ++; CHECK-LABEL: test_tbegin_nofloat2: ++; CHECK-NOT: stmg ++; CHECK-NOT: std ++; CHECK: tbegin 0, 65292 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %res = call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65292) ++ ret i32 %res ++} ++ ++; TBEGIN (nofloat) with implicit CC check. ++define void @test_tbegin_nofloat3(i32 *%ptr) { ++; CHECK-LABEL: test_tbegin_nofloat3: ++; CHECK-NOT: stmg ++; CHECK-NOT: std ++; CHECK: tbegin 0, 65292 ++; CHECK: jnh {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %res = call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65292) ++ %cmp = icmp eq i32 %res, 2 ++ br i1 %cmp, label %if.then, label %if.end ++ ++if.then: ; preds = %entry ++ store i32 0, i32* %ptr, align 4 ++ br label %if.end ++ ++if.end: ; preds = %if.then, %entry ++ ret void ++} ++ ++; TBEGIN (nofloat) with dual CC use. ++define i32 @test_tbegin_nofloat4(i32 %pad, i32 *%ptr) { ++; CHECK-LABEL: test_tbegin_nofloat4: ++; CHECK-NOT: stmg ++; CHECK-NOT: std ++; CHECK: tbegin 0, 65292 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: cijlh %r2, 2, {{\.L*}} ++; CHECK: mvhi 0(%r3), 0 ++; CHECK: br %r14 ++ %res = call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65292) ++ %cmp = icmp eq i32 %res, 2 ++ br i1 %cmp, label %if.then, label %if.end ++ ++if.then: ; preds = %entry ++ store i32 0, i32* %ptr, align 4 ++ br label %if.end ++ ++if.end: ; preds = %if.then, %entry ++ ret i32 %res ++} ++ ++; TBEGIN (nofloat) with register. ++define void @test_tbegin_nofloat5(i8 *%ptr) { ++; CHECK-LABEL: test_tbegin_nofloat5: ++; CHECK-NOT: stmg ++; CHECK-NOT: std ++; CHECK: tbegin 0(%r2), 65292 ++; CHECK: br %r14 ++ call i32 @llvm.s390.tbegin.nofloat(i8 *%ptr, i32 65292) ++ ret void ++} ++ ++; TBEGIN (nofloat) with GRSM 0x0f00. ++define void @test_tbegin_nofloat6() { ++; CHECK-LABEL: test_tbegin_nofloat6: ++; CHECK: stmg %r6, %r15, ++; CHECK-NOT: std ++; CHECK: tbegin 0, 3840 ++; CHECK: br %r14 ++ call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 3840) ++ ret void ++} ++ ++; TBEGIN (nofloat) with GRSM 0xf100. 
++define void @test_tbegin_nofloat7() { ++; CHECK-LABEL: test_tbegin_nofloat7: ++; CHECK: stmg %r8, %r15, ++; CHECK-NOT: std ++; CHECK: tbegin 0, 61696 ++; CHECK: br %r14 ++ call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 61696) ++ ret void ++} ++ ++; TBEGIN (nofloat) with GRSM 0xfe00 -- stack pointer added automatically. ++define void @test_tbegin_nofloat8() { ++; CHECK-LABEL: test_tbegin_nofloat8: ++; CHECK-NOT: stmg ++; CHECK-NOT: std ++; CHECK: tbegin 0, 65280 ++; CHECK: br %r14 ++ call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 65024) ++ ret void ++} ++ ++; TBEGIN (nofloat) with GRSM 0xfb00 -- no frame pointer needed. ++define void @test_tbegin_nofloat9() { ++; CHECK-LABEL: test_tbegin_nofloat9: ++; CHECK: stmg %r10, %r15, ++; CHECK-NOT: std ++; CHECK: tbegin 0, 64256 ++; CHECK: br %r14 ++ call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 64256) ++ ret void ++} ++ ++; TBEGIN (nofloat) with GRSM 0xfb00 -- frame pointer added automatically. ++define void @test_tbegin_nofloat10(i64 %n) { ++; CHECK-LABEL: test_tbegin_nofloat10: ++; CHECK: stmg %r11, %r15, ++; CHECK-NOT: std ++; CHECK: tbegin 0, 65280 ++; CHECK: br %r14 ++ %buf = alloca i8, i64 %n ++ call i32 @llvm.s390.tbegin.nofloat(i8 *null, i32 64256) ++ ret void ++} ++ ++; TBEGINC. ++define void @test_tbeginc() { ++; CHECK-LABEL: test_tbeginc: ++; CHECK-NOT: stmg ++; CHECK-NOT: std ++; CHECK: tbeginc 0, 65288 ++; CHECK: br %r14 ++ call void @llvm.s390.tbeginc(i8 *null, i32 65288) ++ ret void ++} ++ ++; TEND with integer CC return value. ++define i32 @test_tend1() { ++; CHECK-LABEL: test_tend1: ++; CHECK: tend ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %res = call i32 @llvm.s390.tend() ++ ret i32 %res ++} ++ ++; TEND with implicit CC check. ++define void @test_tend3(i32 *%ptr) { ++; CHECK-LABEL: test_tend3: ++; CHECK: tend ++; CHECK: je {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %res = call i32 @llvm.s390.tend() ++ %cmp = icmp eq i32 %res, 2 ++ br i1 %cmp, label %if.then, label %if.end ++ ++if.then: ; preds = %entry ++ store i32 0, i32* %ptr, align 4 ++ br label %if.end ++ ++if.end: ; preds = %if.then, %entry ++ ret void ++} ++ ++; TEND with dual CC use. ++define i32 @test_tend2(i32 %pad, i32 *%ptr) { ++; CHECK-LABEL: test_tend2: ++; CHECK: tend ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: cijlh %r2, 2, {{\.L*}} ++; CHECK: mvhi 0(%r3), 0 ++; CHECK: br %r14 ++ %res = call i32 @llvm.s390.tend() ++ %cmp = icmp eq i32 %res, 2 ++ br i1 %cmp, label %if.then, label %if.end ++ ++if.then: ; preds = %entry ++ store i32 0, i32* %ptr, align 4 ++ br label %if.end ++ ++if.end: ; preds = %if.then, %entry ++ ret i32 %res ++} ++ ++; TABORT with register only. ++define void @test_tabort1(i64 %val) { ++; CHECK-LABEL: test_tabort1: ++; CHECK: tabort 0(%r2) ++; CHECK: br %r14 ++ call void @llvm.s390.tabort(i64 %val) ++ ret void ++} ++ ++; TABORT with immediate only. ++define void @test_tabort2(i64 %val) { ++; CHECK-LABEL: test_tabort2: ++; CHECK: tabort 1234 ++; CHECK: br %r14 ++ call void @llvm.s390.tabort(i64 1234) ++ ret void ++} ++ ++; TABORT with register + immediate. ++define void @test_tabort3(i64 %val) { ++; CHECK-LABEL: test_tabort3: ++; CHECK: tabort 1234(%r2) ++; CHECK: br %r14 ++ %sum = add i64 %val, 1234 ++ call void @llvm.s390.tabort(i64 %sum) ++ ret void ++} ++ ++; TABORT with out-of-range immediate. 
++define void @test_tabort4(i64 %val) {
++; CHECK-LABEL: test_tabort4:
++; CHECK: tabort 0({{%r[1-5]}})
++; CHECK: br %r14
++ call void @llvm.s390.tabort(i64 4096)
++ ret void
++}
++
++; NTSTG with base pointer only.
++define void @test_ntstg1(i64 *%ptr, i64 %val) {
++; CHECK-LABEL: test_ntstg1:
++; CHECK: ntstg %r3, 0(%r2)
++; CHECK: br %r14
++ call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
++ ret void
++}
++
++; NTSTG with base and index.
++; Check that the scaled index is used directly in the address.
++define void @test_ntstg2(i64 *%base, i64 %index, i64 %val) {
++; CHECK-LABEL: test_ntstg2:
++; CHECK: sllg [[REG:%r[1-5]]], %r3, 3
++; CHECK: ntstg %r4, 0([[REG]],%r2)
++; CHECK: br %r14
++ %ptr = getelementptr i64 *%base, i64 %index
++ call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
++ ret void
++}
++
++; NTSTG with the highest in-range displacement.
++define void @test_ntstg3(i64 *%base, i64 %val) {
++; CHECK-LABEL: test_ntstg3:
++; CHECK: ntstg %r3, 524280(%r2)
++; CHECK: br %r14
++ %ptr = getelementptr i64 *%base, i64 65535
++ call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
++ ret void
++}
++
++; NTSTG with an out-of-range positive displacement.
++define void @test_ntstg4(i64 *%base, i64 %val) {
++; CHECK-LABEL: test_ntstg4:
++; CHECK: ntstg %r3, 0({{%r[1-5]}})
++; CHECK: br %r14
++ %ptr = getelementptr i64 *%base, i64 65536
++ call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
++ ret void
++}
++
++; NTSTG with the lowest in-range displacement.
++define void @test_ntstg5(i64 *%base, i64 %val) {
++; CHECK-LABEL: test_ntstg5:
++; CHECK: ntstg %r3, -524288(%r2)
++; CHECK: br %r14
++ %ptr = getelementptr i64 *%base, i64 -65536
++ call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
++ ret void
++}
++
++; NTSTG with an out-of-range negative displacement.
++define void @test_ntstg6(i64 *%base, i64 %val) {
++; CHECK-LABEL: test_ntstg6:
++; CHECK: ntstg %r3, 0({{%r[1-5]}})
++; CHECK: br %r14
++ %ptr = getelementptr i64 *%base, i64 -65537
++ call void @llvm.s390.ntstg(i64 %val, i64 *%ptr)
++ ret void
++}
++
++; ETND.
++define i32 @test_etnd() {
++; CHECK-LABEL: test_etnd:
++; CHECK: etnd %r2
++; CHECK: br %r14
++ %res = call i32 @llvm.s390.etnd()
++ ret i32 %res
++}
++
++; PPA (Transaction-Abort Assist)
++define void @test_ppa_txassist(i32 %val) {
++; CHECK-LABEL: test_ppa_txassist:
++; CHECK: ppa %r2, 0, 1
++; CHECK: br %r14
++ call void @llvm.s390.ppa.txassist(i32 %val)
++ ret void
++}
++
+Index: llvm-36/test/CodeGen/SystemZ/int-cmp-12.ll
+===================================================================
+--- llvm-36.orig/test/CodeGen/SystemZ/int-cmp-12.ll
++++ llvm-36/test/CodeGen/SystemZ/int-cmp-12.ll
+@@ -49,13 +49,24 @@ define double @f4(double %a, double %b,
+ ret double %res
+ }
+ 
+-; Check the next value up, which must use a register comparison.
++; Check the next value up, which can use a shifted comparison.
+ define double @f5(double %a, double %b, i64 %i1) {
+ ; CHECK-LABEL: f5:
+-; CHECK: clgrjl %r2,
++; CHECK: srlg [[REG:%r[0-5]]], %r2, 32
++; CHECK: cgije [[REG]], 0
+ ; CHECK: ldr %f0, %f2
+ ; CHECK: br %r14
+ %cond = icmp ult i64 %i1, 4294967296
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+ }
++; Check the next value up, which must use a register comparison.
++define double @f6(double %a, double %b, i64 %i1) { ++; CHECK-LABEL: f6: ++; CHECK: clgrjl %r2, ++; CHECK: ldr %f0, %f2 ++; CHECK: br %r14 ++ %cond = icmp ult i64 %i1, 4294967297 ++ %res = select i1 %cond, double %a, double %b ++ ret double %res ++} +Index: llvm-36/test/CodeGen/SystemZ/int-cmp-47.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/int-cmp-47.ll ++++ llvm-36/test/CodeGen/SystemZ/int-cmp-47.ll +@@ -309,7 +309,8 @@ exit: + define void @f17(i64 %a) { + ; CHECK-LABEL: f17: + ; CHECK-NOT: tmhh +-; CHECK: llihh {{%r[0-5]}}, 49151 ++; CHECK: srlg [[REG:%r[0-5]]], %r2, 48 ++; CHECK: cgfi [[REG]], 49151 + ; CHECK-NOT: tmhh + ; CHECK: br %r14 + entry: +Index: llvm-36/test/CodeGen/SystemZ/int-cmp-50.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/int-cmp-50.ll +@@ -0,0 +1,30 @@ ++; Verify that we do not crash on always-true conditions ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -O0 ++; ++; This test was compiled using clang -O0 from the following source code: ++; ++; int test(unsigned long x) ++; { ++; return x >= 0 && x <= 15; ++; } ++ ++define signext i32 @test(i64 %x) { ++entry: ++ %x.addr = alloca i64, align 8 ++ store i64 %x, i64* %x.addr, align 8 ++ %0 = load i64 *%x.addr, align 8 ++ %cmp = icmp uge i64 %0, 0 ++ br i1 %cmp, label %land.rhs, label %land.end ++ ++land.rhs: ; preds = %entry ++ %1 = load i64 *%x.addr, align 8 ++ %cmp1 = icmp ule i64 %1, 15 ++ br label %land.end ++ ++land.end: ; preds = %land.rhs, %entry ++ %2 = phi i1 [ false, %entry ], [ %cmp1, %land.rhs ] ++ %land.ext = zext i1 %2 to i32 ++ ret i32 %land.ext ++} ++ +Index: llvm-36/test/CodeGen/SystemZ/risbg-03.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/risbg-03.ll +@@ -0,0 +1,30 @@ ++; Test use of RISBG vs RISBGN on zEC12. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | FileCheck %s ++ ++; On zEC12, we generally prefer RISBGN. ++define i64 @f1(i64 %a, i64 %b) { ++; CHECK-LABEL: f1: ++; CHECK: risbgn %r2, %r3, 60, 62, 0 ++; CHECK: br %r14 ++ %anda = and i64 %a, -15 ++ %andb = and i64 %b, 14 ++ %or = or i64 %anda, %andb ++ ret i64 %or ++} ++ ++; But we may fall back to RISBG if we can use the condition code. ++define i64 @f2(i64 %a, i64 %b, i32* %c) { ++; CHECK-LABEL: f2: ++; CHECK: risbg %r2, %r3, 60, 62, 0 ++; CHECK-NEXT: ipm ++; CHECK: br %r14 ++ %anda = and i64 %a, -15 ++ %andb = and i64 %b, 14 ++ %or = or i64 %anda, %andb ++ %cmp = icmp sgt i64 %or, 0 ++ %conv = zext i1 %cmp to i32 ++ store i32 %conv, i32* %c, align 4 ++ ret i64 %or ++} ++ +Index: llvm-36/test/CodeGen/SystemZ/tls-01.ll +=================================================================== +--- llvm-36.orig/test/CodeGen/SystemZ/tls-01.ll ++++ llvm-36/test/CodeGen/SystemZ/tls-01.ll +@@ -1,7 +1,7 @@ +-; Test initial-exec TLS accesses. ++; Test local-exec TLS accesses. 
+ ; +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-MAIN +-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-CP ++; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-MAIN ++; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-CP + + @x = thread_local global i32 0 + +Index: llvm-36/test/CodeGen/SystemZ/tls-02.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/tls-02.ll +@@ -0,0 +1,18 @@ ++; Test initial-exec TLS accesses. ++; ++; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-MAIN ++ ++@x = thread_local(initialexec) global i32 0 ++ ++; The offset must be loaded from the GOT. This TLS access model does ++; not use literal pool constants. ++define i32 *@foo() { ++; CHECK-MAIN-LABEL: foo: ++; CHECK-MAIN: ear [[HIGH:%r[0-5]]], %a0 ++; CHECK-MAIN: sllg %r2, [[HIGH]], 32 ++; CHECK-MAIN: ear %r2, %a1 ++; CHECK-MAIN: larl %r1, x@INDNTPOFF ++; CHECK-MAIN: ag %r2, 0(%r1) ++; CHECK-MAIN: br %r14 ++ ret i32 *@x ++} +Index: llvm-36/test/CodeGen/SystemZ/tls-03.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/tls-03.ll +@@ -0,0 +1,23 @@ ++; Test general-dynamic TLS accesses. ++; ++; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-MAIN ++; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-CP ++ ++@x = thread_local global i32 0 ++ ++; Call __tls_get_offset to retrieve the symbol's TLS offset. ++define i32 *@foo() { ++; CHECK-CP: .LCP{{.*}}: ++; CHECK-CP: .quad x@TLSGD ++; ++; CHECK-MAIN-LABEL: foo: ++; CHECK-MAIN-DAG: larl %r12, _GLOBAL_OFFSET_TABLE_ ++; CHECK-MAIN-DAG: lgrl %r2, .LCP{{.*}} ++; CHECK-MAIN: brasl %r14, __tls_get_offset@PLT:tls_gdcall:x ++; CHECK-MAIN: ear [[HIGH:%r[0-5]]], %a0 ++; CHECK-MAIN: sllg [[TP:%r[0-5]]], [[HIGH]], 32 ++; CHECK-MAIN: ear [[TP]], %a1 ++; CHECK-MAIN: agr %r2, [[TP]] ++; CHECK-MAIN: br %r14 ++ ret i32 *@x ++} +Index: llvm-36/test/CodeGen/SystemZ/tls-04.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/tls-04.ll +@@ -0,0 +1,28 @@ ++; Test local-dynamic TLS accesses. ++; ++; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-MAIN ++; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-CP ++ ++@x = thread_local(localdynamic) global i32 0 ++ ++; Call __tls_get_offset to retrieve the module's TLS base offset. ++; Add the per-symbol offset and the thread pointer. 
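++; Concretely, the sequence checked below does this in three steps: a
++; single __tls_get_offset call via the TLSLDM constant to obtain the
++; module base, an ag of the symbol's DTPOFF constant, and finally an
++; agr of the thread pointer reassembled from access registers %a0/%a1.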
++define i32 *@foo() {
++; CHECK-CP: .LCP{{.*}}_0:
++; CHECK-CP: .quad x@TLSLDM
++; CHECK-CP: .LCP{{.*}}_1:
++; CHECK-CP: .quad x@DTPOFF
++;
++; CHECK-MAIN-LABEL: foo:
++; CHECK-MAIN-DAG: larl %r12, _GLOBAL_OFFSET_TABLE_
++; CHECK-MAIN-DAG: lgrl %r2, .LCP{{.*}}_0
++; CHECK-MAIN: brasl %r14, __tls_get_offset@PLT:tls_ldcall:x
++; CHECK-MAIN: larl %r1, .LCP{{.*}}_1
++; CHECK-MAIN: ag %r2, 0(%r1)
++; CHECK-MAIN: ear [[HIGH:%r[0-5]]], %a0
++; CHECK-MAIN: sllg [[TP:%r[0-5]]], [[HIGH]], 32
++; CHECK-MAIN: ear [[TP]], %a1
++; CHECK-MAIN: agr %r2, [[TP]]
++; CHECK-MAIN: br %r14
++ ret i32 *@x
++}
+Index: llvm-36/test/CodeGen/SystemZ/tls-05.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/tls-05.ll
+@@ -0,0 +1,15 @@
++; Test general-dynamic TLS access optimizations.
++;
++; If we access the same TLS variable twice, there should only be
++; a single call to __tls_get_offset.
++;
++; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "__tls_get_offset" | count 1
++
++@x = thread_local global i32 0
++
++define i32 @foo() {
++ %val = load i32* @x
++ %inc = add nsw i32 %val, 1
++ store i32 %inc, i32* @x
++ ret i32 %val
++}
+Index: llvm-36/test/CodeGen/SystemZ/tls-06.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/tls-06.ll
+@@ -0,0 +1,17 @@
++; Test general-dynamic TLS access optimizations.
++;
++; If we access two different TLS variables, we need two calls to
++; __tls_get_offset, but should load _GLOBAL_OFFSET_TABLE_ only once.
++;
++; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "__tls_get_offset" | count 2
++; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "_GLOBAL_OFFSET_TABLE_" | count 1
++
++@x = thread_local global i32 0
++@y = thread_local global i32 0
++
++define i32 @foo() {
++ %valx = load i32* @x
++ %valy = load i32* @y
++ %add = add nsw i32 %valx, %valy
++ ret i32 %add
++}
+Index: llvm-36/test/CodeGen/SystemZ/tls-07.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/tls-07.ll
+@@ -0,0 +1,16 @@
++; Test local-dynamic TLS access optimizations.
++;
++; If we access two different local-dynamic TLS variables, we only
++; need a single call to __tls_get_offset.
++;
++; RUN: llc < %s -mcpu=z10 -mtriple=s390x-linux-gnu -relocation-model=pic | grep "__tls_get_offset" | count 1
++
++@x = thread_local(localdynamic) global i32 0
++@y = thread_local(localdynamic) global i32 0
++
++define i32 @foo() {
++ %valx = load i32* @x
++ %valy = load i32* @y
++ %add = add nsw i32 %valx, %valy
++ ret i32 %add
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-abi-align.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-abi-align.ll
+@@ -0,0 +1,49 @@
++; Verify that we use the vector ABI datalayout if and only if
++; the vector facility is present.
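++; (With the vector facility enabled, the vector ABI datalayout gives
++; <2 x i64> 8-byte alignment, so the vector member of %struct.S below
++; sits at offset 8 and is loaded with vl from 8(%r2); without it, the
++; default 16-byte alignment places the member at offset 16, hence the
++; agsi updates at offsets 16 and 24.)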
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu | \
++; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=generic | \
++; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
++; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | \
++; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | \
++; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \
++; RUN: FileCheck -check-prefix=CHECK-VECTOR %s
++
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=vector | \
++; RUN: FileCheck -check-prefix=CHECK-VECTOR %s
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+vector | \
++; RUN: FileCheck -check-prefix=CHECK-VECTOR %s
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=-vector,vector | \
++; RUN: FileCheck -check-prefix=CHECK-VECTOR %s
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=-vector,+vector | \
++; RUN: FileCheck -check-prefix=CHECK-VECTOR %s
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=-vector | \
++; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=vector,-vector | \
++; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+vector,-vector | \
++; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s
++
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -mattr=-vector | \
++; RUN: FileCheck -check-prefix=CHECK-NOVECTOR %s
++
++%struct.S = type { i8, <2 x i64> }
++
++define void @test(%struct.S* %s) nounwind {
++; CHECK-VECTOR-LABEL: @test
++; CHECK-VECTOR: vl %v0, 8(%r2)
++; CHECK-NOVECTOR-LABEL: @test
++; CHECK-NOVECTOR-DAG: agsi 16(%r2), 1
++; CHECK-NOVECTOR-DAG: agsi 24(%r2), 1
++ %ptr = getelementptr %struct.S* %s, i64 0, i32 1
++ %vec = load <2 x i64>* %ptr
++ %add = add <2 x i64> %vec, <i64 1, i64 1>
++ store <2 x i64> %add, <2 x i64>* %ptr
++ ret void
++}
++
+Index: llvm-36/test/CodeGen/SystemZ/vec-abs-01.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-abs-01.ll
+@@ -0,0 +1,146 @@
++; Test v16i8 absolute.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test with slt.
++define <16 x i8> @f1(<16 x i8> %val) {
++; CHECK-LABEL: f1:
++; CHECK: vlpb %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp slt <16 x i8> %val, zeroinitializer
++ %neg = sub <16 x i8> zeroinitializer, %val
++ %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val
++ ret <16 x i8> %ret
++}
++
++; Test with sle.
++define <16 x i8> @f2(<16 x i8> %val) {
++; CHECK-LABEL: f2:
++; CHECK: vlpb %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp sle <16 x i8> %val, zeroinitializer
++ %neg = sub <16 x i8> zeroinitializer, %val
++ %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val
++ ret <16 x i8> %ret
++}
++
++; Test with sgt.
++define <16 x i8> @f3(<16 x i8> %val) {
++; CHECK-LABEL: f3:
++; CHECK: vlpb %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp sgt <16 x i8> %val, zeroinitializer
++ %neg = sub <16 x i8> zeroinitializer, %val
++ %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg
++ ret <16 x i8> %ret
++}
++
++; Test with sge.
++define <16 x i8> @f4(<16 x i8> %val) {
++; CHECK-LABEL: f4:
++; CHECK: vlpb %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp sge <16 x i8> %val, zeroinitializer
++ %neg = sub <16 x i8> zeroinitializer, %val
++ %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg
++ ret <16 x i8> %ret
++}
++
++; Test that negative absolute uses VLPB too. There is no vector equivalent
++; of LOAD NEGATIVE.
++define <16 x i8> @f5(<16 x i8> %val) {
++; CHECK-LABEL: f5:
++; CHECK: vlpb [[REG:%v[0-9]+]], %v24
++; CHECK: vlcb %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp slt <16 x i8> %val, zeroinitializer
++ %neg = sub <16 x i8> zeroinitializer, %val
++ %abs = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val
++ %ret = sub <16 x i8> zeroinitializer, %abs
++ ret <16 x i8> %ret
++}
++
++; Try another form of negative absolute (slt version).
++define <16 x i8> @f6(<16 x i8> %val) {
++; CHECK-LABEL: f6:
++; CHECK: vlpb [[REG:%v[0-9]+]], %v24
++; CHECK: vlcb %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp slt <16 x i8> %val, zeroinitializer
++ %neg = sub <16 x i8> zeroinitializer, %val
++ %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg
++ ret <16 x i8> %ret
++}
++
++; Test with sle.
++define <16 x i8> @f7(<16 x i8> %val) {
++; CHECK-LABEL: f7:
++; CHECK: vlpb [[REG:%v[0-9]+]], %v24
++; CHECK: vlcb %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp sle <16 x i8> %val, zeroinitializer
++ %neg = sub <16 x i8> zeroinitializer, %val
++ %ret = select <16 x i1> %cmp, <16 x i8> %val, <16 x i8> %neg
++ ret <16 x i8> %ret
++}
++
++; Test with sgt.
++define <16 x i8> @f8(<16 x i8> %val) {
++; CHECK-LABEL: f8:
++; CHECK: vlpb [[REG:%v[0-9]+]], %v24
++; CHECK: vlcb %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp sgt <16 x i8> %val, zeroinitializer
++ %neg = sub <16 x i8> zeroinitializer, %val
++ %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val
++ ret <16 x i8> %ret
++}
++
++; Test with sge.
++define <16 x i8> @f9(<16 x i8> %val) {
++; CHECK-LABEL: f9:
++; CHECK: vlpb [[REG:%v[0-9]+]], %v24
++; CHECK: vlcb %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp sge <16 x i8> %val, zeroinitializer
++ %neg = sub <16 x i8> zeroinitializer, %val
++ %ret = select <16 x i1> %cmp, <16 x i8> %neg, <16 x i8> %val
++ ret <16 x i8> %ret
++}
++
++; Test with an SRA-based boolean vector.
++define <16 x i8> @f10(<16 x i8> %val) {
++; CHECK-LABEL: f10:
++; CHECK: vlpb %v24, %v24
++; CHECK: br %r14
++ %shr = ashr <16 x i8> %val,
++ <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,
++ i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
++ %neg = sub <16 x i8> zeroinitializer, %val
++ %and1 = and <16 x i8> %shr, %neg
++ %not = xor <16 x i8> %shr,
++ <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
++ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
++ %and2 = and <16 x i8> %not, %val
++ %ret = or <16 x i8> %and1, %and2
++ ret <16 x i8> %ret
++}
++
++; ...and again in reverse
++define <16 x i8> @f11(<16 x i8> %val) {
++; CHECK-LABEL: f11:
++; CHECK: vlpb [[REG:%v[0-9]+]], %v24
++; CHECK: vlcb %v24, [[REG]]
++; CHECK: br %r14
++ %shr = ashr <16 x i8> %val,
++ <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,
++ i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
++ %and1 = and <16 x i8> %shr, %val
++ %not = xor <16 x i8> %shr,
++ <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
++ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
++ %neg = sub <16 x i8> zeroinitializer, %val
++ %and2 = and <16 x i8> %not, %neg
++ %ret = or <16 x i8> %and1, %and2
++ ret <16 x i8> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-abs-02.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-abs-02.ll
+@@ -0,0 +1,142 @@
++; Test v8i16 absolute.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test with slt.
++define <8 x i16> @f1(<8 x i16> %val) { ++; CHECK-LABEL: f1: ++; CHECK: vlph %v24, %v24 ++; CHECK: br %r14 ++ %cmp = icmp slt <8 x i16> %val, zeroinitializer ++ %neg = sub <8 x i16> zeroinitializer, %val ++ %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val ++ ret <8 x i16> %ret ++} ++ ++; Test with sle. ++define <8 x i16> @f2(<8 x i16> %val) { ++; CHECK-LABEL: f2: ++; CHECK: vlph %v24, %v24 ++; CHECK: br %r14 ++ %cmp = icmp sle <8 x i16> %val, zeroinitializer ++ %neg = sub <8 x i16> zeroinitializer, %val ++ %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val ++ ret <8 x i16> %ret ++} ++ ++; Test with sgt. ++define <8 x i16> @f3(<8 x i16> %val) { ++; CHECK-LABEL: f3: ++; CHECK: vlph %v24, %v24 ++; CHECK: br %r14 ++ %cmp = icmp sgt <8 x i16> %val, zeroinitializer ++ %neg = sub <8 x i16> zeroinitializer, %val ++ %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg ++ ret <8 x i16> %ret ++} ++ ++; Test with sge. ++define <8 x i16> @f4(<8 x i16> %val) { ++; CHECK-LABEL: f4: ++; CHECK: vlph %v24, %v24 ++; CHECK: br %r14 ++ %cmp = icmp sge <8 x i16> %val, zeroinitializer ++ %neg = sub <8 x i16> zeroinitializer, %val ++ %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg ++ ret <8 x i16> %ret ++} ++ ++; Test that negative absolute uses VLPH too. There is no vector equivalent ++; of LOAD NEGATIVE. ++define <8 x i16> @f5(<8 x i16> %val) { ++; CHECK-LABEL: f5: ++; CHECK: vlph [[REG:%v[0-9]+]], %v24 ++; CHECK: vlch %v24, [[REG]] ++; CHECK: br %r14 ++ %cmp = icmp slt <8 x i16> %val, zeroinitializer ++ %neg = sub <8 x i16> zeroinitializer, %val ++ %abs = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val ++ %ret = sub <8 x i16> zeroinitializer, %abs ++ ret <8 x i16> %ret ++} ++ ++; Try another form of negative absolute (slt version). ++define <8 x i16> @f6(<8 x i16> %val) { ++; CHECK-LABEL: f6: ++; CHECK: vlph [[REG:%v[0-9]+]], %v24 ++; CHECK: vlch %v24, [[REG]] ++; CHECK: br %r14 ++ %cmp = icmp slt <8 x i16> %val, zeroinitializer ++ %neg = sub <8 x i16> zeroinitializer, %val ++ %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg ++ ret <8 x i16> %ret ++} ++ ++; Test with sle. ++define <8 x i16> @f7(<8 x i16> %val) { ++; CHECK-LABEL: f7: ++; CHECK: vlph [[REG:%v[0-9]+]], %v24 ++; CHECK: vlch %v24, [[REG]] ++; CHECK: br %r14 ++ %cmp = icmp sle <8 x i16> %val, zeroinitializer ++ %neg = sub <8 x i16> zeroinitializer, %val ++ %ret = select <8 x i1> %cmp, <8 x i16> %val, <8 x i16> %neg ++ ret <8 x i16> %ret ++} ++ ++; Test with sgt. ++define <8 x i16> @f8(<8 x i16> %val) { ++; CHECK-LABEL: f8: ++; CHECK: vlph [[REG:%v[0-9]+]], %v24 ++; CHECK: vlch %v24, [[REG]] ++; CHECK: br %r14 ++ %cmp = icmp sgt <8 x i16> %val, zeroinitializer ++ %neg = sub <8 x i16> zeroinitializer, %val ++ %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val ++ ret <8 x i16> %ret ++} ++ ++; Test with sge. ++define <8 x i16> @f9(<8 x i16> %val) { ++; CHECK-LABEL: f9: ++; CHECK: vlph [[REG:%v[0-9]+]], %v24 ++; CHECK: vlch %v24, [[REG]] ++; CHECK: br %r14 ++ %cmp = icmp sge <8 x i16> %val, zeroinitializer ++ %neg = sub <8 x i16> zeroinitializer, %val ++ %ret = select <8 x i1> %cmp, <8 x i16> %neg, <8 x i16> %val ++ ret <8 x i16> %ret ++} ++ ++; Test with an SRA-based boolean vector. 
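++; In this form the boolean vector is materialized as x >>s 15, which
++; yields all-ones lanes exactly where x is negative, so the pattern is
++;   abs(x) = ((x >>s 15) & -x) | (~(x >>s 15) & x)
++; and the backend is still expected to fold it into a single VLPH.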
++define <8 x i16> @f10(<8 x i16> %val) {
++; CHECK-LABEL: f10:
++; CHECK: vlph %v24, %v24
++; CHECK: br %r14
++ %shr = ashr <8 x i16> %val,
++ <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
++ %neg = sub <8 x i16> zeroinitializer, %val
++ %and1 = and <8 x i16> %shr, %neg
++ %not = xor <8 x i16> %shr,
++ <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
++ %and2 = and <8 x i16> %not, %val
++ %ret = or <8 x i16> %and1, %and2
++ ret <8 x i16> %ret
++}
++
++; ...and again in reverse
++define <8 x i16> @f11(<8 x i16> %val) {
++; CHECK-LABEL: f11:
++; CHECK: vlph [[REG:%v[0-9]+]], %v24
++; CHECK: vlch %v24, [[REG]]
++; CHECK: br %r14
++ %shr = ashr <8 x i16> %val,
++ <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
++ %and1 = and <8 x i16> %shr, %val
++ %not = xor <8 x i16> %shr,
++ <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
++ %neg = sub <8 x i16> zeroinitializer, %val
++ %and2 = and <8 x i16> %not, %neg
++ %ret = or <8 x i16> %and1, %and2
++ ret <8 x i16> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-abs-03.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-abs-03.ll
+@@ -0,0 +1,138 @@
++; Test v4i32 absolute.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test with slt.
++define <4 x i32> @f1(<4 x i32> %val) {
++; CHECK-LABEL: f1:
++; CHECK: vlpf %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp slt <4 x i32> %val, zeroinitializer
++ %neg = sub <4 x i32> zeroinitializer, %val
++ %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val
++ ret <4 x i32> %ret
++}
++
++; Test with sle.
++define <4 x i32> @f2(<4 x i32> %val) {
++; CHECK-LABEL: f2:
++; CHECK: vlpf %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp sle <4 x i32> %val, zeroinitializer
++ %neg = sub <4 x i32> zeroinitializer, %val
++ %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val
++ ret <4 x i32> %ret
++}
++
++; Test with sgt.
++define <4 x i32> @f3(<4 x i32> %val) {
++; CHECK-LABEL: f3:
++; CHECK: vlpf %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp sgt <4 x i32> %val, zeroinitializer
++ %neg = sub <4 x i32> zeroinitializer, %val
++ %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg
++ ret <4 x i32> %ret
++}
++
++; Test with sge.
++define <4 x i32> @f4(<4 x i32> %val) {
++; CHECK-LABEL: f4:
++; CHECK: vlpf %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp sge <4 x i32> %val, zeroinitializer
++ %neg = sub <4 x i32> zeroinitializer, %val
++ %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg
++ ret <4 x i32> %ret
++}
++
++; Test that negative absolute uses VLPF too. There is no vector equivalent
++; of LOAD NEGATIVE.
++define <4 x i32> @f5(<4 x i32> %val) {
++; CHECK-LABEL: f5:
++; CHECK: vlpf [[REG:%v[0-9]+]], %v24
++; CHECK: vlcf %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp slt <4 x i32> %val, zeroinitializer
++ %neg = sub <4 x i32> zeroinitializer, %val
++ %abs = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val
++ %ret = sub <4 x i32> zeroinitializer, %abs
++ ret <4 x i32> %ret
++}
++
++; Try another form of negative absolute (slt version).
++define <4 x i32> @f6(<4 x i32> %val) {
++; CHECK-LABEL: f6:
++; CHECK: vlpf [[REG:%v[0-9]+]], %v24
++; CHECK: vlcf %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp slt <4 x i32> %val, zeroinitializer
++ %neg = sub <4 x i32> zeroinitializer, %val
++ %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg
++ ret <4 x i32> %ret
++}
++
++; Test with sle.
++define <4 x i32> @f7(<4 x i32> %val) {
++; CHECK-LABEL: f7:
++; CHECK: vlpf [[REG:%v[0-9]+]], %v24
++; CHECK: vlcf %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp sle <4 x i32> %val, zeroinitializer
++ %neg = sub <4 x i32> zeroinitializer, %val
++ %ret = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %neg
++ ret <4 x i32> %ret
++}
++
++; Test with sgt.
++define <4 x i32> @f8(<4 x i32> %val) {
++; CHECK-LABEL: f8:
++; CHECK: vlpf [[REG:%v[0-9]+]], %v24
++; CHECK: vlcf %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp sgt <4 x i32> %val, zeroinitializer
++ %neg = sub <4 x i32> zeroinitializer, %val
++ %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val
++ ret <4 x i32> %ret
++}
++
++; Test with sge.
++define <4 x i32> @f9(<4 x i32> %val) {
++; CHECK-LABEL: f9:
++; CHECK: vlpf [[REG:%v[0-9]+]], %v24
++; CHECK: vlcf %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp sge <4 x i32> %val, zeroinitializer
++ %neg = sub <4 x i32> zeroinitializer, %val
++ %ret = select <4 x i1> %cmp, <4 x i32> %neg, <4 x i32> %val
++ ret <4 x i32> %ret
++}
++
++; Test with an SRA-based boolean vector.
++define <4 x i32> @f10(<4 x i32> %val) {
++; CHECK-LABEL: f10:
++; CHECK: vlpf %v24, %v24
++; CHECK: br %r14
++ %shr = ashr <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31>
++ %neg = sub <4 x i32> zeroinitializer, %val
++ %and1 = and <4 x i32> %shr, %neg
++ %not = xor <4 x i32> %shr, <i32 -1, i32 -1, i32 -1, i32 -1>
++ %and2 = and <4 x i32> %not, %val
++ %ret = or <4 x i32> %and1, %and2
++ ret <4 x i32> %ret
++}
++
++; ...and again in reverse
++define <4 x i32> @f11(<4 x i32> %val) {
++; CHECK-LABEL: f11:
++; CHECK: vlpf [[REG:%v[0-9]+]], %v24
++; CHECK: vlcf %v24, [[REG]]
++; CHECK: br %r14
++ %shr = ashr <4 x i32> %val, <i32 31, i32 31, i32 31, i32 31>
++ %and1 = and <4 x i32> %shr, %val
++ %not = xor <4 x i32> %shr, <i32 -1, i32 -1, i32 -1, i32 -1>
++ %neg = sub <4 x i32> zeroinitializer, %val
++ %and2 = and <4 x i32> %not, %neg
++ %ret = or <4 x i32> %and1, %and2
++ ret <4 x i32> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-abs-04.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-abs-04.ll
+@@ -0,0 +1,138 @@
++; Test v2i64 absolute.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test with slt.
++define <2 x i64> @f1(<2 x i64> %val) {
++; CHECK-LABEL: f1:
++; CHECK: vlpg %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp slt <2 x i64> %val, zeroinitializer
++ %neg = sub <2 x i64> zeroinitializer, %val
++ %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val
++ ret <2 x i64> %ret
++}
++
++; Test with sle.
++define <2 x i64> @f2(<2 x i64> %val) {
++; CHECK-LABEL: f2:
++; CHECK: vlpg %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp sle <2 x i64> %val, zeroinitializer
++ %neg = sub <2 x i64> zeroinitializer, %val
++ %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val
++ ret <2 x i64> %ret
++}
++
++; Test with sgt.
++define <2 x i64> @f3(<2 x i64> %val) {
++; CHECK-LABEL: f3:
++; CHECK: vlpg %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp sgt <2 x i64> %val, zeroinitializer
++ %neg = sub <2 x i64> zeroinitializer, %val
++ %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg
++ ret <2 x i64> %ret
++}
++
++; Test with sge.
++define <2 x i64> @f4(<2 x i64> %val) {
++; CHECK-LABEL: f4:
++; CHECK: vlpg %v24, %v24
++; CHECK: br %r14
++ %cmp = icmp sge <2 x i64> %val, zeroinitializer
++ %neg = sub <2 x i64> zeroinitializer, %val
++ %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg
++ ret <2 x i64> %ret
++}
++
++; Test that negative absolute uses VLPG too. There is no vector equivalent
++; of LOAD NEGATIVE.
++define <2 x i64> @f5(<2 x i64> %val) {
++; CHECK-LABEL: f5:
++; CHECK: vlpg [[REG:%v[0-9]+]], %v24
++; CHECK: vlcg %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp slt <2 x i64> %val, zeroinitializer
++ %neg = sub <2 x i64> zeroinitializer, %val
++ %abs = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val
++ %ret = sub <2 x i64> zeroinitializer, %abs
++ ret <2 x i64> %ret
++}
++
++; Try another form of negative absolute (slt version).
++define <2 x i64> @f6(<2 x i64> %val) {
++; CHECK-LABEL: f6:
++; CHECK: vlpg [[REG:%v[0-9]+]], %v24
++; CHECK: vlcg %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp slt <2 x i64> %val, zeroinitializer
++ %neg = sub <2 x i64> zeroinitializer, %val
++ %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg
++ ret <2 x i64> %ret
++}
++
++; Test with sle.
++define <2 x i64> @f7(<2 x i64> %val) {
++; CHECK-LABEL: f7:
++; CHECK: vlpg [[REG:%v[0-9]+]], %v24
++; CHECK: vlcg %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp sle <2 x i64> %val, zeroinitializer
++ %neg = sub <2 x i64> zeroinitializer, %val
++ %ret = select <2 x i1> %cmp, <2 x i64> %val, <2 x i64> %neg
++ ret <2 x i64> %ret
++}
++
++; Test with sgt.
++define <2 x i64> @f8(<2 x i64> %val) {
++; CHECK-LABEL: f8:
++; CHECK: vlpg [[REG:%v[0-9]+]], %v24
++; CHECK: vlcg %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp sgt <2 x i64> %val, zeroinitializer
++ %neg = sub <2 x i64> zeroinitializer, %val
++ %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val
++ ret <2 x i64> %ret
++}
++
++; Test with sge.
++define <2 x i64> @f9(<2 x i64> %val) {
++; CHECK-LABEL: f9:
++; CHECK: vlpg [[REG:%v[0-9]+]], %v24
++; CHECK: vlcg %v24, [[REG]]
++; CHECK: br %r14
++ %cmp = icmp sge <2 x i64> %val, zeroinitializer
++ %neg = sub <2 x i64> zeroinitializer, %val
++ %ret = select <2 x i1> %cmp, <2 x i64> %neg, <2 x i64> %val
++ ret <2 x i64> %ret
++}
++
++; Test with an SRA-based boolean vector.
++define <2 x i64> @f10(<2 x i64> %val) {
++; CHECK-LABEL: f10:
++; CHECK: vlpg %v24, %v24
++; CHECK: br %r14
++ %shr = ashr <2 x i64> %val, <i64 63, i64 63>
++ %neg = sub <2 x i64> zeroinitializer, %val
++ %and1 = and <2 x i64> %shr, %neg
++ %not = xor <2 x i64> %shr, <i64 -1, i64 -1>
++ %and2 = and <2 x i64> %not, %val
++ %ret = or <2 x i64> %and1, %and2
++ ret <2 x i64> %ret
++}
++
++; ...and again in reverse
++define <2 x i64> @f11(<2 x i64> %val) {
++; CHECK-LABEL: f11:
++; CHECK: vlpg [[REG:%v[0-9]+]], %v24
++; CHECK: vlcg %v24, [[REG]]
++; CHECK: br %r14
++ %shr = ashr <2 x i64> %val, <i64 63, i64 63>
++ %and1 = and <2 x i64> %shr, %val
++ %not = xor <2 x i64> %shr, <i64 -1, i64 -1>
++ %neg = sub <2 x i64> zeroinitializer, %val
++ %and2 = and <2 x i64> %not, %neg
++ %ret = or <2 x i64> %and1, %and2
++ ret <2 x i64> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-abs-05.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-abs-05.ll
+@@ -0,0 +1,46 @@
++; Test f64 and v2f64 absolute.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++declare double @llvm.fabs.f64(double)
++declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
++
++; Test a plain absolute.
++define <2 x double> @f1(<2 x double> %val) {
++; CHECK-LABEL: f1:
++; CHECK: vflpdb %v24, %v24
++; CHECK: br %r14
++ %ret = call <2 x double> @llvm.fabs.v2f64(<2 x double> %val)
++ ret <2 x double> %ret
++}
++
++; Test a negative absolute.
++define <2 x double> @f2(<2 x double> %val) {
++; CHECK-LABEL: f2:
++; CHECK: vflndb %v24, %v24
++; CHECK: br %r14
++ %abs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %val)
++ %ret = fsub <2 x double> <double -0.0, double -0.0>, %abs
++ ret <2 x double> %ret
++}
++
++; Test an f64 absolute that uses vector registers.
++define double @f3(<2 x double> %val) {
++; CHECK-LABEL: f3:
++; CHECK: wflpdb %f0, %v24
++; CHECK: br %r14
++ %scalar = extractelement <2 x double> %val, i32 0
++ %ret = call double @llvm.fabs.f64(double %scalar)
++ ret double %ret
++}
++
++; Test an f64 negative absolute that uses vector registers.
++define double @f4(<2 x double> %val) {
++; CHECK-LABEL: f4:
++; CHECK: wflndb %f0, %v24
++; CHECK: br %r14
++ %scalar = extractelement <2 x double> %val, i32 0
++ %abs = call double @llvm.fabs.f64(double %scalar)
++ %ret = fsub double -0.0, %abs
++ ret double %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-add-01.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-add-01.ll
+@@ -0,0 +1,60 @@
++; Test vector addition.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test a v16i8 addition.
++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f1:
++; CHECK: vab %v24, %v26, %v28
++; CHECK: br %r14
++ %ret = add <16 x i8> %val1, %val2
++ ret <16 x i8> %ret
++}
++
++; Test a v8i16 addition.
++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
++; CHECK-LABEL: f2:
++; CHECK: vah %v24, %v26, %v28
++; CHECK: br %r14
++ %ret = add <8 x i16> %val1, %val2
++ ret <8 x i16> %ret
++}
++
++; Test a v4i32 addition.
++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
++; CHECK-LABEL: f3:
++; CHECK: vaf %v24, %v26, %v28
++; CHECK: br %r14
++ %ret = add <4 x i32> %val1, %val2
++ ret <4 x i32> %ret
++}
++
++; Test a v2i64 addition.
++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
++; CHECK-LABEL: f4:
++; CHECK: vag %v24, %v26, %v28
++; CHECK: br %r14
++ %ret = add <2 x i64> %val1, %val2
++ ret <2 x i64> %ret
++}
++
++; Test a v2f64 addition.
++define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1,
++ <2 x double> %val2) {
++; CHECK-LABEL: f5:
++; CHECK: vfadb %v24, %v26, %v28
++; CHECK: br %r14
++ %ret = fadd <2 x double> %val1, %val2
++ ret <2 x double> %ret
++}
++
++; Test an f64 addition that uses vector registers.
++define double @f6(<2 x double> %val1, <2 x double> %val2) {
++; CHECK-LABEL: f6:
++; CHECK: wfadb %f0, %v24, %v26
++; CHECK: br %r14
++ %scalar1 = extractelement <2 x double> %val1, i32 0
++ %scalar2 = extractelement <2 x double> %val2, i32 0
++ %ret = fadd double %scalar1, %scalar2
++ ret double %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-and-01.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-and-01.ll
+@@ -0,0 +1,39 @@
++; Test vector AND.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test a v16i8 AND.
++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f1:
++; CHECK: vn %v24, %v26, %v28
++; CHECK: br %r14
++ %ret = and <16 x i8> %val1, %val2
++ ret <16 x i8> %ret
++}
++
++; Test a v8i16 AND.
++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
++; CHECK-LABEL: f2:
++; CHECK: vn %v24, %v26, %v28
++; CHECK: br %r14
++ %ret = and <8 x i16> %val1, %val2
++ ret <8 x i16> %ret
++}
++
++; Test a v4i32 AND.
++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
++; CHECK-LABEL: f3:
++; CHECK: vn %v24, %v26, %v28
++; CHECK: br %r14
++ %ret = and <4 x i32> %val1, %val2
++ ret <4 x i32> %ret
++}
++
++; Test a v2i64 AND.
++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
++; CHECK-LABEL: f4:
++; CHECK: vn %v24, %v26, %v28
++; CHECK: br %r14
++ %ret = and <2 x i64> %val1, %val2
++ ret <2 x i64> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-and-02.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-and-02.ll
+@@ -0,0 +1,91 @@
++; Test vector AND-NOT.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test a v16i8 AND-NOT.
++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f1:
++; CHECK: vnc %v24, %v26, %v28
++; CHECK: br %r14
++ %not = xor <16 x i8> %val2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
++ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
++ %ret = and <16 x i8> %val1, %not
++ ret <16 x i8> %ret
++}
++
++; ...and again with the reverse.
++define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f2:
++; CHECK: vnc %v24, %v28, %v26
++; CHECK: br %r14
++ %not = xor <16 x i8> %val1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
++ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
++ %ret = and <16 x i8> %not, %val2
++ ret <16 x i8> %ret
++}
++
++; Test a v8i16 AND-NOT.
++define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
++; CHECK-LABEL: f3:
++; CHECK: vnc %v24, %v26, %v28
++; CHECK: br %r14
++ %not = xor <8 x i16> %val2, <i16 -1, i16 -1, i16 -1, i16 -1,
++ i16 -1, i16 -1, i16 -1, i16 -1>
++ %ret = and <8 x i16> %val1, %not
++ ret <8 x i16> %ret
++}
++
++; ...and again with the reverse.
++define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
++; CHECK-LABEL: f4:
++; CHECK: vnc %v24, %v28, %v26
++; CHECK: br %r14
++ %not = xor <8 x i16> %val1, <i16 -1, i16 -1, i16 -1, i16 -1,
++ i16 -1, i16 -1, i16 -1, i16 -1>
++ %ret = and <8 x i16> %not, %val2
++ ret <8 x i16> %ret
++}
++
++; Test a v4i32 AND-NOT.
++define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
++; CHECK-LABEL: f5:
++; CHECK: vnc %v24, %v26, %v28
++; CHECK: br %r14
++ %not = xor <4 x i32> %val2, <i32 -1, i32 -1, i32 -1, i32 -1>
++ %ret = and <4 x i32> %val1, %not
++ ret <4 x i32> %ret
++}
++
++; ...and again with the reverse.
++define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
++; CHECK-LABEL: f6:
++; CHECK: vnc %v24, %v28, %v26
++; CHECK: br %r14
++ %not = xor <4 x i32> %val1, <i32 -1, i32 -1, i32 -1, i32 -1>
++ %ret = and <4 x i32> %not, %val2
++ ret <4 x i32> %ret
++}
++
++; Test a v2i64 AND-NOT.
++define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
++; CHECK-LABEL: f7:
++; CHECK: vnc %v24, %v26, %v28
++; CHECK: br %r14
++ %not = xor <2 x i64> %val2, <i64 -1, i64 -1>
++ %ret = and <2 x i64> %val1, %not
++ ret <2 x i64> %ret
++}
++
++; ...and again with the reverse.
++define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
++; CHECK-LABEL: f8:
++; CHECK: vnc %v24, %v28, %v26
++; CHECK: br %r14
++ %not = xor <2 x i64> %val1, <i64 -1, i64 -1>
++ %ret = and <2 x i64> %not, %val2
++ ret <2 x i64> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-and-03.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-and-03.ll
+@@ -0,0 +1,113 @@
++; Test vector zero extensions, which need to be implemented as ANDs.
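++;
++; The AND masks come either from a VREPI* splat of 1 (for the vNi1
++; cases) or from VGBM, whose 16-bit immediate selects 0xff or 0x00 per
++; byte: for example 21845 (0x5555) keeps the low byte of each halfword
++; and 13107 (0x3333) keeps the low halfword of each word.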
++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i1->v16i8 extension. ++define <16 x i8> @f1(<16 x i8> %val) { ++; CHECK-LABEL: f1: ++; CHECK: vrepib [[REG:%v[0-9]+]], 1 ++; CHECK: vn %v24, %v24, [[REG]] ++; CHECK: br %r14 ++ %trunc = trunc <16 x i8> %val to <16 x i1> ++ %ret = zext <16 x i1> %trunc to <16 x i8> ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i1->v8i16 extension. ++define <8 x i16> @f2(<8 x i16> %val) { ++; CHECK-LABEL: f2: ++; CHECK: vrepih [[REG:%v[0-9]+]], 1 ++; CHECK: vn %v24, %v24, [[REG]] ++; CHECK: br %r14 ++ %trunc = trunc <8 x i16> %val to <8 x i1> ++ %ret = zext <8 x i1> %trunc to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i8->v8i16 extension. ++define <8 x i16> @f3(<8 x i16> %val) { ++; CHECK-LABEL: f3: ++; CHECK: vgbm [[REG:%v[0-9]+]], 21845 ++; CHECK: vn %v24, %v24, [[REG]] ++; CHECK: br %r14 ++ %trunc = trunc <8 x i16> %val to <8 x i8> ++ %ret = zext <8 x i8> %trunc to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i1->v4i32 extension. ++define <4 x i32> @f4(<4 x i32> %val) { ++; CHECK-LABEL: f4: ++; CHECK: vrepif [[REG:%v[0-9]+]], 1 ++; CHECK: vn %v24, %v24, [[REG]] ++; CHECK: br %r14 ++ %trunc = trunc <4 x i32> %val to <4 x i1> ++ %ret = zext <4 x i1> %trunc to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i8->v4i32 extension. ++define <4 x i32> @f5(<4 x i32> %val) { ++; CHECK-LABEL: f5: ++; CHECK: vgbm [[REG:%v[0-9]+]], 4369 ++; CHECK: vn %v24, %v24, [[REG]] ++; CHECK: br %r14 ++ %trunc = trunc <4 x i32> %val to <4 x i8> ++ %ret = zext <4 x i8> %trunc to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i16->v4i32 extension. ++define <4 x i32> @f6(<4 x i32> %val) { ++; CHECK-LABEL: f6: ++; CHECK: vgbm [[REG:%v[0-9]+]], 13107 ++; CHECK: vn %v24, %v24, [[REG]] ++; CHECK: br %r14 ++ %trunc = trunc <4 x i32> %val to <4 x i16> ++ %ret = zext <4 x i16> %trunc to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i1->v2i64 extension. ++define <2 x i64> @f7(<2 x i64> %val) { ++; CHECK-LABEL: f7: ++; CHECK: vrepig [[REG:%v[0-9]+]], 1 ++; CHECK: vn %v24, %v24, [[REG]] ++; CHECK: br %r14 ++ %trunc = trunc <2 x i64> %val to <2 x i1> ++ %ret = zext <2 x i1> %trunc to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i8->v2i64 extension. ++define <2 x i64> @f8(<2 x i64> %val) { ++; CHECK-LABEL: f8: ++; CHECK: vgbm [[REG:%v[0-9]+]], 257 ++; CHECK: vn %v24, %v24, [[REG]] ++; CHECK: br %r14 ++ %trunc = trunc <2 x i64> %val to <2 x i8> ++ %ret = zext <2 x i8> %trunc to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i16->v2i64 extension. ++define <2 x i64> @f9(<2 x i64> %val) { ++; CHECK-LABEL: f9: ++; CHECK: vgbm [[REG:%v[0-9]+]], 771 ++; CHECK: vn %v24, %v24, [[REG]] ++; CHECK: br %r14 ++ %trunc = trunc <2 x i64> %val to <2 x i16> ++ %ret = zext <2 x i16> %trunc to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i32->v2i64 extension. ++define <2 x i64> @f10(<2 x i64> %val) { ++; CHECK-LABEL: f10: ++; CHECK: vgbm [[REG:%v[0-9]+]], 3855 ++; CHECK: vn %v24, %v24, [[REG]] ++; CHECK: br %r14 ++ %trunc = trunc <2 x i64> %val to <2 x i32> ++ %ret = zext <2 x i32> %trunc to <2 x i64> ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-args-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-args-01.ll +@@ -0,0 +1,48 @@ ++; Test the handling of named vector arguments. 
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
++
++; This routine has 6 integer arguments, which fill up r2-r6 and
++; the stack slot at offset 160, and 10 vector arguments, which
++; fill up v24-v31 and the two double-wide stack slots at 168
++; and 184.
++declare void @bar(i64, i64, i64, i64, i64, i64,
++ <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>,
++ <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>,
++ <4 x i32>, <4 x i32>)
++
++define void @foo() {
++; CHECK-VEC-LABEL: foo:
++; CHECK-VEC-DAG: vrepif %v24, 1
++; CHECK-VEC-DAG: vrepif %v26, 2
++; CHECK-VEC-DAG: vrepif %v28, 3
++; CHECK-VEC-DAG: vrepif %v30, 4
++; CHECK-VEC-DAG: vrepif %v25, 5
++; CHECK-VEC-DAG: vrepif %v27, 6
++; CHECK-VEC-DAG: vrepif %v29, 7
++; CHECK-VEC-DAG: vrepif %v31, 8
++; CHECK-VEC: brasl %r14, bar@PLT
++;
++; CHECK-STACK-LABEL: foo:
++; CHECK-STACK: aghi %r15, -200
++; CHECK-STACK-DAG: mvghi 160(%r15), 6
++; CHECK-STACK-DAG: vrepif [[REG1:%v[0-9]+]], 9
++; CHECK-STACK-DAG: vst [[REG1]], 168(%r15)
++; CHECK-STACK-DAG: vrepif [[REG2:%v[0-9]+]], 10
++; CHECK-STACK-DAG: vst [[REG2]], 184(%r15)
++; CHECK-STACK: brasl %r14, bar@PLT
++
++ call void @bar (i64 1, i64 2, i64 3, i64 4, i64 5, i64 6,
++ <4 x i32> <i32 1, i32 1, i32 1, i32 1>,
++ <4 x i32> <i32 2, i32 2, i32 2, i32 2>,
++ <4 x i32> <i32 3, i32 3, i32 3, i32 3>,
++ <4 x i32> <i32 4, i32 4, i32 4, i32 4>,
++ <4 x i32> <i32 5, i32 5, i32 5, i32 5>,
++ <4 x i32> <i32 6, i32 6, i32 6, i32 6>,
++ <4 x i32> <i32 7, i32 7, i32 7, i32 7>,
++ <4 x i32> <i32 8, i32 8, i32 8, i32 8>,
++ <4 x i32> <i32 9, i32 9, i32 9, i32 9>,
++ <4 x i32> <i32 10, i32 10, i32 10, i32 10>)
++ ret void
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-02.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-02.ll
+@@ -0,0 +1,31 @@
++; Test the handling of unnamed vector arguments.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
++
++; This routine is called with two named vector arguments (passed
++; in %v24 and %v26) and two unnamed vector arguments (passed
++; in the double-wide stack slots at 160 and 176).
++declare void @bar(<4 x i32>, <4 x i32>, ...)
++
++define void @foo() {
++; CHECK-VEC-LABEL: foo:
++; CHECK-VEC-DAG: vrepif %v24, 1
++; CHECK-VEC-DAG: vrepif %v26, 2
++; CHECK-VEC: brasl %r14, bar@PLT
++;
++; CHECK-STACK-LABEL: foo:
++; CHECK-STACK: aghi %r15, -192
++; CHECK-STACK-DAG: vrepif [[REG1:%v[0-9]+]], 3
++; CHECK-STACK-DAG: vst [[REG1]], 160(%r15)
++; CHECK-STACK-DAG: vrepif [[REG2:%v[0-9]+]], 4
++; CHECK-STACK-DAG: vst [[REG2]], 176(%r15)
++; CHECK-STACK: brasl %r14, bar@PLT
++
++ call void (<4 x i32>, <4 x i32>, ...)* @bar
++ (<4 x i32> <i32 1, i32 1, i32 1, i32 1>,
++ <4 x i32> <i32 2, i32 2, i32 2, i32 2>,
++ <4 x i32> <i32 3, i32 3, i32 3, i32 3>,
++ <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
++ ret void
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-03.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-03.ll
+@@ -0,0 +1,30 @@
++; Test the handling of incoming vector arguments.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; This routine has 10 vector arguments, which fill up %v24-%v31 and
++; the two double-wide stack slots at 160 and 176.
++define <4 x i32> @foo(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4,
++ <4 x i32> %v5, <4 x i32> %v6, <4 x i32> %v7, <4 x i32> %v8,
++ <4 x i32> %v9, <4 x i32> %v10) {
++; CHECK-LABEL: foo:
++; CHECK: vl [[REG1:%v[0-9]+]], 176(%r15)
++; CHECK: vsf %v24, %v26, [[REG1]]
++; CHECK: br %r14
++ %y = sub <4 x i32> %v2, %v10
++ ret <4 x i32> %y
++}
++
++; This routine has 10 vector arguments, which fill up %v24-%v31 and
++; the two single-wide stack slots at 160 and 168.
++define <4 x i8> @bar(<4 x i8> %v1, <4 x i8> %v2, <4 x i8> %v3, <4 x i8> %v4,
++ <4 x i8> %v5, <4 x i8> %v6, <4 x i8> %v7, <4 x i8> %v8,
++ <4 x i8> %v9, <4 x i8> %v10) {
++; CHECK-LABEL: bar:
++; CHECK: vlrepg [[REG1:%v[0-9]+]], 168(%r15)
++; CHECK: vsb %v24, %v26, [[REG1]]
++; CHECK: br %r14
++ %y = sub <4 x i8> %v2, %v10
++ ret <4 x i8> %y
++}
++
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-04.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-04.ll
+@@ -0,0 +1,50 @@
++; Test the handling of named short vector arguments.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
++
++; This routine has 12 vector arguments, which fill up %v24-%v31
++; and the four single-wide stack slots starting at 160.
++declare void @bar(<1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>,
++ <1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>,
++ <1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>)
++
++define void @foo() {
++; CHECK-VEC-LABEL: foo:
++; CHECK-VEC-DAG: vrepib %v24, 1
++; CHECK-VEC-DAG: vrepib %v26, 2
++; CHECK-VEC-DAG: vrepib %v28, 3
++; CHECK-VEC-DAG: vrepib %v30, 4
++; CHECK-VEC-DAG: vrepib %v25, 5
++; CHECK-VEC-DAG: vrepib %v27, 6
++; CHECK-VEC-DAG: vrepib %v29, 7
++; CHECK-VEC-DAG: vrepib %v31, 8
++; CHECK-VEC: brasl %r14, bar@PLT
++;
++; CHECK-STACK-LABEL: foo:
++; CHECK-STACK: aghi %r15, -192
++; CHECK-STACK-DAG: llihh [[REG1:%r[0-9]+]], 2304
++; CHECK-STACK-DAG: stg [[REG1]], 160(%r15)
++; CHECK-STACK-DAG: llihh [[REG2:%r[0-9]+]], 2570
++; CHECK-STACK-DAG: stg [[REG2]], 168(%r15)
++; CHECK-STACK-DAG: llihf [[REG3:%r[0-9]+]], 185273099
++; CHECK-STACK-DAG: stg [[REG3]], 176(%r15)
++; CHECK-STACK-DAG: llihf [[REG4:%r[0-9]+]], 202116108
++; CHECK-STACK-DAG: oilf [[REG4]], 202116108
++; CHECK-STACK-DAG: stg [[REG4]], 184(%r15)
++; CHECK-STACK: brasl %r14, bar@PLT
++
++ call void @bar (<1 x i8> <i8 1>,
++ <2 x i8> <i8 2, i8 2>,
++ <4 x i8> <i8 3, i8 3, i8 3, i8 3>,
++ <8 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>,
++ <1 x i8> <i8 5>,
++ <2 x i8> <i8 6, i8 6>,
++ <4 x i8> <i8 7, i8 7, i8 7, i8 7>,
++ <8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>,
++ <1 x i8> <i8 9>,
++ <2 x i8> <i8 10, i8 10>,
++ <4 x i8> <i8 11, i8 11, i8 11, i8 11>,
++ <8 x i8> <i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12>)
++ ret void
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-05.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-05.ll
+@@ -0,0 +1,32 @@
++; Test the handling of unnamed short vector arguments.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK
++
++; This routine is called with two named vector arguments (passed
++; in %v24 and %v26) and two unnamed vector arguments (passed
++; in the single-wide stack slots at 160 and 168).
++declare void @bar(<4 x i8>, <4 x i8>, ...)
++
++define void @foo() {
++; CHECK-VEC-LABEL: foo:
++; CHECK-VEC-DAG: vrepib %v24, 1
++; CHECK-VEC-DAG: vrepib %v26, 2
++; CHECK-VEC: brasl %r14, bar@PLT
++;
++; CHECK-STACK-LABEL: foo:
++; CHECK-STACK: aghi %r15, -176
++; CHECK-STACK-DAG: llihf [[REG1:%r[0-9]+]], 50529027
++; CHECK-STACK-DAG: stg [[REG1]], 160(%r15)
++; CHECK-STACK-DAG: llihf [[REG2:%r[0-9]+]], 67372036
++; CHECK-STACK-DAG: stg [[REG2]], 168(%r15)
++; CHECK-STACK: brasl %r14, bar@PLT
++
++  call void (<4 x i8>, <4 x i8>, ...)* @bar
++              (<4 x i8> <i8 1, i8 1, i8 1, i8 1>,
++               <4 x i8> <i8 2, i8 2, i8 2, i8 2>,
++               <4 x i8> <i8 3, i8 3, i8 3, i8 3>,
++               <4 x i8> <i8 4, i8 4, i8 4, i8 4>)
++  ret void
++}
++
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-error-01.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-error-01.ll
+@@ -0,0 +1,9 @@
++; Verify that we detect unsupported single-element vector types.
++
++; RUN: not llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 2>&1 | FileCheck %s
++
++define void @foo(<1 x i128>) {
++  ret void
++}
++
++; CHECK: LLVM ERROR: Unsupported vector argument or return type
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-error-02.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-error-02.ll
+@@ -0,0 +1,9 @@
++; Verify that we detect unsupported single-element vector types.
++
++; RUN: not llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 2>&1 | FileCheck %s
++
++define <1 x i128> @foo() {
++  ret <1 x i128> <i128 0>
++}
++
++; CHECK: LLVM ERROR: Unsupported vector argument or return type
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-error-03.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-error-03.ll
+@@ -0,0 +1,12 @@
++; Verify that we detect unsupported single-element vector types.
++
++; RUN: not llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 2>&1 | FileCheck %s
++
++declare void @bar(<1 x i128>)
++
++define void @foo() {
++  call void @bar (<1 x i128> <i128 0>)
++  ret void
++}
++
++; CHECK: LLVM ERROR: Unsupported vector argument or return type
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-error-04.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-error-04.ll
+@@ -0,0 +1,12 @@
++; Verify that we detect unsupported single-element vector types.
++
++; RUN: not llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 2>&1 | FileCheck %s
++
++declare <1 x i128> @bar()
++
++define void @foo() {
++  %res = call <1 x i128> @bar ()
++  ret void
++}
++
++; CHECK: LLVM ERROR: Unsupported vector argument or return type
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-error-05.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-error-05.ll
+@@ -0,0 +1,9 @@
++; Verify that we detect unsupported single-element vector types.
++
++; RUN: not llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 2>&1 | FileCheck %s
++
++define void @foo(<1 x fp128>) {
++  ret void
++}
++
++; CHECK: LLVM ERROR: Unsupported vector argument or return type
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-error-06.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-error-06.ll
+@@ -0,0 +1,9 @@
++; Verify that we detect unsupported single-element vector types.
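++; (<1 x i128> and <1 x fp128> would need a full 128-bit vector register
++; but have no element type the z13 vector facility defines, so argument
++; and return lowering presumably reports this fatal error instead of
++; silently miscompiling; the same check appears to guard all eight of
++; these error tests.)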
++
++; RUN: not llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 2>&1 | FileCheck %s
++
++define <1 x fp128> @foo() {
++  ret <1 x fp128> <fp128 0xL00000000000000000000000000000000>
++}
++
++; CHECK: LLVM ERROR: Unsupported vector argument or return type
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-error-07.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-error-07.ll
+@@ -0,0 +1,12 @@
++; Verify that we detect unsupported single-element vector types.
++
++; RUN: not llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 2>&1 | FileCheck %s
++
++declare void @bar(<1 x fp128>)
++
++define void @foo() {
++  call void @bar (<1 x fp128> <fp128 0xL00000000000000000000000000000000>)
++  ret void
++}
++
++; CHECK: LLVM ERROR: Unsupported vector argument or return type
+Index: llvm-36/test/CodeGen/SystemZ/vec-args-error-08.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-args-error-08.ll
+@@ -0,0 +1,12 @@
++; Verify that we detect unsupported single-element vector types.
++
++; RUN: not llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 2>&1 | FileCheck %s
++
++declare <1 x fp128> @bar()
++
++define void @foo() {
++  %res = call <1 x fp128> @bar ()
++  ret void
++}
++
++; CHECK: LLVM ERROR: Unsupported vector argument or return type
+Index: llvm-36/test/CodeGen/SystemZ/vec-cmp-01.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-cmp-01.ll
+@@ -0,0 +1,228 @@
++; Test v16i8 comparisons.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test eq.
++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f1:
++; CHECK: vceqb %v24, %v26, %v28
++; CHECK-NEXT: br %r14
++  %cmp = icmp eq <16 x i8> %val1, %val2
++  %ret = sext <16 x i1> %cmp to <16 x i8>
++  ret <16 x i8> %ret
++}
++
++; Test ne.
++define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f2:
++; CHECK: vceqb [[REG:%v[0-9]+]], %v26, %v28
++; CHECK-NEXT: vno %v24, [[REG]], [[REG]]
++; CHECK-NEXT: br %r14
++  %cmp = icmp ne <16 x i8> %val1, %val2
++  %ret = sext <16 x i1> %cmp to <16 x i8>
++  ret <16 x i8> %ret
++}
++
++; Test sgt.
++define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f3:
++; CHECK: vchb %v24, %v26, %v28
++; CHECK-NEXT: br %r14
++  %cmp = icmp sgt <16 x i8> %val1, %val2
++  %ret = sext <16 x i1> %cmp to <16 x i8>
++  ret <16 x i8> %ret
++}
++
++; Test sge.
++define <16 x i8> @f4(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f4:
++; CHECK: vchb [[REG:%v[0-9]+]], %v28, %v26
++; CHECK-NEXT: vno %v24, [[REG]], [[REG]]
++; CHECK-NEXT: br %r14
++  %cmp = icmp sge <16 x i8> %val1, %val2
++  %ret = sext <16 x i1> %cmp to <16 x i8>
++  ret <16 x i8> %ret
++}
++
++; Test sle.
++define <16 x i8> @f5(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f5:
++; CHECK: vchb [[REG:%v[0-9]+]], %v26, %v28
++; CHECK-NEXT: vno %v24, [[REG]], [[REG]]
++; CHECK-NEXT: br %r14
++  %cmp = icmp sle <16 x i8> %val1, %val2
++  %ret = sext <16 x i1> %cmp to <16 x i8>
++  ret <16 x i8> %ret
++}
++
++; Test slt.
++define <16 x i8> @f6(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f6:
++; CHECK: vchb %v24, %v28, %v26
++; CHECK-NEXT: br %r14
++  %cmp = icmp slt <16 x i8> %val1, %val2
++  %ret = sext <16 x i1> %cmp to <16 x i8>
++  ret <16 x i8> %ret
++}
++
++; Test ugt.
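++; As with the signed tests above, but using the logical (unsigned)
++; compare vchlb; the operand swapping for the different orderings is
++; unchanged.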
++define <16 x i8> @f7(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vchlb %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ugt <16 x i8> %val1, %val2 ++ %ret = sext <16 x i1> %cmp to <16 x i8> ++ ret <16 x i8> %ret ++} ++ ++; Test uge. ++define <16 x i8> @f8(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vchlb [[REG:%v[0-9]+]], %v28, %v26 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp uge <16 x i8> %val1, %val2 ++ %ret = sext <16 x i1> %cmp to <16 x i8> ++ ret <16 x i8> %ret ++} ++ ++; Test ule. ++define <16 x i8> @f9(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f9: ++; CHECK: vchlb [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ule <16 x i8> %val1, %val2 ++ %ret = sext <16 x i1> %cmp to <16 x i8> ++ ret <16 x i8> %ret ++} ++ ++; Test ult. ++define <16 x i8> @f10(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f10: ++; CHECK: vchlb %v24, %v28, %v26 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ult <16 x i8> %val1, %val2 ++ %ret = sext <16 x i1> %cmp to <16 x i8> ++ ret <16 x i8> %ret ++} ++ ++; Test eq selects. ++define <16 x i8> @f11(<16 x i8> %val1, <16 x i8> %val2, ++ <16 x i8> %val3, <16 x i8> %val4) { ++; CHECK-LABEL: f11: ++; CHECK: vceqb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp eq <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 ++ ret <16 x i8> %ret ++} ++ ++; Test ne selects. ++define <16 x i8> @f12(<16 x i8> %val1, <16 x i8> %val2, ++ <16 x i8> %val3, <16 x i8> %val4) { ++; CHECK-LABEL: f12: ++; CHECK: vceqb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ne <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 ++ ret <16 x i8> %ret ++} ++ ++; Test sgt selects. ++define <16 x i8> @f13(<16 x i8> %val1, <16 x i8> %val2, ++ <16 x i8> %val3, <16 x i8> %val4) { ++; CHECK-LABEL: f13: ++; CHECK: vchb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sgt <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 ++ ret <16 x i8> %ret ++} ++ ++; Test sge selects. ++define <16 x i8> @f14(<16 x i8> %val1, <16 x i8> %val2, ++ <16 x i8> %val3, <16 x i8> %val4) { ++; CHECK-LABEL: f14: ++; CHECK: vchb [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sge <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 ++ ret <16 x i8> %ret ++} ++ ++; Test sle selects. ++define <16 x i8> @f15(<16 x i8> %val1, <16 x i8> %val2, ++ <16 x i8> %val3, <16 x i8> %val4) { ++; CHECK-LABEL: f15: ++; CHECK: vchb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sle <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 ++ ret <16 x i8> %ret ++} ++ ++; Test slt selects. 
++define <16 x i8> @f16(<16 x i8> %val1, <16 x i8> %val2, ++ <16 x i8> %val3, <16 x i8> %val4) { ++; CHECK-LABEL: f16: ++; CHECK: vchb [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp slt <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 ++ ret <16 x i8> %ret ++} ++ ++; Test ugt selects. ++define <16 x i8> @f17(<16 x i8> %val1, <16 x i8> %val2, ++ <16 x i8> %val3, <16 x i8> %val4) { ++; CHECK-LABEL: f17: ++; CHECK: vchlb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ugt <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 ++ ret <16 x i8> %ret ++} ++ ++; Test uge selects. ++define <16 x i8> @f18(<16 x i8> %val1, <16 x i8> %val2, ++ <16 x i8> %val3, <16 x i8> %val4) { ++; CHECK-LABEL: f18: ++; CHECK: vchlb [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp uge <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 ++ ret <16 x i8> %ret ++} ++ ++; Test ule selects. ++define <16 x i8> @f19(<16 x i8> %val1, <16 x i8> %val2, ++ <16 x i8> %val3, <16 x i8> %val4) { ++; CHECK-LABEL: f19: ++; CHECK: vchlb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ule <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 ++ ret <16 x i8> %ret ++} ++ ++; Test ult selects. ++define <16 x i8> @f20(<16 x i8> %val1, <16 x i8> %val2, ++ <16 x i8> %val3, <16 x i8> %val4) { ++; CHECK-LABEL: f20: ++; CHECK: vchlb [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ult <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4 ++ ret <16 x i8> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-cmp-02.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-cmp-02.ll +@@ -0,0 +1,228 @@ ++; Test v8i16 comparisons. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test eq. ++define <8 x i16> @f1(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vceqh %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp eq <8 x i16> %val1, %val2 ++ %ret = sext <8 x i1> %cmp to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test ne. ++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vceqh [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ne <8 x i16> %val1, %val2 ++ %ret = sext <8 x i1> %cmp to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test sgt. ++define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vchh %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sgt <8 x i16> %val1, %val2 ++ %ret = sext <8 x i1> %cmp to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test sge. ++define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vchh [[REG:%v[0-9]+]], %v28, %v26 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sge <8 x i16> %val1, %val2 ++ %ret = sext <8 x i1> %cmp to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test sle. 
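++; There is no integer compare-high-or-equal instruction, so sle is
++; checked as the vno complement of sgt with the original operand order,
++; as in the v16i8 version above.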
++define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vchh [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sle <8 x i16> %val1, %val2 ++ %ret = sext <8 x i1> %cmp to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test slt. ++define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vchh %v24, %v28, %v26 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp slt <8 x i16> %val1, %val2 ++ %ret = sext <8 x i1> %cmp to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test ugt. ++define <8 x i16> @f7(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vchlh %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ugt <8 x i16> %val1, %val2 ++ %ret = sext <8 x i1> %cmp to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test uge. ++define <8 x i16> @f8(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vchlh [[REG:%v[0-9]+]], %v28, %v26 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp uge <8 x i16> %val1, %val2 ++ %ret = sext <8 x i1> %cmp to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test ule. ++define <8 x i16> @f9(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f9: ++; CHECK: vchlh [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ule <8 x i16> %val1, %val2 ++ %ret = sext <8 x i1> %cmp to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test ult. ++define <8 x i16> @f10(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f10: ++; CHECK: vchlh %v24, %v28, %v26 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ult <8 x i16> %val1, %val2 ++ %ret = sext <8 x i1> %cmp to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test eq selects. ++define <8 x i16> @f11(<8 x i16> %val1, <8 x i16> %val2, ++ <8 x i16> %val3, <8 x i16> %val4) { ++; CHECK-LABEL: f11: ++; CHECK: vceqh [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp eq <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 ++ ret <8 x i16> %ret ++} ++ ++; Test ne selects. ++define <8 x i16> @f12(<8 x i16> %val1, <8 x i16> %val2, ++ <8 x i16> %val3, <8 x i16> %val4) { ++; CHECK-LABEL: f12: ++; CHECK: vceqh [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ne <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 ++ ret <8 x i16> %ret ++} ++ ++; Test sgt selects. ++define <8 x i16> @f13(<8 x i16> %val1, <8 x i16> %val2, ++ <8 x i16> %val3, <8 x i16> %val4) { ++; CHECK-LABEL: f13: ++; CHECK: vchh [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sgt <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 ++ ret <8 x i16> %ret ++} ++ ++; Test sge selects. ++define <8 x i16> @f14(<8 x i16> %val1, <8 x i16> %val2, ++ <8 x i16> %val3, <8 x i16> %val4) { ++; CHECK-LABEL: f14: ++; CHECK: vchh [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sge <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 ++ ret <8 x i16> %ret ++} ++ ++; Test sle selects. 
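++; For predicates with no direct compare, the select form avoids the vno:
++; the opposite predicate is tested and the two data operands of vsel are
++; swapped instead, as in f12 and f14 above.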
++define <8 x i16> @f15(<8 x i16> %val1, <8 x i16> %val2, ++ <8 x i16> %val3, <8 x i16> %val4) { ++; CHECK-LABEL: f15: ++; CHECK: vchh [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sle <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 ++ ret <8 x i16> %ret ++} ++ ++; Test slt selects. ++define <8 x i16> @f16(<8 x i16> %val1, <8 x i16> %val2, ++ <8 x i16> %val3, <8 x i16> %val4) { ++; CHECK-LABEL: f16: ++; CHECK: vchh [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp slt <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 ++ ret <8 x i16> %ret ++} ++ ++; Test ugt selects. ++define <8 x i16> @f17(<8 x i16> %val1, <8 x i16> %val2, ++ <8 x i16> %val3, <8 x i16> %val4) { ++; CHECK-LABEL: f17: ++; CHECK: vchlh [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ugt <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 ++ ret <8 x i16> %ret ++} ++ ++; Test uge selects. ++define <8 x i16> @f18(<8 x i16> %val1, <8 x i16> %val2, ++ <8 x i16> %val3, <8 x i16> %val4) { ++; CHECK-LABEL: f18: ++; CHECK: vchlh [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp uge <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 ++ ret <8 x i16> %ret ++} ++ ++; Test ule selects. ++define <8 x i16> @f19(<8 x i16> %val1, <8 x i16> %val2, ++ <8 x i16> %val3, <8 x i16> %val4) { ++; CHECK-LABEL: f19: ++; CHECK: vchlh [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ule <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 ++ ret <8 x i16> %ret ++} ++ ++; Test ult selects. ++define <8 x i16> @f20(<8 x i16> %val1, <8 x i16> %val2, ++ <8 x i16> %val3, <8 x i16> %val4) { ++; CHECK-LABEL: f20: ++; CHECK: vchlh [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ult <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4 ++ ret <8 x i16> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-cmp-03.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-cmp-03.ll +@@ -0,0 +1,228 @@ ++; Test v4i32 comparisons. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test eq. ++define <4 x i32> @f1(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vceqf %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp eq <4 x i32> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test ne. ++define <4 x i32> @f2(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vceqf [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ne <4 x i32> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test sgt. ++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vchf %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sgt <4 x i32> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test sge. 
++define <4 x i32> @f4(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vchf [[REG:%v[0-9]+]], %v28, %v26 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sge <4 x i32> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test sle. ++define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vchf [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sle <4 x i32> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test slt. ++define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vchf %v24, %v28, %v26 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp slt <4 x i32> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test ugt. ++define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vchlf %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ugt <4 x i32> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test uge. ++define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vchlf [[REG:%v[0-9]+]], %v28, %v26 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp uge <4 x i32> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test ule. ++define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f9: ++; CHECK: vchlf [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ule <4 x i32> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test ult. ++define <4 x i32> @f10(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f10: ++; CHECK: vchlf %v24, %v28, %v26 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ult <4 x i32> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test eq selects. ++define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> %val3, <4 x i32> %val4) { ++; CHECK-LABEL: f11: ++; CHECK: vceqf [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp eq <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 ++ ret <4 x i32> %ret ++} ++ ++; Test ne selects. ++define <4 x i32> @f12(<4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> %val3, <4 x i32> %val4) { ++; CHECK-LABEL: f12: ++; CHECK: vceqf [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ne <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 ++ ret <4 x i32> %ret ++} ++ ++; Test sgt selects. ++define <4 x i32> @f13(<4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> %val3, <4 x i32> %val4) { ++; CHECK-LABEL: f13: ++; CHECK: vchf [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sgt <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 ++ ret <4 x i32> %ret ++} ++ ++; Test sge selects. 
++define <4 x i32> @f14(<4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> %val3, <4 x i32> %val4) { ++; CHECK-LABEL: f14: ++; CHECK: vchf [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sge <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 ++ ret <4 x i32> %ret ++} ++ ++; Test sle selects. ++define <4 x i32> @f15(<4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> %val3, <4 x i32> %val4) { ++; CHECK-LABEL: f15: ++; CHECK: vchf [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sle <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 ++ ret <4 x i32> %ret ++} ++ ++; Test slt selects. ++define <4 x i32> @f16(<4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> %val3, <4 x i32> %val4) { ++; CHECK-LABEL: f16: ++; CHECK: vchf [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp slt <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 ++ ret <4 x i32> %ret ++} ++ ++; Test ugt selects. ++define <4 x i32> @f17(<4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> %val3, <4 x i32> %val4) { ++; CHECK-LABEL: f17: ++; CHECK: vchlf [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ugt <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 ++ ret <4 x i32> %ret ++} ++ ++; Test uge selects. ++define <4 x i32> @f18(<4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> %val3, <4 x i32> %val4) { ++; CHECK-LABEL: f18: ++; CHECK: vchlf [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp uge <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 ++ ret <4 x i32> %ret ++} ++ ++; Test ule selects. ++define <4 x i32> @f19(<4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> %val3, <4 x i32> %val4) { ++; CHECK-LABEL: f19: ++; CHECK: vchlf [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ule <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 ++ ret <4 x i32> %ret ++} ++ ++; Test ult selects. ++define <4 x i32> @f20(<4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> %val3, <4 x i32> %val4) { ++; CHECK-LABEL: f20: ++; CHECK: vchlf [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ult <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4 ++ ret <4 x i32> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-cmp-04.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-cmp-04.ll +@@ -0,0 +1,228 @@ ++; Test v2i64 comparisons. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test eq. ++define <2 x i64> @f1(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vceqg %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp eq <2 x i64> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test ne. 
++define <2 x i64> @f2(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vceqg [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ne <2 x i64> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test sgt. ++define <2 x i64> @f3(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vchg %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sgt <2 x i64> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test sge. ++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vchg [[REG:%v[0-9]+]], %v28, %v26 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sge <2 x i64> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test sle. ++define <2 x i64> @f5(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vchg [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sle <2 x i64> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test slt. ++define <2 x i64> @f6(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vchg %v24, %v28, %v26 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp slt <2 x i64> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test ugt. ++define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vchlg %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ugt <2 x i64> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test uge. ++define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vchlg [[REG:%v[0-9]+]], %v28, %v26 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp uge <2 x i64> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test ule. ++define <2 x i64> @f9(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f9: ++; CHECK: vchlg [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ule <2 x i64> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test ult. ++define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f10: ++; CHECK: vchlg %v24, %v28, %v26 ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ult <2 x i64> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test eq selects. ++define <2 x i64> @f11(<2 x i64> %val1, <2 x i64> %val2, ++ <2 x i64> %val3, <2 x i64> %val4) { ++; CHECK-LABEL: f11: ++; CHECK: vceqg [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp eq <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 ++ ret <2 x i64> %ret ++} ++ ++; Test ne selects. 
++define <2 x i64> @f12(<2 x i64> %val1, <2 x i64> %val2, ++ <2 x i64> %val3, <2 x i64> %val4) { ++; CHECK-LABEL: f12: ++; CHECK: vceqg [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ne <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 ++ ret <2 x i64> %ret ++} ++ ++; Test sgt selects. ++define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2, ++ <2 x i64> %val3, <2 x i64> %val4) { ++; CHECK-LABEL: f13: ++; CHECK: vchg [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sgt <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 ++ ret <2 x i64> %ret ++} ++ ++; Test sge selects. ++define <2 x i64> @f14(<2 x i64> %val1, <2 x i64> %val2, ++ <2 x i64> %val3, <2 x i64> %val4) { ++; CHECK-LABEL: f14: ++; CHECK: vchg [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sge <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 ++ ret <2 x i64> %ret ++} ++ ++; Test sle selects. ++define <2 x i64> @f15(<2 x i64> %val1, <2 x i64> %val2, ++ <2 x i64> %val3, <2 x i64> %val4) { ++; CHECK-LABEL: f15: ++; CHECK: vchg [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp sle <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 ++ ret <2 x i64> %ret ++} ++ ++; Test slt selects. ++define <2 x i64> @f16(<2 x i64> %val1, <2 x i64> %val2, ++ <2 x i64> %val3, <2 x i64> %val4) { ++; CHECK-LABEL: f16: ++; CHECK: vchg [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp slt <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 ++ ret <2 x i64> %ret ++} ++ ++; Test ugt selects. ++define <2 x i64> @f17(<2 x i64> %val1, <2 x i64> %val2, ++ <2 x i64> %val3, <2 x i64> %val4) { ++; CHECK-LABEL: f17: ++; CHECK: vchlg [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ugt <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 ++ ret <2 x i64> %ret ++} ++ ++; Test uge selects. ++define <2 x i64> @f18(<2 x i64> %val1, <2 x i64> %val2, ++ <2 x i64> %val3, <2 x i64> %val4) { ++; CHECK-LABEL: f18: ++; CHECK: vchlg [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp uge <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 ++ ret <2 x i64> %ret ++} ++ ++; Test ule selects. ++define <2 x i64> @f19(<2 x i64> %val1, <2 x i64> %val2, ++ <2 x i64> %val3, <2 x i64> %val4) { ++; CHECK-LABEL: f19: ++; CHECK: vchlg [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ule <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 ++ ret <2 x i64> %ret ++} ++ ++; Test ult selects. 
++define <2 x i64> @f20(<2 x i64> %val1, <2 x i64> %val2, ++ <2 x i64> %val3, <2 x i64> %val4) { ++; CHECK-LABEL: f20: ++; CHECK: vchlg [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = icmp ult <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4 ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-cmp-05.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-cmp-05.ll +@@ -0,0 +1,472 @@ ++; Test v4f32 comparisons. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test oeq. ++define <4 x i32> @f1(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f1: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfcedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] ++; CHECK-DAG: vfcedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] ++; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp oeq <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test one. ++define <4 x i32> @f2(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f2: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchdb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] ++; CHECK-DAG: vfchdb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]] ++; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] ++; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]] ++; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]] ++; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]] ++; CHECK: vo %v24, [[RES1]], [[RES0]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp one <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test ogt. ++define <4 x i32> @f3(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f3: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] ++; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] ++; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ogt <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test oge. 
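++; (z13 has vector floating-point compares only for f64, so each v4f32
++; compare here splits the high and low float pairs with vmrhf/vmrlf,
++; widens them to v2f64 with vldeb, compares the halves, and packs the
++; two results back together with vpkg, as the DAG checks show.)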
++define <4 x i32> @f4(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f4: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] ++; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] ++; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp oge <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test ole. ++define <4 x i32> @f5(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f5: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] ++; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]] ++; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ole <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test olt. ++define <4 x i32> @f6(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f6: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] ++; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]] ++; CHECK: vpkg %v24, [[HIGHRES]], [[LOWRES]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp olt <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test ueq. 
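++; ueq is checked as the vno complement of one, that is NOT (ogt OR olt),
++; reusing the same pair of vfchdb/vpkg sequences as in f2.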
++define <4 x i32> @f7(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f7: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchdb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] ++; CHECK-DAG: vfchdb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]] ++; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] ++; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]] ++; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]] ++; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]] ++; CHECK: vno %v24, [[RES1]], [[RES0]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ueq <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test une. ++define <4 x i32> @f8(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f8: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfcedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] ++; CHECK-DAG: vfcedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] ++; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]] ++; CHECK-NEXT: vno %v24, [[RES]], [[RES]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp une <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test ugt. ++define <4 x i32> @f9(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f9: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] ++; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]] ++; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]] ++; CHECK-NEXT: vno %v24, [[RES]], [[RES]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ugt <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test uge. 
++define <4 x i32> @f10(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f10: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] ++; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW1D]], [[LOW0D]] ++; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]] ++; CHECK-NEXT: vno %v24, [[RES]], [[RES]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp uge <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test ule. ++define <4 x i32> @f11(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f11: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchdb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] ++; CHECK-DAG: vfchdb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] ++; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]] ++; CHECK-NEXT: vno %v24, [[RES]], [[RES]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ule <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test ult. ++define <4 x i32> @f12(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f12: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchedb [[HIGHRES:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] ++; CHECK-DAG: vfchedb [[LOWRES:%v[0-9]+]], [[LOW0D]], [[LOW1D]] ++; CHECK: vpkg [[RES:%v[0-9]+]], [[HIGHRES]], [[LOWRES]] ++; CHECK-NEXT: vno %v24, [[RES]], [[RES]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ult <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test ord. 
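++; ord is checked as (oge OR olt): a vfchedb pack plus a reversed vfchdb
++; pack combined with vo, and uno below is the vno complement of the same
++; pair.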
++define <4 x i32> @f13(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f13: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchedb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] ++; CHECK-DAG: vfchedb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]] ++; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] ++; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]] ++; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]] ++; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]] ++; CHECK: vo %v24, [[RES1]], [[RES0]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ord <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test uno. ++define <4 x i32> @f14(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f14: ++; CHECK-DAG: vmrhf [[HIGH0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrlf [[LOW0E:%v[0-9]+]], %v24, %v24 ++; CHECK-DAG: vmrhf [[HIGH1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vmrlf [[LOW1E:%v[0-9]+]], %v26, %v26 ++; CHECK-DAG: vldeb [[HIGH0D:%v[0-9]+]], [[HIGH0E]] ++; CHECK-DAG: vldeb [[HIGH1D:%v[0-9]+]], [[HIGH1E]] ++; CHECK-DAG: vldeb [[LOW0D:%v[0-9]+]], [[LOW0E]] ++; CHECK-DAG: vldeb [[LOW1D:%v[0-9]+]], [[LOW1E]] ++; CHECK-DAG: vfchedb [[HIGHRES0:%v[0-9]+]], [[HIGH0D]], [[HIGH1D]] ++; CHECK-DAG: vfchedb [[LOWRES0:%v[0-9]+]], [[LOW0D]], [[LOW1D]] ++; CHECK-DAG: vfchdb [[HIGHRES1:%v[0-9]+]], [[HIGH1D]], [[HIGH0D]] ++; CHECK-DAG: vfchdb [[LOWRES1:%v[0-9]+]], [[LOW1D]], [[LOW0D]] ++; CHECK-DAG: vpkg [[RES0:%v[0-9]+]], [[HIGHRES0]], [[LOWRES0]] ++; CHECK-DAG: vpkg [[RES1:%v[0-9]+]], [[HIGHRES1]], [[LOWRES1]] ++; CHECK: vno %v24, [[RES1]], [[RES0]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp uno <4 x float> %val1, %val2 ++ %ret = sext <4 x i1> %cmp to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test oeq selects. ++define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f15: ++; CHECK: vpkg [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp oeq <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test one selects. ++define <4 x float> @f16(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f16: ++; CHECK: vo [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp one <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test ogt selects. ++define <4 x float> @f17(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f17: ++; CHECK: vpkg [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ogt <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test oge selects. 
++define <4 x float> @f18(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f18: ++; CHECK: vpkg [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp oge <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test ole selects. ++define <4 x float> @f19(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f19: ++; CHECK: vpkg [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ole <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test olt selects. ++define <4 x float> @f20(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f20: ++; CHECK: vpkg [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp olt <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test ueq selects. ++define <4 x float> @f21(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f21: ++; CHECK: vo [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ueq <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test une selects. ++define <4 x float> @f22(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f22: ++; CHECK: vpkg [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp une <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test ugt selects. ++define <4 x float> @f23(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f23: ++; CHECK: vpkg [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ugt <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test uge selects. ++define <4 x float> @f24(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f24: ++; CHECK: vpkg [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp uge <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test ule selects. ++define <4 x float> @f25(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f25: ++; CHECK: vpkg [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ule <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test ult selects. 
++define <4 x float> @f26(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f26: ++; CHECK: vpkg [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ult <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test ord selects. ++define <4 x float> @f27(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f27: ++; CHECK: vo [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ord <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} ++ ++; Test uno selects. ++define <4 x float> @f28(<4 x float> %val1, <4 x float> %val2, ++ <4 x float> %val3, <4 x float> %val4) { ++; CHECK-LABEL: f28: ++; CHECK: vo [[REG:%v[0-9]+]], ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp uno <4 x float> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4 ++ ret <4 x float> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-cmp-06.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-cmp-06.ll +@@ -0,0 +1,349 @@ ++; Test f64 and v2f64 comparisons. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test oeq. ++define <2 x i64> @f1(<2 x i64> %dummy, <2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vfcedb %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp oeq <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test one. ++define <2 x i64> @f2(<2 x i64> %dummy, <2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f2: ++; CHECK-DAG: vfchdb [[REG1:%v[0-9]+]], %v28, %v26 ++; CHECK-DAG: vfchdb [[REG2:%v[0-9]+]], %v26, %v28 ++; CHECK: vo %v24, [[REG1]], [[REG2]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp one <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test ogt. ++define <2 x i64> @f3(<2 x i64> %dummy, <2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vfchdb %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ogt <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test oge. ++define <2 x i64> @f4(<2 x i64> %dummy, <2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vfchedb %v24, %v26, %v28 ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp oge <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test ole. ++define <2 x i64> @f5(<2 x i64> %dummy, <2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vfchedb %v24, %v28, %v26 ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ole <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test olt. ++define <2 x i64> @f6(<2 x i64> %dummy, <2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vfchdb %v24, %v28, %v26 ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp olt <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test ueq. 
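++; With f64 elements no widening is needed, so ueq here is simply the vno
++; complement of the two direct vfchdb compares, mirroring the unpacked
++; v4f32 sequence in vec-cmp-05.ll.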
++define <2 x i64> @f7(<2 x i64> %dummy, <2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f7: ++; CHECK-DAG: vfchdb [[REG1:%v[0-9]+]], %v28, %v26 ++; CHECK-DAG: vfchdb [[REG2:%v[0-9]+]], %v26, %v28 ++; CHECK: vno %v24, [[REG1]], [[REG2]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ueq <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test une. ++define <2 x i64> @f8(<2 x i64> %dummy, <2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vfcedb [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp une <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test ugt. ++define <2 x i64> @f9(<2 x i64> %dummy, <2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f9: ++; CHECK: vfchedb [[REG:%v[0-9]+]], %v28, %v26 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ugt <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test uge. ++define <2 x i64> @f10(<2 x i64> %dummy, <2 x double> %val1, ++ <2 x double> %val2) { ++; CHECK-LABEL: f10: ++; CHECK: vfchdb [[REG:%v[0-9]+]], %v28, %v26 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp uge <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test ule. ++define <2 x i64> @f11(<2 x i64> %dummy, <2 x double> %val1, ++ <2 x double> %val2) { ++; CHECK-LABEL: f11: ++; CHECK: vfchdb [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ule <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test ult. ++define <2 x i64> @f12(<2 x i64> %dummy, <2 x double> %val1, ++ <2 x double> %val2) { ++; CHECK-LABEL: f12: ++; CHECK: vfchedb [[REG:%v[0-9]+]], %v26, %v28 ++; CHECK-NEXT: vno %v24, [[REG]], [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ult <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test ord. ++define <2 x i64> @f13(<2 x i64> %dummy, <2 x double> %val1, ++ <2 x double> %val2) { ++; CHECK-LABEL: f13: ++; CHECK-DAG: vfchdb [[REG1:%v[0-9]+]], %v28, %v26 ++; CHECK-DAG: vfchedb [[REG2:%v[0-9]+]], %v26, %v28 ++; CHECK: vo %v24, [[REG1]], [[REG2]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ord <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test uno. ++define <2 x i64> @f14(<2 x i64> %dummy, <2 x double> %val1, ++ <2 x double> %val2) { ++; CHECK-LABEL: f14: ++; CHECK-DAG: vfchdb [[REG1:%v[0-9]+]], %v28, %v26 ++; CHECK-DAG: vfchedb [[REG2:%v[0-9]+]], %v26, %v28 ++; CHECK: vno %v24, [[REG1]], [[REG2]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp uno <2 x double> %val1, %val2 ++ %ret = sext <2 x i1> %cmp to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test oeq selects. ++define <2 x double> @f15(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f15: ++; CHECK: vfcedb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp oeq <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test one selects. 
++define <2 x double> @f16(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f16: ++; CHECK-DAG: vfchdb [[REG1:%v[0-9]+]], %v26, %v24 ++; CHECK-DAG: vfchdb [[REG2:%v[0-9]+]], %v24, %v26 ++; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp one <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test ogt selects. ++define <2 x double> @f17(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f17: ++; CHECK: vfchdb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ogt <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test oge selects. ++define <2 x double> @f18(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f18: ++; CHECK: vfchedb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp oge <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test ole selects. ++define <2 x double> @f19(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f19: ++; CHECK: vfchedb [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ole <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test olt selects. ++define <2 x double> @f20(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f20: ++; CHECK: vfchdb [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp olt <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test ueq selects. ++define <2 x double> @f21(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f21: ++; CHECK-DAG: vfchdb [[REG1:%v[0-9]+]], %v26, %v24 ++; CHECK-DAG: vfchdb [[REG2:%v[0-9]+]], %v24, %v26 ++; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ueq <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test une selects. ++define <2 x double> @f22(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f22: ++; CHECK: vfcedb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp une <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test ugt selects. 
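A note before the remaining select tests: an unordered condition never needs an explicit complement when feeding a select, because the backend can emit the inverse ordered compare and swap the VSEL operands instead; f22 above checks VFCEDB followed by a VSEL whose operands are reversed. A hedged sketch of the equivalence, not part of the patch (function names mine):

    ; These two functions compute the same result.
    define <2 x double> @une_select(<2 x double> %a, <2 x double> %b,
                                    <2 x double> %x, <2 x double> %y) {
      %cmp = fcmp une <2 x double> %a, %b
      %ret = select <2 x i1> %cmp, <2 x double> %x, <2 x double> %y
      ret <2 x double> %ret
    }
    define <2 x double> @oeq_select_swapped(<2 x double> %a, <2 x double> %b,
                                            <2 x double> %x, <2 x double> %y) {
      %cmp = fcmp oeq <2 x double> %a, %b
      %ret = select <2 x i1> %cmp, <2 x double> %y, <2 x double> %x
      ret <2 x double> %ret
    }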
++define <2 x double> @f23(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f23: ++; CHECK: vfchedb [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ugt <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test uge selects. ++define <2 x double> @f24(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f24: ++; CHECK: vfchdb [[REG:%v[0-9]+]], %v26, %v24 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp uge <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test ule selects. ++define <2 x double> @f25(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f25: ++; CHECK: vfchdb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ule <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test ult selects. ++define <2 x double> @f26(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f26: ++; CHECK: vfchedb [[REG:%v[0-9]+]], %v24, %v26 ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ult <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test ord selects. ++define <2 x double> @f27(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f27: ++; CHECK-DAG: vfchdb [[REG1:%v[0-9]+]], %v26, %v24 ++; CHECK-DAG: vfchedb [[REG2:%v[0-9]+]], %v24, %v26 ++; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] ++; CHECK-NEXT: vsel %v24, %v28, %v30, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp ord <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test uno selects. ++define <2 x double> @f28(<2 x double> %val1, <2 x double> %val2, ++ <2 x double> %val3, <2 x double> %val4) { ++; CHECK-LABEL: f28: ++; CHECK-DAG: vfchdb [[REG1:%v[0-9]+]], %v26, %v24 ++; CHECK-DAG: vfchedb [[REG2:%v[0-9]+]], %v24, %v26 ++; CHECK: vo [[REG:%v[0-9]+]], [[REG1]], [[REG2]] ++; CHECK-NEXT: vsel %v24, %v30, %v28, [[REG]] ++; CHECK-NEXT: br %r14 ++ %cmp = fcmp uno <2 x double> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4 ++ ret <2 x double> %ret ++} ++ ++; Test an f64 comparison that uses vector registers. ++define i64 @f29(i64 %a, i64 %b, double %f1, <2 x double> %vec) { ++; CHECK-LABEL: f29: ++; CHECK: wfcdb %f0, %v24 ++; CHECK-NEXT: locgrne %r2, %r3 ++; CHECK: br %r14 ++ %f2 = extractelement <2 x double> %vec, i32 0 ++ %cond = fcmp oeq double %f1, %f2 ++ %res = select i1 %cond, i64 %a, i64 %b ++ ret i64 %res ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-combine-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-combine-01.ll +@@ -0,0 +1,155 @@ ++; Test various target-specific DAG combiner patterns. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Check that an extraction followed by a truncation is effectively treated ++; as a bitcast. 
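The point of this combine: on big-endian SystemZ, the low byte of i32 element k is byte 4*k+3 of the same register viewed as <16 x i8>, so the trunc+store pairs in f1 below can become single VSTEBs from lanes 3 and 15. A hedged sketch of that lane arithmetic, not part of the patch (function name mine):

    ; Extracting element 0 and truncating equals a byte extract at lane 3.
    define i8 @low_byte_of_elem0(<4 x i32> %v) {
      %bytes = bitcast <4 x i32> %v to <16 x i8>
      %b = extractelement <16 x i8> %bytes, i32 3
      ret i8 %b
    }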
++define void @f1(<4 x i32> %v1, <4 x i32> %v2, i8 *%ptr1, i8 *%ptr2) {
++; CHECK-LABEL: f1:
++; CHECK: vaf [[REG:%v[0-9]+]], %v24, %v26
++; CHECK-DAG: vsteb [[REG]], 0(%r2), 3
++; CHECK-DAG: vsteb [[REG]], 0(%r3), 15
++; CHECK: br %r14
++  %add = add <4 x i32> %v1, %v2
++  %elem1 = extractelement <4 x i32> %add, i32 0
++  %elem2 = extractelement <4 x i32> %add, i32 3
++  %trunc1 = trunc i32 %elem1 to i8
++  %trunc2 = trunc i32 %elem2 to i8
++  store i8 %trunc1, i8 *%ptr1
++  store i8 %trunc2, i8 *%ptr2
++  ret void
++}
++
++; Test a case where a pack-type shuffle can be eliminated.
++define i16 @f2(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
++; CHECK-LABEL: f2:
++; CHECK-NOT: vpk
++; CHECK-DAG: vaf [[REG1:%v[0-9]+]], %v24, %v26
++; CHECK-DAG: vaf [[REG2:%v[0-9]+]], %v26, %v28
++; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG1]], 3
++; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG2]], 7
++; CHECK: br %r14
++  %add1 = add <4 x i32> %v1, %v2
++  %add2 = add <4 x i32> %v2, %v3
++  %shuffle = shufflevector <4 x i32> %add1, <4 x i32> %add2,
++                           <4 x i32> <i32 1, i32 3, i32 5, i32 7>
++  %bitcast = bitcast <4 x i32> %shuffle to <8 x i16>
++  %elem1 = extractelement <8 x i16> %bitcast, i32 1
++  %elem2 = extractelement <8 x i16> %bitcast, i32 7
++  %res = add i16 %elem1, %elem2
++  ret i16 %res
++}
++
++; ...and again in a case where there's also a splat and a bitcast.
++define i16 @f3(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) {
++; CHECK-LABEL: f3:
++; CHECK-NOT: vrepg
++; CHECK-NOT: vpk
++; CHECK-DAG: vaf [[REG:%v[0-9]+]], %v24, %v26
++; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG]], 6
++; CHECK-DAG: vlgvh {{%r[0-5]}}, %v28, 3
++; CHECK: br %r14
++  %add = add <4 x i32> %v1, %v2
++  %splat = shufflevector <2 x i64> %v3, <2 x i64> undef,
++                         <2 x i32> zeroinitializer
++  %splatcast = bitcast <2 x i64> %splat to <4 x i32>
++  %shuffle = shufflevector <4 x i32> %add, <4 x i32> %splatcast,
++                           <4 x i32> <i32 1, i32 3, i32 5, i32 7>
++  %bitcast = bitcast <4 x i32> %shuffle to <8 x i16>
++  %elem1 = extractelement <8 x i16> %bitcast, i32 2
++  %elem2 = extractelement <8 x i16> %bitcast, i32 7
++  %res = add i16 %elem1, %elem2
++  ret i16 %res
++}
++
++; ...and again with a merge low instead of a pack.
++define i16 @f4(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) {
++; CHECK-LABEL: f4:
++; CHECK-NOT: vrepg
++; CHECK-NOT: vmr
++; CHECK-DAG: vaf [[REG:%v[0-9]+]], %v24, %v26
++; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG]], 6
++; CHECK-DAG: vlgvh {{%r[0-5]}}, %v28, 3
++; CHECK: br %r14
++  %add = add <4 x i32> %v1, %v2
++  %splat = shufflevector <2 x i64> %v3, <2 x i64> undef,
++                         <2 x i32> zeroinitializer
++  %splatcast = bitcast <2 x i64> %splat to <4 x i32>
++  %shuffle = shufflevector <4 x i32> %add, <4 x i32> %splatcast,
++                           <4 x i32> <i32 2, i32 6, i32 3, i32 7>
++  %bitcast = bitcast <4 x i32> %shuffle to <8 x i16>
++  %elem1 = extractelement <8 x i16> %bitcast, i32 4
++  %elem2 = extractelement <8 x i16> %bitcast, i32 7
++  %res = add i16 %elem1, %elem2
++  ret i16 %res
++}
++
++; ...and again with a merge high.
++define i16 @f5(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) {
++; CHECK-LABEL: f5:
++; CHECK-NOT: vrepg
++; CHECK-NOT: vmr
++; CHECK-DAG: vaf [[REG:%v[0-9]+]], %v24, %v26
++; CHECK-DAG: vlgvh {{%r[0-5]}}, [[REG]], 2
++; CHECK-DAG: vlgvh {{%r[0-5]}}, %v28, 3
++; CHECK: br %r14
++  %add = add <4 x i32> %v1, %v2
++  %splat = shufflevector <2 x i64> %v3, <2 x i64> undef,
++                         <2 x i32> zeroinitializer
++  %splatcast = bitcast <2 x i64> %splat to <4 x i32>
++  %shuffle = shufflevector <4 x i32> %add, <4 x i32> %splatcast,
++                           <4 x i32> <i32 0, i32 4, i32 1, i32 5>
++  %bitcast = bitcast <4 x i32> %shuffle to <8 x i16>
++  %elem1 = extractelement <8 x i16> %bitcast, i32 4
++  %elem2 = extractelement <8 x i16> %bitcast, i32 7
++  %res = add i16 %elem1, %elem2
++  ret i16 %res
++}
++
++; Test a case where an unpack high can be eliminated from the usual
++; load-extend sequence.
++define void @f6(<8 x i8> *%ptr1, i8 *%ptr2, i8 *%ptr3, i8 *%ptr4) {
++; CHECK-LABEL: f6:
++; CHECK: vlrepg [[REG:%v[0-9]+]], 0(%r2)
++; CHECK-NOT: vup
++; CHECK-DAG: vsteb [[REG]], 0(%r3), 1
++; CHECK-DAG: vsteb [[REG]], 0(%r4), 2
++; CHECK-DAG: vsteb [[REG]], 0(%r5), 7
++; CHECK: br %r14
++  %vec = load <8 x i8> *%ptr1
++  %ext = sext <8 x i8> %vec to <8 x i16>
++  %elem1 = extractelement <8 x i16> %ext, i32 1
++  %elem2 = extractelement <8 x i16> %ext, i32 2
++  %elem3 = extractelement <8 x i16> %ext, i32 7
++  %trunc1 = trunc i16 %elem1 to i8
++  %trunc2 = trunc i16 %elem2 to i8
++  %trunc3 = trunc i16 %elem3 to i8
++  store i8 %trunc1, i8 *%ptr2
++  store i8 %trunc2, i8 *%ptr3
++  store i8 %trunc3, i8 *%ptr4
++  ret void
++}
++
++; ...and again with a bitcast in between.
++define void @f7(<4 x i8> *%ptr1, i8 *%ptr2, i8 *%ptr3, i8 *%ptr4) {
++; CHECK-LABEL: f7:
++; CHECK: vlrepf [[REG:%v[0-9]+]], 0(%r2)
++; CHECK-NOT: vup
++; CHECK-DAG: vsteb [[REG]], 0(%r3), 0
++; CHECK-DAG: vsteb [[REG]], 0(%r4), 1
++; CHECK-DAG: vsteb [[REG]], 0(%r5), 3
++; CHECK: br %r14
++  %vec = load <4 x i8> *%ptr1
++  %ext = sext <4 x i8> %vec to <4 x i32>
++  %bitcast = bitcast <4 x i32> %ext to <8 x i16>
++  %elem1 = extractelement <8 x i16> %bitcast, i32 1
++  %elem2 = extractelement <8 x i16> %bitcast, i32 3
++  %elem3 = extractelement <8 x i16> %bitcast, i32 7
++  %trunc1 = trunc i16 %elem1 to i8
++  %trunc2 = trunc i16 %elem2 to i8
++  %trunc3 = trunc i16 %elem3 to i8
++  store i8 %trunc1, i8 *%ptr2
++  store i8 %trunc2, i8 *%ptr3
++  store i8 %trunc3, i8 *%ptr4
++  ret void
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-combine-02.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-combine-02.ll
+@@ -0,0 +1,433 @@
++; Test various representations of pack-like operations.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; One way of writing a <4 x i32> -> <8 x i16> pack.
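The f1 and f2 forms below spell the pack out through scalar extracts and shuffles; at the vector level the same operation is simply a truncate of each element followed by concatenation. A hedged sketch, not part of the patch (function name mine):

    ; A <4 x i32> -> <8 x i16> pack via vector trunc: keep the low
    ; halfword of every word, then concatenate the two halves.
    define <8 x i16> @pack_via_trunc(<4 x i32> %a, <4 x i32> %b) {
      %ta = trunc <4 x i32> %a to <4 x i16>
      %tb = trunc <4 x i32> %b to <4 x i16>
      %r = shufflevector <4 x i16> %ta, <4 x i16> %tb,
                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                    i32 4, i32 5, i32 6, i32 7>
      ret <8 x i16> %r
    }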
++define <8 x i16> @f1(<4 x i32> %val0, <4 x i32> %val1) { ++; CHECK-LABEL: f1: ++; CHECK: vpkf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %elem0 = extractelement <4 x i32> %val0, i32 0 ++ %elem1 = extractelement <4 x i32> %val0, i32 1 ++ %elem2 = extractelement <4 x i32> %val0, i32 2 ++ %elem3 = extractelement <4 x i32> %val0, i32 3 ++ %elem4 = extractelement <4 x i32> %val1, i32 0 ++ %elem5 = extractelement <4 x i32> %val1, i32 1 ++ %elem6 = extractelement <4 x i32> %val1, i32 2 ++ %elem7 = extractelement <4 x i32> %val1, i32 3 ++ %hboth0 = bitcast i32 %elem0 to <2 x i16> ++ %hboth1 = bitcast i32 %elem1 to <2 x i16> ++ %hboth2 = bitcast i32 %elem2 to <2 x i16> ++ %hboth3 = bitcast i32 %elem3 to <2 x i16> ++ %hboth4 = bitcast i32 %elem4 to <2 x i16> ++ %hboth5 = bitcast i32 %elem5 to <2 x i16> ++ %hboth6 = bitcast i32 %elem6 to <2 x i16> ++ %hboth7 = bitcast i32 %elem7 to <2 x i16> ++ %hlow0 = shufflevector <2 x i16> %hboth0, <2 x i16> %hboth1, ++ <2 x i32> ++ %hlow1 = shufflevector <2 x i16> %hboth2, <2 x i16> %hboth3, ++ <2 x i32> ++ %hlow2 = shufflevector <2 x i16> %hboth4, <2 x i16> %hboth5, ++ <2 x i32> ++ %hlow3 = shufflevector <2 x i16> %hboth6, <2 x i16> %hboth7, ++ <2 x i32> ++ %join0 = shufflevector <2 x i16> %hlow0, <2 x i16> %hlow1, ++ <4 x i32> ++ %join1 = shufflevector <2 x i16> %hlow2, <2 x i16> %hlow3, ++ <4 x i32> ++ %ret = shufflevector <4 x i16> %join0, <4 x i16> %join1, ++ <8 x i32> ++ ret <8 x i16> %ret ++} ++ ++; A different way of writing a <4 x i32> -> <8 x i16> pack. ++define <8 x i16> @f2(<4 x i32> %val0, <4 x i32> %val1) { ++; CHECK-LABEL: f2: ++; CHECK: vpkf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %elem0 = extractelement <4 x i32> %val0, i32 0 ++ %elem1 = extractelement <4 x i32> %val0, i32 1 ++ %elem2 = extractelement <4 x i32> %val0, i32 2 ++ %elem3 = extractelement <4 x i32> %val0, i32 3 ++ %elem4 = extractelement <4 x i32> %val1, i32 0 ++ %elem5 = extractelement <4 x i32> %val1, i32 1 ++ %elem6 = extractelement <4 x i32> %val1, i32 2 ++ %elem7 = extractelement <4 x i32> %val1, i32 3 ++ %wvec0 = insertelement <4 x i32> undef, i32 %elem0, i32 0 ++ %wvec1 = insertelement <4 x i32> undef, i32 %elem1, i32 0 ++ %wvec2 = insertelement <4 x i32> undef, i32 %elem2, i32 0 ++ %wvec3 = insertelement <4 x i32> undef, i32 %elem3, i32 0 ++ %wvec4 = insertelement <4 x i32> undef, i32 %elem4, i32 0 ++ %wvec5 = insertelement <4 x i32> undef, i32 %elem5, i32 0 ++ %wvec6 = insertelement <4 x i32> undef, i32 %elem6, i32 0 ++ %wvec7 = insertelement <4 x i32> undef, i32 %elem7, i32 0 ++ %hvec0 = bitcast <4 x i32> %wvec0 to <8 x i16> ++ %hvec1 = bitcast <4 x i32> %wvec1 to <8 x i16> ++ %hvec2 = bitcast <4 x i32> %wvec2 to <8 x i16> ++ %hvec3 = bitcast <4 x i32> %wvec3 to <8 x i16> ++ %hvec4 = bitcast <4 x i32> %wvec4 to <8 x i16> ++ %hvec5 = bitcast <4 x i32> %wvec5 to <8 x i16> ++ %hvec6 = bitcast <4 x i32> %wvec6 to <8 x i16> ++ %hvec7 = bitcast <4 x i32> %wvec7 to <8 x i16> ++ %hlow0 = shufflevector <8 x i16> %hvec0, <8 x i16> %hvec1, ++ <8 x i32> ++ %hlow1 = shufflevector <8 x i16> %hvec2, <8 x i16> %hvec3, ++ <8 x i32> ++ %hlow2 = shufflevector <8 x i16> %hvec4, <8 x i16> %hvec5, ++ <8 x i32> ++ %hlow3 = shufflevector <8 x i16> %hvec6, <8 x i16> %hvec7, ++ <8 x i32> ++ %join0 = shufflevector <8 x i16> %hlow0, <8 x i16> %hlow1, ++ <8 x i32> ++ %join1 = shufflevector <8 x i16> %hlow2, <8 x i16> %hlow3, ++ <8 x i32> ++ %ret = shufflevector <8 x i16> %join0, <8 x i16> %join1, ++ <8 x i32> ++ ret <8 x i16> %ret ++} ++ ++; A direct pack operation. 
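f3 below is the canonical form; its shuffle mask (lost in this copy of the patch) selects the odd-numbered, i.e. low big-endian, halfwords of both operands. A hedged reconstruction inferred from the VPKF expectation, not part of the patch (function name mine):

    ; Direct pack: bitcast both inputs and keep the low halfwords.
    define <8 x i16> @direct_pack(<4 x i32> %val0, <4 x i32> %val1) {
      %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16>
      %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16>
      %ret = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1,
                           <8 x i32> <i32 1, i32 3, i32 5, i32 7,
                                      i32 9, i32 11, i32 13, i32 15>
      ret <8 x i16> %ret
    }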
++define <8 x i16> @f3(<4 x i32> %val0, <4 x i32> %val1) { ++; CHECK-LABEL: f3: ++; CHECK: vpkf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16> ++ %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16> ++ %ret = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1, ++ <8 x i32> ++ ret <8 x i16> %ret ++} ++ ++; One way of writing a <4 x i32> -> <16 x i8> pack. It doesn't matter ++; whether the first pack is VPKF or VPKH since the even bytes of the ++; result are discarded. ++define <16 x i8> @f4(<4 x i32> %val0, <4 x i32> %val1, ++ <4 x i32> %val2, <4 x i32> %val3) { ++; CHECK-LABEL: f4: ++; CHECK-DAG: vpk{{[hf]}} [[REG1:%v[0-9]+]], %v24, %v26 ++; CHECK-DAG: vpk{{[hf]}} [[REG2:%v[0-9]+]], %v28, %v30 ++; CHECK: vpkh %v24, [[REG1]], [[REG2]] ++; CHECK: br %r14 ++ %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16> ++ %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16> ++ %bitcast2 = bitcast <4 x i32> %val2 to <8 x i16> ++ %bitcast3 = bitcast <4 x i32> %val3 to <8 x i16> ++ %join0 = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1, ++ <8 x i32> ++ %join1 = shufflevector <8 x i16> %bitcast2, <8 x i16> %bitcast3, ++ <8 x i32> ++ %bitcast4 = bitcast <8 x i16> %join0 to <16 x i8> ++ %bitcast5 = bitcast <8 x i16> %join1 to <16 x i8> ++ %ret = shufflevector <16 x i8> %bitcast4, <16 x i8> %bitcast5, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Check the same operation, but with elements being extracted from the result. ++define void @f5(<4 x i32> %val0, <4 x i32> %val1, ++ <4 x i32> %val2, <4 x i32> %val3, ++ i8 *%base) { ++; CHECK-LABEL: f5: ++; CHECK-DAG: vsteb %v24, 0(%r2), 11 ++; CHECK-DAG: vsteb %v26, 1(%r2), 15 ++; CHECK-DAG: vsteb %v28, 2(%r2), 3 ++; CHECK-DAG: vsteb %v30, 3(%r2), 7 ++; CHECK: br %r14 ++ %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16> ++ %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16> ++ %bitcast2 = bitcast <4 x i32> %val2 to <8 x i16> ++ %bitcast3 = bitcast <4 x i32> %val3 to <8 x i16> ++ %join0 = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1, ++ <8 x i32> ++ %join1 = shufflevector <8 x i16> %bitcast2, <8 x i16> %bitcast3, ++ <8 x i32> ++ %bitcast4 = bitcast <8 x i16> %join0 to <16 x i8> ++ %bitcast5 = bitcast <8 x i16> %join1 to <16 x i8> ++ %vec = shufflevector <16 x i8> %bitcast4, <16 x i8> %bitcast5, ++ <16 x i32> ++ ++ %ptr0 = getelementptr i8 *%base, i64 0 ++ %ptr1 = getelementptr i8 *%base, i64 1 ++ %ptr2 = getelementptr i8 *%base, i64 2 ++ %ptr3 = getelementptr i8 *%base, i64 3 ++ ++ %byte0 = extractelement <16 x i8> %vec, i32 2 ++ %byte1 = extractelement <16 x i8> %vec, i32 7 ++ %byte2 = extractelement <16 x i8> %vec, i32 8 ++ %byte3 = extractelement <16 x i8> %vec, i32 13 ++ ++ store i8 %byte0, i8 *%ptr0 ++ store i8 %byte1, i8 *%ptr1 ++ store i8 %byte2, i8 *%ptr2 ++ store i8 %byte3, i8 *%ptr3 ++ ++ ret void ++} ++ ++; A different way of writing a <4 x i32> -> <16 x i8> pack. 
++define <16 x i8> @f6(<4 x i32> %val0, <4 x i32> %val1, ++ <4 x i32> %val2, <4 x i32> %val3) { ++; CHECK-LABEL: f6: ++; CHECK-DAG: vpk{{[hf]}} [[REG1:%v[0-9]+]], %v24, %v26 ++; CHECK-DAG: vpk{{[hf]}} [[REG2:%v[0-9]+]], %v28, %v30 ++; CHECK: vpkh %v24, [[REG1]], [[REG2]] ++; CHECK: br %r14 ++ %elem0 = extractelement <4 x i32> %val0, i32 0 ++ %elem1 = extractelement <4 x i32> %val0, i32 1 ++ %elem2 = extractelement <4 x i32> %val0, i32 2 ++ %elem3 = extractelement <4 x i32> %val0, i32 3 ++ %elem4 = extractelement <4 x i32> %val1, i32 0 ++ %elem5 = extractelement <4 x i32> %val1, i32 1 ++ %elem6 = extractelement <4 x i32> %val1, i32 2 ++ %elem7 = extractelement <4 x i32> %val1, i32 3 ++ %elem8 = extractelement <4 x i32> %val2, i32 0 ++ %elem9 = extractelement <4 x i32> %val2, i32 1 ++ %elem10 = extractelement <4 x i32> %val2, i32 2 ++ %elem11 = extractelement <4 x i32> %val2, i32 3 ++ %elem12 = extractelement <4 x i32> %val3, i32 0 ++ %elem13 = extractelement <4 x i32> %val3, i32 1 ++ %elem14 = extractelement <4 x i32> %val3, i32 2 ++ %elem15 = extractelement <4 x i32> %val3, i32 3 ++ %bitcast0 = bitcast i32 %elem0 to <2 x i16> ++ %bitcast1 = bitcast i32 %elem1 to <2 x i16> ++ %bitcast2 = bitcast i32 %elem2 to <2 x i16> ++ %bitcast3 = bitcast i32 %elem3 to <2 x i16> ++ %bitcast4 = bitcast i32 %elem4 to <2 x i16> ++ %bitcast5 = bitcast i32 %elem5 to <2 x i16> ++ %bitcast6 = bitcast i32 %elem6 to <2 x i16> ++ %bitcast7 = bitcast i32 %elem7 to <2 x i16> ++ %bitcast8 = bitcast i32 %elem8 to <2 x i16> ++ %bitcast9 = bitcast i32 %elem9 to <2 x i16> ++ %bitcast10 = bitcast i32 %elem10 to <2 x i16> ++ %bitcast11 = bitcast i32 %elem11 to <2 x i16> ++ %bitcast12 = bitcast i32 %elem12 to <2 x i16> ++ %bitcast13 = bitcast i32 %elem13 to <2 x i16> ++ %bitcast14 = bitcast i32 %elem14 to <2 x i16> ++ %bitcast15 = bitcast i32 %elem15 to <2 x i16> ++ %low0 = shufflevector <2 x i16> %bitcast0, <2 x i16> %bitcast1, ++ <2 x i32> ++ %low1 = shufflevector <2 x i16> %bitcast2, <2 x i16> %bitcast3, ++ <2 x i32> ++ %low2 = shufflevector <2 x i16> %bitcast4, <2 x i16> %bitcast5, ++ <2 x i32> ++ %low3 = shufflevector <2 x i16> %bitcast6, <2 x i16> %bitcast7, ++ <2 x i32> ++ %low4 = shufflevector <2 x i16> %bitcast8, <2 x i16> %bitcast9, ++ <2 x i32> ++ %low5 = shufflevector <2 x i16> %bitcast10, <2 x i16> %bitcast11, ++ <2 x i32> ++ %low6 = shufflevector <2 x i16> %bitcast12, <2 x i16> %bitcast13, ++ <2 x i32> ++ %low7 = shufflevector <2 x i16> %bitcast14, <2 x i16> %bitcast15, ++ <2 x i32> ++ %bytes0 = bitcast <2 x i16> %low0 to <4 x i8> ++ %bytes1 = bitcast <2 x i16> %low1 to <4 x i8> ++ %bytes2 = bitcast <2 x i16> %low2 to <4 x i8> ++ %bytes3 = bitcast <2 x i16> %low3 to <4 x i8> ++ %bytes4 = bitcast <2 x i16> %low4 to <4 x i8> ++ %bytes5 = bitcast <2 x i16> %low5 to <4 x i8> ++ %bytes6 = bitcast <2 x i16> %low6 to <4 x i8> ++ %bytes7 = bitcast <2 x i16> %low7 to <4 x i8> ++ %blow0 = shufflevector <4 x i8> %bytes0, <4 x i8> %bytes1, ++ <4 x i32> ++ %blow1 = shufflevector <4 x i8> %bytes2, <4 x i8> %bytes3, ++ <4 x i32> ++ %blow2 = shufflevector <4 x i8> %bytes4, <4 x i8> %bytes5, ++ <4 x i32> ++ %blow3 = shufflevector <4 x i8> %bytes6, <4 x i8> %bytes7, ++ <4 x i32> ++ %join0 = shufflevector <4 x i8> %blow0, <4 x i8> %blow1, ++ <8 x i32> ++ %join1 = shufflevector <4 x i8> %blow2, <4 x i8> %blow3, ++ <8 x i32> ++ %ret = shufflevector <8 x i8> %join0, <8 x i8> %join1, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; One way of writing a <2 x i64> -> <16 x i8> pack. 
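Each VPK step halves the element width, so reducing eight <2 x i64> inputs to one <16 x i8> takes three rounds, matching the REG1 through REG6 tree in f7's CHECK lines below. One round written at the vector level, as a hedged sketch that is not part of the patch (function name mine):

    ; One pack round: <2 x i64> pairs pack to <4 x i32> (VPKG keeps the
    ; low word of each doubleword); two more rounds reach <16 x i8>.
    define <4 x i32> @pack_round(<2 x i64> %a, <2 x i64> %b) {
      %ba = bitcast <2 x i64> %a to <4 x i32>
      %bb = bitcast <2 x i64> %b to <4 x i32>
      %r = shufflevector <4 x i32> %ba, <4 x i32> %bb,
                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
      ret <4 x i32> %r
    }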
++define <16 x i8> @f7(<2 x i64> %val0, <2 x i64> %val1, ++ <2 x i64> %val2, <2 x i64> %val3, ++ <2 x i64> %val4, <2 x i64> %val5, ++ <2 x i64> %val6, <2 x i64> %val7) { ++; CHECK-LABEL: f7: ++; CHECK-DAG: vpk{{[hfg]}} [[REG1:%v[0-9]+]], %v24, %v26 ++; CHECK-DAG: vpk{{[hfg]}} [[REG2:%v[0-9]+]], %v28, %v30 ++; CHECK-DAG: vpk{{[hfg]}} [[REG3:%v[0-9]+]], %v25, %v27 ++; CHECK-DAG: vpk{{[hfg]}} [[REG4:%v[0-9]+]], %v29, %v31 ++; CHECK-DAG: vpk{{[hf]}} [[REG5:%v[0-9]+]], [[REG1]], [[REG2]] ++; CHECK-DAG: vpk{{[hf]}} [[REG6:%v[0-9]+]], [[REG3]], [[REG4]] ++; CHECK: vpkh %v24, [[REG5]], [[REG6]] ++; CHECK: br %r14 ++ %elem0 = extractelement <2 x i64> %val0, i32 0 ++ %elem1 = extractelement <2 x i64> %val0, i32 1 ++ %elem2 = extractelement <2 x i64> %val1, i32 0 ++ %elem3 = extractelement <2 x i64> %val1, i32 1 ++ %elem4 = extractelement <2 x i64> %val2, i32 0 ++ %elem5 = extractelement <2 x i64> %val2, i32 1 ++ %elem6 = extractelement <2 x i64> %val3, i32 0 ++ %elem7 = extractelement <2 x i64> %val3, i32 1 ++ %elem8 = extractelement <2 x i64> %val4, i32 0 ++ %elem9 = extractelement <2 x i64> %val4, i32 1 ++ %elem10 = extractelement <2 x i64> %val5, i32 0 ++ %elem11 = extractelement <2 x i64> %val5, i32 1 ++ %elem12 = extractelement <2 x i64> %val6, i32 0 ++ %elem13 = extractelement <2 x i64> %val6, i32 1 ++ %elem14 = extractelement <2 x i64> %val7, i32 0 ++ %elem15 = extractelement <2 x i64> %val7, i32 1 ++ %bitcast0 = bitcast i64 %elem0 to <2 x i32> ++ %bitcast1 = bitcast i64 %elem1 to <2 x i32> ++ %bitcast2 = bitcast i64 %elem2 to <2 x i32> ++ %bitcast3 = bitcast i64 %elem3 to <2 x i32> ++ %bitcast4 = bitcast i64 %elem4 to <2 x i32> ++ %bitcast5 = bitcast i64 %elem5 to <2 x i32> ++ %bitcast6 = bitcast i64 %elem6 to <2 x i32> ++ %bitcast7 = bitcast i64 %elem7 to <2 x i32> ++ %bitcast8 = bitcast i64 %elem8 to <2 x i32> ++ %bitcast9 = bitcast i64 %elem9 to <2 x i32> ++ %bitcast10 = bitcast i64 %elem10 to <2 x i32> ++ %bitcast11 = bitcast i64 %elem11 to <2 x i32> ++ %bitcast12 = bitcast i64 %elem12 to <2 x i32> ++ %bitcast13 = bitcast i64 %elem13 to <2 x i32> ++ %bitcast14 = bitcast i64 %elem14 to <2 x i32> ++ %bitcast15 = bitcast i64 %elem15 to <2 x i32> ++ %low0 = shufflevector <2 x i32> %bitcast0, <2 x i32> %bitcast1, ++ <2 x i32> ++ %low1 = shufflevector <2 x i32> %bitcast2, <2 x i32> %bitcast3, ++ <2 x i32> ++ %low2 = shufflevector <2 x i32> %bitcast4, <2 x i32> %bitcast5, ++ <2 x i32> ++ %low3 = shufflevector <2 x i32> %bitcast6, <2 x i32> %bitcast7, ++ <2 x i32> ++ %low4 = shufflevector <2 x i32> %bitcast8, <2 x i32> %bitcast9, ++ <2 x i32> ++ %low5 = shufflevector <2 x i32> %bitcast10, <2 x i32> %bitcast11, ++ <2 x i32> ++ %low6 = shufflevector <2 x i32> %bitcast12, <2 x i32> %bitcast13, ++ <2 x i32> ++ %low7 = shufflevector <2 x i32> %bitcast14, <2 x i32> %bitcast15, ++ <2 x i32> ++ %half0 = bitcast <2 x i32> %low0 to <4 x i16> ++ %half1 = bitcast <2 x i32> %low1 to <4 x i16> ++ %half2 = bitcast <2 x i32> %low2 to <4 x i16> ++ %half3 = bitcast <2 x i32> %low3 to <4 x i16> ++ %half4 = bitcast <2 x i32> %low4 to <4 x i16> ++ %half5 = bitcast <2 x i32> %low5 to <4 x i16> ++ %half6 = bitcast <2 x i32> %low6 to <4 x i16> ++ %half7 = bitcast <2 x i32> %low7 to <4 x i16> ++ %hlow0 = shufflevector <4 x i16> %half0, <4 x i16> %half1, ++ <4 x i32> ++ %hlow1 = shufflevector <4 x i16> %half2, <4 x i16> %half3, ++ <4 x i32> ++ %hlow2 = shufflevector <4 x i16> %half4, <4 x i16> %half5, ++ <4 x i32> ++ %hlow3 = shufflevector <4 x i16> %half6, <4 x i16> %half7, ++ <4 x i32> ++ %bytes0 = bitcast <4 x i16> 
%hlow0 to <8 x i8> ++ %bytes1 = bitcast <4 x i16> %hlow1 to <8 x i8> ++ %bytes2 = bitcast <4 x i16> %hlow2 to <8 x i8> ++ %bytes3 = bitcast <4 x i16> %hlow3 to <8 x i8> ++ %join0 = shufflevector <8 x i8> %bytes0, <8 x i8> %bytes1, ++ <8 x i32> ++ %join1 = shufflevector <8 x i8> %bytes2, <8 x i8> %bytes3, ++ <8 x i32> ++ %ret = shufflevector <8 x i8> %join0, <8 x i8> %join1, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a <2 x i64> -> <4 x f32> pack in which only individual elements are ++; needed. ++define float @f8(i64 %scalar0, i64 %scalar1, i64 %scalar2, i64 %scalar3) { ++; CHECK-LABEL: f8: ++; CHECK-NOT: vperm ++; CHECK-NOT: vpk ++; CHECK-NOT: vmrh ++; CHECK: aebr {{%f[0-7]}}, ++; CHECK: aebr {{%f[0-7]}}, ++; CHECK: meebr %f0, ++; CHECK: br %r14 ++ %vec0 = insertelement <2 x i64> undef, i64 %scalar0, i32 0 ++ %vec1 = insertelement <2 x i64> undef, i64 %scalar1, i32 0 ++ %vec2 = insertelement <2 x i64> undef, i64 %scalar2, i32 0 ++ %vec3 = insertelement <2 x i64> undef, i64 %scalar3, i32 0 ++ %join0 = shufflevector <2 x i64> %vec0, <2 x i64> %vec1, ++ <2 x i32> ++ %join1 = shufflevector <2 x i64> %vec2, <2 x i64> %vec3, ++ <2 x i32> ++ %bitcast0 = bitcast <2 x i64> %join0 to <4 x float> ++ %bitcast1 = bitcast <2 x i64> %join1 to <4 x float> ++ %pack = shufflevector <4 x float> %bitcast0, <4 x float> %bitcast1, ++ <4 x i32> ++ %elt0 = extractelement <4 x float> %pack, i32 0 ++ %elt1 = extractelement <4 x float> %pack, i32 1 ++ %elt2 = extractelement <4 x float> %pack, i32 2 ++ %elt3 = extractelement <4 x float> %pack, i32 3 ++ %add0 = fadd float %elt0, %elt2 ++ %add1 = fadd float %elt1, %elt3 ++ %ret = fmul float %add0, %add1 ++ ret float %ret ++} ++ ++; Test a <2 x f64> -> <4 x i32> pack in which only individual elements are ++; needed. ++define i32 @f9(double %scalar0, double %scalar1, double %scalar2, ++ double %scalar3) { ++; CHECK-LABEL: f9: ++; CHECK-NOT: vperm ++; CHECK-NOT: vpk ++; CHECK-NOT: vmrh ++; CHECK: ar {{%r[0-5]}}, ++; CHECK: ar {{%r[0-5]}}, ++; CHECK: or %r2, ++; CHECK: br %r14 ++ %vec0 = insertelement <2 x double> undef, double %scalar0, i32 0 ++ %vec1 = insertelement <2 x double> undef, double %scalar1, i32 0 ++ %vec2 = insertelement <2 x double> undef, double %scalar2, i32 0 ++ %vec3 = insertelement <2 x double> undef, double %scalar3, i32 0 ++ %join0 = shufflevector <2 x double> %vec0, <2 x double> %vec1, ++ <2 x i32> ++ %join1 = shufflevector <2 x double> %vec2, <2 x double> %vec3, ++ <2 x i32> ++ %bitcast0 = bitcast <2 x double> %join0 to <4 x i32> ++ %bitcast1 = bitcast <2 x double> %join1 to <4 x i32> ++ %pack = shufflevector <4 x i32> %bitcast0, <4 x i32> %bitcast1, ++ <4 x i32> ++ %elt0 = extractelement <4 x i32> %pack, i32 0 ++ %elt1 = extractelement <4 x i32> %pack, i32 1 ++ %elt2 = extractelement <4 x i32> %pack, i32 2 ++ %elt3 = extractelement <4 x i32> %pack, i32 3 ++ %add0 = add i32 %elt0, %elt2 ++ %add1 = add i32 %elt1, %elt3 ++ %ret = or i32 %add0, %add1 ++ ret i32 %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-01.ll +@@ -0,0 +1,103 @@ ++; Test vector byte masks, v16i8 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test an all-zeros vector. ++define <16 x i8> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <16 x i8> zeroinitializer ++} ++ ++; Test an all-ones vector. 
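The vector literals in this file's returns did not survive this copy of the patch, but the fully specified ones can be recovered from the comments: VGBM sets byte i of the result to 0xff exactly when bit 15-i of its immediate is set. A hedged sketch of what f3's return value (mask 0x8c75, vgbm 35957) presumably looks like, not part of the patch (function name mine):

    define <16 x i8> @mask_8c75() {
      ret <16 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0,
                     i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 0, i8 -1>
    }

The undef-containing variant (f4) cannot be recovered the same way, since the comment does not say which zero bytes were written as undef.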
++define <16 x i8> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgbm %v24, 65535 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a mixed vector (mask 0x8c75). ++define <16 x i8> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgbm %v24, 35957 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test that undefs are treated as zero. ++define <16 x i8> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgbm %v24, 35957 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. ++define <16 x i8> @f5() { ++; CHECK-LABEL: f5: ++; CHECK-NOT: vgbm ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test an all-zeros v2i8 that gets promoted to v16i8. ++define <2 x i8> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <2 x i8> zeroinitializer ++} ++ ++; Test a mixed v2i8 that gets promoted to v16i8 (mask 0x8000). ++define <2 x i8> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vgbm %v24, 32768 ++; CHECK: br %r14 ++ ret <2 x i8> ++} ++ ++; Test an all-zeros v4i8 that gets promoted to v16i8. ++define <4 x i8> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <4 x i8> zeroinitializer ++} ++ ++; Test a mixed v4i8 that gets promoted to v16i8 (mask 0x9000). ++define <4 x i8> @f9() { ++; CHECK-LABEL: f9: ++; CHECK: vgbm %v24, 36864 ++; CHECK: br %r14 ++ ret <4 x i8> ++} ++ ++; Test an all-zeros v8i8 that gets promoted to v16i8. ++define <8 x i8> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <8 x i8> zeroinitializer ++} ++ ++; Test a mixed v8i8 that gets promoted to v16i8 (mask 0xE500). ++define <8 x i8> @f11() { ++; CHECK-LABEL: f11: ++; CHECK: vgbm %v24, 58624 ++; CHECK: br %r14 ++ ret <8 x i8> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-02.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-02.ll +@@ -0,0 +1,79 @@ ++; Test vector byte masks, v8i16 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test an all-zeros vector. ++define <8 x i16> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <8 x i16> zeroinitializer ++} ++ ++; Test an all-ones vector. ++define <8 x i16> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgbm %v24, 65535 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a mixed vector (mask 0x8c76). ++define <8 x i16> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgbm %v24, 35958 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test that undefs are treated as zero. ++define <8 x i16> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgbm %v24, 35958 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. ++define <8 x i16> @f5() { ++; CHECK-LABEL: f5: ++; CHECK-NOT: vgbm ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test an all-zeros v2i16 that gets promoted to v8i16. ++define <2 x i16> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <2 x i16> zeroinitializer ++} ++ ++; Test a mixed v2i16 that gets promoted to v8i16 (mask 0xc000). ++define <2 x i16> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vgbm %v24, 49152 ++; CHECK: br %r14 ++ ret <2 x i16> ++} ++ ++; Test an all-zeros v4i16 that gets promoted to v8i16. ++define <4 x i16> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <4 x i16> zeroinitializer ++} ++ ++; Test a mixed v4i16 that gets promoted to v8i16 (mask 0x7200). 
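For the promoted subvector tests, the constant occupies the leftmost bytes of the register, so mask 0x7200 pins down f9's halfwords completely: bytes 1 to 3 and byte 6 are 0xff. A hedged reconstruction, not part of the patch (function name mine):

    define <4 x i16> @mask_7200() {
      ; vgbm %v24, 29184: halfwords 0x00ff, 0xffff, 0x0000, 0xff00
      ret <4 x i16> <i16 255, i16 -1, i16 0, i16 -256>
    }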
++define <4 x i16> @f9() { ++; CHECK-LABEL: f9: ++; CHECK: vgbm %v24, 29184 ++; CHECK: br %r14 ++ ret <4 x i16> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-03.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-03.ll +@@ -0,0 +1,59 @@ ++; Test vector byte masks, v4i32 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test an all-zeros vector. ++define <4 x i32> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <4 x i32> zeroinitializer ++} ++ ++; Test an all-ones vector. ++define <4 x i32> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgbm %v24, 65535 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a mixed vector (mask 0x8c76). ++define <4 x i32> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgbm %v24, 35958 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test that undefs are treated as zero (mask 0x8076). ++define <4 x i32> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgbm %v24, 32886 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. ++define <4 x i32> @f5() { ++; CHECK-LABEL: f5: ++; CHECK-NOT: vgbm ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test an all-zeros v2i32 that gets promoted to v4i32. ++define <2 x i32> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <2 x i32> zeroinitializer ++} ++ ++; Test a mixed v2i32 that gets promoted to v4i32 (mask 0xae00). ++define <2 x i32> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vgbm %v24, 44544 ++; CHECK: br %r14 ++ ret <2 x i32> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-04.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-04.ll +@@ -0,0 +1,43 @@ ++; Test vector byte masks, v2i64 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test an all-zeros vector. ++define <2 x i64> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <2 x i64> zeroinitializer ++} ++ ++; Test an all-ones vector. ++define <2 x i64> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgbm %v24, 65535 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a mixed vector (mask 0x8c76). ++define <2 x i64> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgbm %v24, 35958 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test that undefs are treated as zero (mask 0x8c00). ++define <2 x i64> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgbm %v24, 35840 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. ++define <2 x i64> @f5() { ++; CHECK-LABEL: f5: ++; CHECK-NOT: vgbm ++; CHECK: br %r14 ++ ret <2 x i64> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-05.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-05.ll +@@ -0,0 +1,63 @@ ++; Test vector byte masks, v4f32 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test an all-zeros vector. ++define <4 x float> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <4 x float> zeroinitializer ++} ++ ++; Test an all-ones vector. ++define <4 x float> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgbm %v24, 65535 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a mixed vector (mask 0xc731). 
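Floating-point byte masks follow the same VGBM rule; mask 0xc731 yields the four words 0xffff0000, 0x00ffffff, 0x0000ffff and 0x000000ff. Written with a constant-expression bitcast as a hedged sketch (the test itself presumably spells these as hexadecimal float literals; function name mine):

    define <4 x float> @mask_c731() {
      ; vgbm %v24, 50993, viewed as four float bit patterns
      ret <4 x float> bitcast (<4 x i32> <i32 -65536, i32 16777215,
                                          i32 65535, i32 255> to <4 x float>)
    }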
++define <4 x float> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgbm %v24, 50993 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test that undefs are treated as zero (mask 0xc031). ++define <4 x float> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgbm %v24, 49201 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. ++define <4 x float> @f5() { ++; CHECK-LABEL: f5: ++; CHECK-NOT: vgbm ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test an all-zeros v2f32 that gets promoted to v4f32. ++define <2 x float> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <2 x float> zeroinitializer ++} ++ ++; Test a mixed v2f32 that gets promoted to v4f32 (mask 0xc700). ++define <2 x float> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vgbm %v24, 50944 ++; CHECK: br %r14 ++ ret <2 x float> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-06.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-06.ll +@@ -0,0 +1,43 @@ ++; Test vector byte masks, v2f64 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test an all-zeros vector. ++define <2 x double> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgbm %v24, 0 ++; CHECK: br %r14 ++ ret <2 x double> zeroinitializer ++} ++ ++; Test an all-ones vector. ++define <2 x double> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgbm %v24, 65535 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a mixed vector (mask 0x8c76). ++define <2 x double> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgbm %v24, 35958 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test that undefs are treated as zero (mask 0x8c00). ++define <2 x double> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgbm %v24, 35840 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test that we don't use VGBM if one of the bytes is not 0 or 0xff. ++define <2 x double> @f5() { ++; CHECK-LABEL: f5: ++; CHECK-NOT: vgbm ++; CHECK: br %r14 ++ ret <2 x double> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-07.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-07.ll +@@ -0,0 +1,229 @@ ++; Test vector replicates, v16i8 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a byte-granularity replicate with the lowest useful value. ++define <16 x i8> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vrepib %v24, 1 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a byte-granularity replicate with an arbitrary value. ++define <16 x i8> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vrepib %v24, -55 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a byte-granularity replicate with the highest useful value. ++define <16 x i8> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vrepib %v24, -2 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a halfword-granularity replicate with the lowest useful value. ++define <16 x i8> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vrepih %v24, 1 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a halfword-granularity replicate with an arbitrary value. ++define <16 x i8> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vrepih %v24, 25650 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a halfword-granularity replicate with the highest useful value. 
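VREPI replicates a sign-extended 16-bit immediate at the chosen element width, so the halfword case that follows fills the register with 0xfffe; -1 is not "useful" here because an all-ones register is already covered by VGBM 65535. A hedged sketch of the v16i8 value f6 presumably returns, not part of the patch (function name mine):

    define <16 x i8> @repih_minus2() {
      ; vrepih %v24, -2: halfword 0xfffe, i.e. bytes 0xff, 0xfe repeated
      ret <16 x i8> <i8 -1, i8 -2, i8 -1, i8 -2, i8 -1, i8 -2, i8 -1, i8 -2,
                     i8 -1, i8 -2, i8 -1, i8 -2, i8 -1, i8 -2, i8 -1, i8 -2>
    }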
++define <16 x i8> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vrepih %v24, -2 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a word-granularity replicate with the lowest useful positive value. ++define <16 x i8> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vrepif %v24, 1 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a word-granularity replicate with the highest in-range value. ++define <16 x i8> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vrepif %v24, 32767 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a word-granularity replicate with the next highest value. ++; This cannot use VREPIF. ++define <16 x i8> @f9() { ++; CHECK-LABEL: f9: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a word-granularity replicate with the lowest in-range value. ++define <16 x i8> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vrepif %v24, -32768 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a word-granularity replicate with the next lowest value. ++; This cannot use VREPIF. ++define <16 x i8> @f11() { ++; CHECK-LABEL: f11: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a word-granularity replicate with the highest useful negative value. ++define <16 x i8> @f12() { ++; CHECK-LABEL: f12: ++; CHECK: vrepif %v24, -2 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a doubleword-granularity replicate with the lowest useful positive ++; value. ++define <16 x i8> @f13() { ++; CHECK-LABEL: f13: ++; CHECK: vrepig %v24, 1 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a doubleword-granularity replicate with the highest in-range value. ++define <16 x i8> @f14() { ++; CHECK-LABEL: f14: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a doubleword-granularity replicate with the next highest value. ++; This cannot use VREPIG. ++define <16 x i8> @f15() { ++; CHECK-LABEL: f15: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a doubleword-granularity replicate with the lowest in-range value. ++define <16 x i8> @f16() { ++; CHECK-LABEL: f16: ++; CHECK: vrepig %v24, -32768 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a doubleword-granularity replicate with the next lowest value. ++; This cannot use VREPIG. ++define <16 x i8> @f17() { ++; CHECK-LABEL: f17: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a doubleword-granularity replicate with the highest useful negative ++; value. ++define <16 x i8> @f18() { ++; CHECK-LABEL: f18: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Repeat f14 with undefs optimistically treated as 0. ++define <16 x i8> @f19() { ++; CHECK-LABEL: f19: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Repeat f18 with undefs optimistically treated as -1. ++define <16 x i8> @f20() { ++; CHECK-LABEL: f20: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <16 x i8> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-08.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-08.ll +@@ -0,0 +1,189 @@ ++; Test vector replicates, v8i16 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a byte-granularity replicate with the lowest useful value. ++define <8 x i16> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vrepib %v24, 1 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a byte-granularity replicate with an arbitrary value. 
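In this v8i16 file the same byte replicates read as halfwords: vrepib -55 fills every byte with 0xc9, hence every halfword with 0xc9c9. A hedged sketch of the constant f2 presumably returns, not part of the patch (function name mine):

    define <8 x i16> @repib_minus55() {
      ; vrepib %v24, -55: halfword 0xc9c9 == -13879
      ret <8 x i16> <i16 -13879, i16 -13879, i16 -13879, i16 -13879,
                     i16 -13879, i16 -13879, i16 -13879, i16 -13879>
    }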
++define <8 x i16> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vrepib %v24, -55 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a byte-granularity replicate with the highest useful value. ++define <8 x i16> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vrepib %v24, -2 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a halfword-granularity replicate with the lowest useful value. ++define <8 x i16> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vrepih %v24, 1 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a halfword-granularity replicate with an arbitrary value. ++define <8 x i16> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vrepih %v24, 25650 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a halfword-granularity replicate with the highest useful value. ++define <8 x i16> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vrepih %v24, -2 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a word-granularity replicate with the lowest useful positive value. ++define <8 x i16> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vrepif %v24, 1 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a word-granularity replicate with the highest in-range value. ++define <8 x i16> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vrepif %v24, 32767 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a word-granularity replicate with the next highest value. ++; This cannot use VREPIF. ++define <8 x i16> @f9() { ++; CHECK-LABEL: f9: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a word-granularity replicate with the lowest in-range value. ++define <8 x i16> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vrepif %v24, -32768 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a word-granularity replicate with the next lowest value. ++; This cannot use VREPIF. ++define <8 x i16> @f11() { ++; CHECK-LABEL: f11: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a word-granularity replicate with the highest useful negative value. ++define <8 x i16> @f12() { ++; CHECK-LABEL: f12: ++; CHECK: vrepif %v24, -2 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a doubleword-granularity replicate with the lowest useful positive ++; value. ++define <8 x i16> @f13() { ++; CHECK-LABEL: f13: ++; CHECK: vrepig %v24, 1 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a doubleword-granularity replicate with the highest in-range value. ++define <8 x i16> @f14() { ++; CHECK-LABEL: f14: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a doubleword-granularity replicate with the next highest value. ++; This cannot use VREPIG. ++define <8 x i16> @f15() { ++; CHECK-LABEL: f15: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a doubleword-granularity replicate with the lowest in-range value. ++define <8 x i16> @f16() { ++; CHECK-LABEL: f16: ++; CHECK: vrepig %v24, -32768 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a doubleword-granularity replicate with the next lowest value. ++; This cannot use VREPIG. ++define <8 x i16> @f17() { ++; CHECK-LABEL: f17: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a doubleword-granularity replicate with the highest useful negative ++; value. ++define <8 x i16> @f18() { ++; CHECK-LABEL: f18: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Repeat f14 with undefs optimistically treated as 0. ++define <8 x i16> @f19() { ++; CHECK-LABEL: f19: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Repeat f18 with undefs optimistically treated as -1. 
++define <8 x i16> @f20() { ++; CHECK-LABEL: f20: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <8 x i16> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-09.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-09.ll +@@ -0,0 +1,169 @@ ++; Test vector replicates, v4i32 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a byte-granularity replicate with the lowest useful value. ++define <4 x i32> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vrepib %v24, 1 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a byte-granularity replicate with an arbitrary value. ++define <4 x i32> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vrepib %v24, -55 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a byte-granularity replicate with the highest useful value. ++define <4 x i32> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vrepib %v24, -2 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a halfword-granularity replicate with the lowest useful value. ++define <4 x i32> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vrepih %v24, 1 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a halfword-granularity replicate with an arbitrary value. ++define <4 x i32> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vrepih %v24, 25650 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a halfword-granularity replicate with the highest useful value. ++define <4 x i32> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vrepih %v24, -2 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a word-granularity replicate with the lowest useful positive value. ++define <4 x i32> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vrepif %v24, 1 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a word-granularity replicate with the highest in-range value. ++define <4 x i32> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vrepif %v24, 32767 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a word-granularity replicate with the next highest value. ++; This cannot use VREPIF. ++define <4 x i32> @f9() { ++; CHECK-LABEL: f9: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a word-granularity replicate with the lowest in-range value. ++define <4 x i32> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vrepif %v24, -32768 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a word-granularity replicate with the next lowest value. ++; This cannot use VREPIF. ++define <4 x i32> @f11() { ++; CHECK-LABEL: f11: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a word-granularity replicate with the highest useful negative value. ++define <4 x i32> @f12() { ++; CHECK-LABEL: f12: ++; CHECK: vrepif %v24, -2 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a doubleword-granularity replicate with the lowest useful positive ++; value. ++define <4 x i32> @f13() { ++; CHECK-LABEL: f13: ++; CHECK: vrepig %v24, 1 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a doubleword-granularity replicate with the highest in-range value. ++define <4 x i32> @f14() { ++; CHECK-LABEL: f14: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a doubleword-granularity replicate with the next highest value. ++; This cannot use VREPIG. ++define <4 x i32> @f15() { ++; CHECK-LABEL: f15: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a doubleword-granularity replicate with the lowest in-range value. 
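The replicate immediate is a signed 16-bit field, which is why 32767 and -32768 bracket the usable range throughout these files. Viewed as v4i32, a doubleword replicate of -32768 is the word pair <-1, -32768>. A hedged reconstruction of the f16 that follows, not part of the patch (function name mine):

    define <4 x i32> @repig_minus32768() {
      ; vrepig %v24, -32768: doubleword 0xffffffffffff8000
      ret <4 x i32> <i32 -1, i32 -32768, i32 -1, i32 -32768>
    }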
++define <4 x i32> @f16() { ++; CHECK-LABEL: f16: ++; CHECK: vrepig %v24, -32768 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a doubleword-granularity replicate with the next lowest value. ++; This cannot use VREPIG. ++define <4 x i32> @f17() { ++; CHECK-LABEL: f17: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a doubleword-granularity replicate with the highest useful negative ++; value. ++define <4 x i32> @f18() { ++; CHECK-LABEL: f18: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Repeat f14 with undefs optimistically treated as 0, 32767. ++define <4 x i32> @f19() { ++; CHECK-LABEL: f19: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Repeat f18 with undefs optimistically treated as -2, -1. ++define <4 x i32> @f20() { ++; CHECK-LABEL: f20: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <4 x i32> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-10.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-10.ll +@@ -0,0 +1,169 @@ ++; Test vector replicates, v2i64 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a byte-granularity replicate with the lowest useful value. ++define <2 x i64> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vrepib %v24, 1 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a byte-granularity replicate with an arbitrary value. ++define <2 x i64> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vrepib %v24, -55 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a byte-granularity replicate with the highest useful value. ++define <2 x i64> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vrepib %v24, -2 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a halfword-granularity replicate with the lowest useful value. ++define <2 x i64> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vrepih %v24, 1 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a halfword-granularity replicate with an arbitrary value. ++define <2 x i64> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vrepih %v24, 25650 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a halfword-granularity replicate with the highest useful value. ++define <2 x i64> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vrepih %v24, -2 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a word-granularity replicate with the lowest useful positive value. ++define <2 x i64> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vrepif %v24, 1 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a word-granularity replicate with the highest in-range value. ++define <2 x i64> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vrepif %v24, 32767 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a word-granularity replicate with the next highest value. ++; This cannot use VREPIF. ++define <2 x i64> @f9() { ++; CHECK-LABEL: f9: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a word-granularity replicate with the lowest in-range value. ++define <2 x i64> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vrepif %v24, -32768 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a word-granularity replicate with the next lowest value. ++; This cannot use VREPIF. ++define <2 x i64> @f11() { ++; CHECK-LABEL: f11: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a word-granularity replicate with the highest useful negative value. 
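In v2i64 terms a word-granularity replicate of -2 doubles up to 0xfffffffefffffffe per element. A hedged sketch of what the f12 below presumably returns, not part of the patch (function name mine):

    define <2 x i64> @repif_minus2() {
      ; vrepif %v24, -2: each doubleword is 0xfffffffefffffffe
      ret <2 x i64> <i64 -4294967298, i64 -4294967298>
    }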
++define <2 x i64> @f12() { ++; CHECK-LABEL: f12: ++; CHECK: vrepif %v24, -2 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a doubleword-granularity replicate with the lowest useful positive ++; value. ++define <2 x i64> @f13() { ++; CHECK-LABEL: f13: ++; CHECK: vrepig %v24, 1 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a doubleword-granularity replicate with the highest in-range value. ++define <2 x i64> @f14() { ++; CHECK-LABEL: f14: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a doubleword-granularity replicate with the next highest value. ++; This cannot use VREPIG. ++define <2 x i64> @f15() { ++; CHECK-LABEL: f15: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a doubleword-granularity replicate with the lowest in-range value. ++define <2 x i64> @f16() { ++; CHECK-LABEL: f16: ++; CHECK: vrepig %v24, -32768 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a doubleword-granularity replicate with the next lowest value. ++; This cannot use VREPIG. ++define <2 x i64> @f17() { ++; CHECK-LABEL: f17: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a doubleword-granularity replicate with the highest useful negative ++; value. ++define <2 x i64> @f18() { ++; CHECK-LABEL: f18: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Repeat f14 with undefs optimistically treated as 32767. ++define <2 x i64> @f19() { ++; CHECK-LABEL: f19: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Repeat f18 with undefs optimistically treated as -2. ++define <2 x i64> @f20() { ++; CHECK-LABEL: f20: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <2 x i64> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-11.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-11.ll +@@ -0,0 +1,189 @@ ++; Test vector replicates, v4f32 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a byte-granularity replicate with the lowest useful value. ++define <4 x float> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vrepib %v24, 1 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a byte-granularity replicate with an arbitrary value. ++define <4 x float> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vrepib %v24, -55 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a byte-granularity replicate with the highest useful value. ++define <4 x float> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vrepib %v24, -2 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a halfword-granularity replicate with the lowest useful value. ++define <4 x float> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vrepih %v24, 1 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a halfword-granularity replicate with an arbitrary value. ++define <4 x float> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vrepih %v24, 25650 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a halfword-granularity replicate with the highest useful value. ++define <4 x float> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vrepih %v24, -2 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a word-granularity replicate with the lowest useful positive value. ++define <4 x float> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vrepif %v24, 1 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a word-granularity replicate with the highest in-range value. 
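For the float files the replicates are pure bit patterns: vrepif 32767 makes every word 0x00007fff, a tiny subnormal. A hedged sketch using a constant-expression bitcast (the test itself presumably uses hexadecimal float literals; function name mine):

    define <4 x float> @repif_32767() {
      ; vrepif %v24, 32767: each word holds the bit pattern 0x00007fff
      ret <4 x float> bitcast (<4 x i32> <i32 32767, i32 32767,
                                          i32 32767, i32 32767> to <4 x float>)
    }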
++define <4 x float> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vrepif %v24, 32767 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a word-granularity replicate with the next highest value. ++; This cannot use VREPIF. ++define <4 x float> @f9() { ++; CHECK-LABEL: f9: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a word-granularity replicate with the lowest in-range value. ++define <4 x float> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vrepif %v24, -32768 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a word-granularity replicate with the next lowest value. ++; This cannot use VREPIF. ++define <4 x float> @f11() { ++; CHECK-LABEL: f11: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a word-granularity replicate with the highest useful negative value. ++define <4 x float> @f12() { ++; CHECK-LABEL: f12: ++; CHECK: vrepif %v24, -2 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a doubleword-granularity replicate with the lowest useful positive ++; value. ++define <4 x float> @f13() { ++; CHECK-LABEL: f13: ++; CHECK: vrepig %v24, 1 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a doubleword-granularity replicate with the highest in-range value. ++define <4 x float> @f14() { ++; CHECK-LABEL: f14: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a doubleword-granularity replicate with the next highest value. ++; This cannot use VREPIG. ++define <4 x float> @f15() { ++; CHECK-LABEL: f15: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a doubleword-granularity replicate with the lowest in-range value. ++define <4 x float> @f16() { ++; CHECK-LABEL: f16: ++; CHECK: vrepig %v24, -32768 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a doubleword-granularity replicate with the next lowest value. ++; This cannot use VREPIG. ++define <4 x float> @f17() { ++; CHECK-LABEL: f17: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a doubleword-granularity replicate with the highest useful negative ++; value. ++define <4 x float> @f18() { ++; CHECK-LABEL: f18: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Repeat f14 with undefs optimistically treated as 0, 32767. ++define <4 x float> @f19() { ++; CHECK-LABEL: f19: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Repeat f18 with undefs optimistically treated as -2, -1. ++define <4 x float> @f20() { ++; CHECK-LABEL: f20: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <4 x float> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-12.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-12.ll +@@ -0,0 +1,169 @@ ++; Test vector replicates, v2f64 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a byte-granularity replicate with the lowest useful value. ++define <2 x double> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vrepib %v24, 1 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a byte-granularity replicate with an arbitrary value. ++define <2 x double> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vrepib %v24, -55 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a byte-granularity replicate with the highest useful value. 
++define <2 x double> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vrepib %v24, -2 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a halfword-granularity replicate with the lowest useful value. ++define <2 x double> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vrepih %v24, 1 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a halfword-granularity replicate with an arbitrary value. ++define <2 x double> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vrepih %v24, 25650 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a halfword-granularity replicate with the highest useful value. ++define <2 x double> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vrepih %v24, -2 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a word-granularity replicate with the lowest useful positive value. ++define <2 x double> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vrepif %v24, 1 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a word-granularity replicate with the highest in-range value. ++define <2 x double> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vrepif %v24, 32767 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a word-granularity replicate with the next highest value. ++; This cannot use VREPIF. ++define <2 x double> @f9() { ++; CHECK-LABEL: f9: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a word-granularity replicate with the lowest in-range value. ++define <2 x double> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vrepif %v24, -32768 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a word-granularity replicate with the next lowest value. ++; This cannot use VREPIF. ++define <2 x double> @f11() { ++; CHECK-LABEL: f11: ++; CHECK-NOT: vrepif ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a word-granularity replicate with the highest useful negative value. ++define <2 x double> @f12() { ++; CHECK-LABEL: f12: ++; CHECK: vrepif %v24, -2 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a doubleword-granularity replicate with the lowest useful positive ++; value. ++define <2 x double> @f13() { ++; CHECK-LABEL: f13: ++; CHECK: vrepig %v24, 1 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a doubleword-granularity replicate with the highest in-range value. ++define <2 x double> @f14() { ++; CHECK-LABEL: f14: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a doubleword-granularity replicate with the next highest value. ++; This cannot use VREPIG. ++define <2 x double> @f15() { ++; CHECK-LABEL: f15: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a doubleword-granularity replicate with the lowest in-range value. ++define <2 x double> @f16() { ++; CHECK-LABEL: f16: ++; CHECK: vrepig %v24, -32768 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a doubleword-granularity replicate with the next lowest value. ++; This cannot use VREPIG. ++define <2 x double> @f17() { ++; CHECK-LABEL: f17: ++; CHECK-NOT: vrepig ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a doubleword-granularity replicate with the highest useful negative ++; value. ++define <2 x double> @f18() { ++; CHECK-LABEL: f18: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Repeat f14 with undefs optimistically treated as 32767. ++define <2 x double> @f19() { ++; CHECK-LABEL: f19: ++; CHECK: vrepig %v24, 32767 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Repeat f18 with undefs optimistically treated as -2. 
++define <2 x double> @f20() { ++; CHECK-LABEL: f20: ++; CHECK: vrepig %v24, -2 ++; CHECK: br %r14 ++ ret <2 x double> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-13.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-13.ll +@@ -0,0 +1,193 @@ ++; Test vector replicates that use VECTOR GENERATE MASK, v16i8 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a word-granularity replicate with the lowest value that cannot use ++; VREPIF. ++define <16 x i8> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgmf %v24, 16, 16 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a word-granularity replicate that has the lower 17 bits set. ++define <16 x i8> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgmf %v24, 15, 31 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a word-granularity replicate that has the upper 15 bits set. ++define <16 x i8> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgmf %v24, 0, 14 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a word-granularity replicate that has middle bits set. ++define <16 x i8> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgmf %v24, 12, 17 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a word-granularity replicate with a wrap-around mask. ++define <16 x i8> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vgmf %v24, 17, 15 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a doubleword-granularity replicate with the lowest value that cannot ++; use VREPIG. ++define <16 x i8> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vgmg %v24, 48, 48 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a doubleword-granularity replicate that has the lower 22 bits set. ++define <16 x i8> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vgmg %v24, 42, 63 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a doubleword-granularity replicate that has the upper 45 bits set. ++define <16 x i8> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vgmg %v24, 0, 44 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a doubleword-granularity replicate that has middle bits set. ++define <16 x i8> @f9() { ++; CHECK-LABEL: f9: ++; CHECK: vgmg %v24, 31, 42 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Test a doubleword-granularity replicate with a wrap-around mask. ++define <16 x i8> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vgmg %v24, 18, 0 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Retest f1 with arbitrary undefs instead of 0s. ++define <16 x i8> @f11() { ++; CHECK-LABEL: f11: ++; CHECK: vgmf %v24, 16, 16 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Try a case where we want consistent undefs to be treated as 0. ++define <16 x i8> @f12() { ++; CHECK-LABEL: f12: ++; CHECK: vgmf %v24, 15, 23 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; ...and again with the lower bits of the replicated constant. ++define <16 x i8> @f13() { ++; CHECK-LABEL: f13: ++; CHECK: vgmf %v24, 15, 22 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Try a case where we want consistent undefs to be treated as -1. ++define <16 x i8> @f14() { ++; CHECK-LABEL: f14: ++; CHECK: vgmf %v24, 28, 8 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; ...and again with the lower bits of the replicated constant. ++define <16 x i8> @f15() { ++; CHECK-LABEL: f15: ++; CHECK: vgmf %v24, 18, 3 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Repeat f9 with arbitrary undefs. 
++define <16 x i8> @f16() { ++; CHECK-LABEL: f16: ++; CHECK: vgmg %v24, 31, 42 ++; CHECK: br %r14 ++ ret <16 x i8> ++} ++ ++; Try a case where we want some consistent undefs to be treated as 0 ++; and some to be treated as 255. ++define <16 x i8> @f17() { ++; CHECK-LABEL: f17: ++; CHECK: vgmg %v24, 23, 35 ++; CHECK: br %r14 ++ ret <16 x i8> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-14.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-14.ll +@@ -0,0 +1,113 @@ ++; Test vector replicates that use VECTOR GENERATE MASK, v8i16 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a word-granularity replicate with the lowest value that cannot use ++; VREPIF. ++define <8 x i16> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgmf %v24, 16, 16 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a word-granularity replicate that has the lower 17 bits set. ++define <8 x i16> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgmf %v24, 15, 31 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a word-granularity replicate that has the upper 15 bits set. ++define <8 x i16> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgmf %v24, 0, 14 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a word-granularity replicate that has middle bits set. ++define <8 x i16> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgmf %v24, 12, 17 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a word-granularity replicate with a wrap-around mask. ++define <8 x i16> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vgmf %v24, 17, 15 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a doubleword-granularity replicate with the lowest value that cannot ++; use VREPIG. ++define <8 x i16> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vgmg %v24, 48, 48 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a doubleword-granularity replicate that has the lower 22 bits set. ++define <8 x i16> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vgmg %v24, 42, 63 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a doubleword-granularity replicate that has the upper 45 bits set. ++define <8 x i16> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vgmg %v24, 0, 44 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a doubleword-granularity replicate that has middle bits set. ++define <8 x i16> @f9() { ++; CHECK-LABEL: f9: ++; CHECK: vgmg %v24, 31, 42 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Test a doubleword-granularity replicate with a wrap-around mask. ++define <8 x i16> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vgmg %v24, 18, 0 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; Retest f1 with arbitrary undefs instead of 0s. ++define <8 x i16> @f11() { ++; CHECK-LABEL: f11: ++; CHECK: vgmf %v24, 16, 16 ++; CHECK: br %r14 ++ ret <8 x i16> ++} ++ ++; ...likewise f9. ++define <8 x i16> @f12() { ++; CHECK-LABEL: f12: ++; CHECK: vgmg %v24, 31, 42 ++; CHECK: br %r14 ++ ret <8 x i16> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-15.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-15.ll +@@ -0,0 +1,85 @@ ++; Test vector replicates that use VECTOR GENERATE MASK, v4i32 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a word-granularity replicate with the lowest value that cannot use ++; VREPIF. 
++define <4 x i32> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgmf %v24, 16, 16 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a word-granularity replicate that has the lower 17 bits set. ++define <4 x i32> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgmf %v24, 15, 31 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a word-granularity replicate that has the upper 15 bits set. ++define <4 x i32> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgmf %v24, 0, 14 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a word-granularity replicate that has middle bits set. ++define <4 x i32> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgmf %v24, 12, 17 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a word-granularity replicate with a wrap-around mask. ++define <4 x i32> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vgmf %v24, 17, 15 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a doubleword-granularity replicate with the lowest value that cannot ++; use VREPIG. ++define <4 x i32> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vgmg %v24, 48, 48 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a doubleword-granularity replicate that has the lower 22 bits set. ++define <4 x i32> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vgmg %v24, 42, 63 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a doubleword-granularity replicate that has the upper 45 bits set. ++define <4 x i32> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vgmg %v24, 0, 44 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a doubleword-granularity replicate that has middle bits set. ++define <4 x i32> @f9() { ++; CHECK-LABEL: f9: ++; CHECK: vgmg %v24, 31, 42 ++; CHECK: br %r14 ++ ret <4 x i32> ++} ++ ++; Test a doubleword-granularity replicate with a wrap-around mask. ++define <4 x i32> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vgmg %v24, 18, 0 ++; CHECK: br %r14 ++ ret <4 x i32> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-16.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-16.ll +@@ -0,0 +1,85 @@ ++; Test vector replicates that use VECTOR GENERATE MASK, v2i64 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a word-granularity replicate with the lowest value that cannot use ++; VREPIF. ++define <2 x i64> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgmf %v24, 16, 16 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a word-granularity replicate that has the lower 17 bits set. ++define <2 x i64> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgmf %v24, 15, 31 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a word-granularity replicate that has the upper 15 bits set. ++define <2 x i64> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgmf %v24, 0, 14 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a word-granularity replicate that has middle bits set. ++define <2 x i64> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgmf %v24, 12, 17 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a word-granularity replicate with a wrap-around mask. ++define <2 x i64> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vgmf %v24, 17, 15 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a doubleword-granularity replicate with the lowest value that cannot ++; use VREPIG. ++define <2 x i64> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vgmg %v24, 48, 48 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a doubleword-granularity replicate that has the lower 22 bits set. 
++define <2 x i64> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vgmg %v24, 42, 63 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a doubleword-granularity replicate that has the upper 45 bits set. ++define <2 x i64> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vgmg %v24, 0, 44 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a doubleword-granularity replicate that has middle bits set. ++define <2 x i64> @f9() { ++; CHECK-LABEL: f9: ++; CHECK: vgmg %v24, 31, 42 ++; CHECK: br %r14 ++ ret <2 x i64> ++} ++ ++; Test a doubleword-granularity replicate with a wrap-around mask. ++define <2 x i64> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vgmg %v24, 18, 0 ++; CHECK: br %r14 ++ ret <2 x i64> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-17.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-17.ll +@@ -0,0 +1,95 @@ ++; Test vector replicates that use VECTOR GENERATE MASK, v4f32 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a word-granularity replicate with the lowest value that cannot use ++; VREPIF. ++define <4 x float> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgmf %v24, 16, 16 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a word-granularity replicate that has the lower 17 bits set. ++define <4 x float> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgmf %v24, 15, 31 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a word-granularity replicate that has the upper 15 bits set. ++define <4 x float> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgmf %v24, 0, 14 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a word-granularity replicate that has middle bits set. ++define <4 x float> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgmf %v24, 2, 8 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a word-granularity replicate with a wrap-around mask. ++define <4 x float> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vgmf %v24, 9, 1 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a doubleword-granularity replicate with the lowest value that cannot ++; use VREPIG. ++define <4 x float> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vgmg %v24, 48, 48 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a doubleword-granularity replicate that has the lower 22 bits set. ++define <4 x float> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vgmg %v24, 42, 63 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a doubleword-granularity replicate that has the upper 45 bits set. ++define <4 x float> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vgmg %v24, 0, 44 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a doubleword-granularity replicate that has middle bits set. ++define <4 x float> @f9() { ++; CHECK-LABEL: f9: ++; CHECK: vgmg %v24, 34, 41 ++; CHECK: br %r14 ++ ret <4 x float> ++} ++ ++; Test a doubleword-granularity replicate with a wrap-around mask. ++define <4 x float> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vgmg %v24, 32, 0 ++; CHECK: br %r14 ++ ret <4 x float> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-const-18.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-const-18.ll +@@ -0,0 +1,85 @@ ++; Test vector replicates that use VECTOR GENERATE MASK, v2f64 version. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a word-granularity replicate with the lowest value that cannot use ++; VREPIF. 
++define <2 x double> @f1() { ++; CHECK-LABEL: f1: ++; CHECK: vgmf %v24, 16, 16 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a word-granularity replicate that has the lower 17 bits set. ++define <2 x double> @f2() { ++; CHECK-LABEL: f2: ++; CHECK: vgmf %v24, 15, 31 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a word-granularity replicate that has the upper 15 bits set. ++define <2 x double> @f3() { ++; CHECK-LABEL: f3: ++; CHECK: vgmf %v24, 0, 14 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a word-granularity replicate that has middle bits set. ++define <2 x double> @f4() { ++; CHECK-LABEL: f4: ++; CHECK: vgmf %v24, 2, 11 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a word-granularity replicate with a wrap-around mask. ++define <2 x double> @f5() { ++; CHECK-LABEL: f5: ++; CHECK: vgmf %v24, 17, 15 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a doubleword-granularity replicate with the lowest value that cannot ++; use VREPIG. ++define <2 x double> @f6() { ++; CHECK-LABEL: f6: ++; CHECK: vgmg %v24, 48, 48 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a doubleword-granularity replicate that has the lower 22 bits set. ++define <2 x double> @f7() { ++; CHECK-LABEL: f7: ++; CHECK: vgmg %v24, 42, 63 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a doubleword-granularity replicate that has the upper 45 bits set. ++define <2 x double> @f8() { ++; CHECK-LABEL: f8: ++; CHECK: vgmg %v24, 0, 44 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a doubleword-granularity replicate that has middle bits set. ++define <2 x double> @f9() { ++; CHECK-LABEL: f9: ++; CHECK: vgmg %v24, 2, 11 ++; CHECK: br %r14 ++ ret <2 x double> ++} ++ ++; Test a doubleword-granularity replicate with a wrap-around mask. ++define <2 x double> @f10() { ++; CHECK-LABEL: f10: ++; CHECK: vgmg %v24, 10, 0 ++; CHECK: br %r14 ++ ret <2 x double> ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-conv-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-conv-01.ll +@@ -0,0 +1,95 @@ ++; Test conversions between integer and float elements. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test conversion of f64s to signed i64s. ++define <2 x i64> @f1(<2 x double> %doubles) { ++; CHECK-LABEL: f1: ++; CHECK: vcgdb %v24, %v24, 0, 5 ++; CHECK: br %r14 ++ %dwords = fptosi <2 x double> %doubles to <2 x i64> ++ ret <2 x i64> %dwords ++} ++ ++; Test conversion of f64s to unsigned i64s. ++define <2 x i64> @f2(<2 x double> %doubles) { ++; CHECK-LABEL: f2: ++; CHECK: vclgdb %v24, %v24, 0, 5 ++; CHECK: br %r14 ++ %dwords = fptoui <2 x double> %doubles to <2 x i64> ++ ret <2 x i64> %dwords ++} ++ ++; Test conversion of signed i64s to f64s. ++define <2 x double> @f3(<2 x i64> %dwords) { ++; CHECK-LABEL: f3: ++; CHECK: vcdgb %v24, %v24, 0, 0 ++; CHECK: br %r14 ++ %doubles = sitofp <2 x i64> %dwords to <2 x double> ++ ret <2 x double> %doubles ++} ++ ++; Test conversion of unsigned i64s to f64s. ++define <2 x double> @f4(<2 x i64> %dwords) { ++; CHECK-LABEL: f4: ++; CHECK: vcdlgb %v24, %v24, 0, 0 ++; CHECK: br %r14 ++ %doubles = uitofp <2 x i64> %dwords to <2 x double> ++ ret <2 x double> %doubles ++} ++ ++; Test conversion of f64s to signed i32s, which must compile. 
++define void @f5(<2 x double> %doubles, <2 x i32> *%ptr) { ++ %words = fptosi <2 x double> %doubles to <2 x i32> ++ store <2 x i32> %words, <2 x i32> *%ptr ++ ret void ++} ++ ++; Test conversion of f64s to unsigned i32s, which must compile. ++define void @f6(<2 x double> %doubles, <2 x i32> *%ptr) { ++ %words = fptoui <2 x double> %doubles to <2 x i32> ++ store <2 x i32> %words, <2 x i32> *%ptr ++ ret void ++} ++ ++; Test conversion of signed i32s to f64s, which must compile. ++define <2 x double> @f7(<2 x i32> *%ptr) { ++ %words = load <2 x i32> *%ptr ++ %doubles = sitofp <2 x i32> %words to <2 x double> ++ ret <2 x double> %doubles ++} ++ ++; Test conversion of unsigned i32s to f64s, which must compile. ++define <2 x double> @f8(<2 x i32> *%ptr) { ++ %words = load <2 x i32> *%ptr ++ %doubles = uitofp <2 x i32> %words to <2 x double> ++ ret <2 x double> %doubles ++} ++ ++; Test conversion of f32s to signed i64s, which must compile. ++define <2 x i64> @f9(<2 x float> *%ptr) { ++ %floats = load <2 x float> *%ptr ++ %dwords = fptosi <2 x float> %floats to <2 x i64> ++ ret <2 x i64> %dwords ++} ++ ++; Test conversion of f32s to unsigned i64s, which must compile. ++define <2 x i64> @f10(<2 x float> *%ptr) { ++ %floats = load <2 x float> *%ptr ++ %dwords = fptoui <2 x float> %floats to <2 x i64> ++ ret <2 x i64> %dwords ++} ++ ++; Test conversion of signed i64s to f32, which must compile. ++define void @f11(<2 x i64> %dwords, <2 x float> *%ptr) { ++ %floats = sitofp <2 x i64> %dwords to <2 x float> ++ store <2 x float> %floats, <2 x float> *%ptr ++ ret void ++} ++ ++; Test conversion of unsigned i64s to f32, which must compile. ++define void @f12(<2 x i64> %dwords, <2 x float> *%ptr) { ++ %floats = uitofp <2 x i64> %dwords to <2 x float> ++ store <2 x float> %floats, <2 x float> *%ptr ++ ret void ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-conv-02.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-conv-02.ll +@@ -0,0 +1,33 @@ ++; Test conversions between different-sized float elements. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test cases where both elements of a v2f64 are converted to f32s. ++define void @f1(<2 x double> %val, <2 x float> *%ptr) { ++; CHECK-LABEL: f1: ++; CHECK: vledb {{%v[0-9]+}}, %v24, 0, 0 ++; CHECK: br %r14 ++ %res = fptrunc <2 x double> %val to <2 x float> ++ store <2 x float> %res, <2 x float> *%ptr ++ ret void ++} ++ ++; Test conversion of an f64 in a vector register to an f32. ++define float @f2(<2 x double> %vec) { ++; CHECK-LABEL: f2: ++; CHECK: wledb %f0, %v24 ++; CHECK: br %r14 ++ %scalar = extractelement <2 x double> %vec, i32 0 ++ %ret = fptrunc double %scalar to float ++ ret float %ret ++} ++ ++; Test conversion of an f32 in a vector register to an f64. 
++define double @f3(<4 x float> %vec) { ++; CHECK-LABEL: f3: ++; CHECK: wldeb %f0, %v24 ++; CHECK: br %r14 ++ %scalar = extractelement <4 x float> %vec, i32 0 ++ %ret = fpext float %scalar to double ++ ret double %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-ctlz-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-ctlz-01.ll +@@ -0,0 +1,81 @@ ++; Test vector count leading zeros ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %src, i1 %is_zero_undef) ++declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %src, i1 %is_zero_undef) ++declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %src, i1 %is_zero_undef) ++declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %src, i1 %is_zero_undef) ++ ++define <16 x i8> @f1(<16 x i8> %a) { ++; CHECK-LABEL: f1: ++; CHECK: vclzb %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @f2(<16 x i8> %a) { ++; CHECK-LABEL: f2: ++; CHECK: vclzb %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 true) ++ ret <16 x i8> %res ++} ++ ++define <8 x i16> @f3(<8 x i16> %a) { ++; CHECK-LABEL: f3: ++; CHECK: vclzh %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @f4(<8 x i16> %a) { ++; CHECK-LABEL: f4: ++; CHECK: vclzh %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 true) ++ ret <8 x i16> %res ++} ++ ++define <4 x i32> @f5(<4 x i32> %a) { ++; CHECK-LABEL: f5: ++; CHECK: vclzf %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @f6(<4 x i32> %a) { ++; CHECK-LABEL: f6: ++; CHECK: vclzf %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 true) ++ ret <4 x i32> %res ++} ++ ++define <2 x i64> @f7(<2 x i64> %a) { ++; CHECK-LABEL: f7: ++; CHECK: vclzg %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @f8(<2 x i64> %a) { ++; CHECK-LABEL: f8: ++; CHECK: vclzg %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true) ++ ret <2 x i64> %res ++} ++ +Index: llvm-36/test/CodeGen/SystemZ/vec-ctpop-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-ctpop-01.ll +@@ -0,0 +1,53 @@ ++; Test vector population-count instruction ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) ++declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) ++declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) ++declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) ++ ++define <16 x i8> @f1(<16 x i8> %a) { ++; CHECK-LABEL: f1: ++; CHECK: vpopct %v24, %v24, 0 ++; CHECK: br %r14 ++ ++ %popcnt = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) ++ ret <16 x i8> %popcnt ++} ++ ++define <8 x i16> @f2(<8 x i16> %a) { ++; CHECK-LABEL: f2: ++; CHECK: vpopct [[T1:%v[0-9]+]], %v24, 0 ++; CHECK: veslh [[T2:%v[0-9]+]], [[T1]], 8 ++; CHECK: vah [[T3:%v[0-9]+]], [[T1]], [[T2]] ++; CHECK: vesrlh %v24, [[T3]], 8 ++; CHECK: br %r14 ++ ++ %popcnt = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) ++ ret <8 x i16> %popcnt ++} ++ 
++define <4 x i32> @f3(<4 x i32> %a) { ++; CHECK-LABEL: f3: ++; CHECK: vpopct [[T1:%v[0-9]+]], %v24, 0 ++; CHECK: vgbm [[T2:%v[0-9]+]], 0 ++; CHECK: vsumb %v24, [[T1]], [[T2]] ++; CHECK: br %r14 ++ ++ %popcnt = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) ++ ret <4 x i32> %popcnt ++} ++ ++define <2 x i64> @f4(<2 x i64> %a) { ++; CHECK-LABEL: f4: ++; CHECK: vpopct [[T1:%v[0-9]+]], %v24, 0 ++; CHECK: vgbm [[T2:%v[0-9]+]], 0 ++; CHECK: vsumb [[T3:%v[0-9]+]], [[T1]], [[T2]] ++; CHECK: vsumgf %v24, [[T3]], [[T2]] ++; CHECK: br %r14 ++ ++ %popcnt = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) ++ ret <2 x i64> %popcnt ++} ++ +Index: llvm-36/test/CodeGen/SystemZ/vec-cttz-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-cttz-01.ll +@@ -0,0 +1,81 @@ ++; Test vector count trailing zeros ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++declare <16 x i8> @llvm.cttz.v16i8(<16 x i8> %src, i1 %is_zero_undef) ++declare <8 x i16> @llvm.cttz.v8i16(<8 x i16> %src, i1 %is_zero_undef) ++declare <4 x i32> @llvm.cttz.v4i32(<4 x i32> %src, i1 %is_zero_undef) ++declare <2 x i64> @llvm.cttz.v2i64(<2 x i64> %src, i1 %is_zero_undef) ++ ++define <16 x i8> @f1(<16 x i8> %a) { ++; CHECK-LABEL: f1: ++; CHECK: vctzb %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @f2(<16 x i8> %a) { ++; CHECK-LABEL: f2: ++; CHECK: vctzb %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true) ++ ret <16 x i8> %res ++} ++ ++define <8 x i16> @f3(<8 x i16> %a) { ++; CHECK-LABEL: f3: ++; CHECK: vctzh %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @f4(<8 x i16> %a) { ++; CHECK-LABEL: f4: ++; CHECK: vctzh %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true) ++ ret <8 x i16> %res ++} ++ ++define <4 x i32> @f5(<4 x i32> %a) { ++; CHECK-LABEL: f5: ++; CHECK: vctzf %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @f6(<4 x i32> %a) { ++; CHECK-LABEL: f6: ++; CHECK: vctzf %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true) ++ ret <4 x i32> %res ++} ++ ++define <2 x i64> @f7(<2 x i64> %a) { ++; CHECK-LABEL: f7: ++; CHECK: vctzg %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @f8(<2 x i64> %a) { ++; CHECK-LABEL: f8: ++; CHECK: vctzg %v24, %v24 ++; CHECK: br %r14 ++ ++ %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) ++ ret <2 x i64> %res ++} ++ +Index: llvm-36/test/CodeGen/SystemZ/vec-div-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-div-01.ll +@@ -0,0 +1,83 @@ ++; Test vector division. There is no native integer support for this, ++; so the integer cases are really a test of the operation legalization code. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8 division. 
++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vlvgp [[REG:%v[0-9]+]], ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 0 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 1 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 2 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 3 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 4 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 5 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 6 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 8 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 9 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 10 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 11 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 12 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 13 ++; CHECK-DAG: vlvgb [[REG]], {{%r[0-5]}}, 14 ++; CHECK: br %r14 ++ %ret = sdiv <16 x i8> %val1, %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 division. ++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vlvgp [[REG:%v[0-9]+]], ++; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 0 ++; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 1 ++; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 2 ++; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 4 ++; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 5 ++; CHECK-DAG: vlvgh [[REG]], {{%r[0-5]}}, 6 ++; CHECK: br %r14 ++ %ret = sdiv <8 x i16> %val1, %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 division. ++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vlvgp [[REG:%v[0-9]+]], ++; CHECK-DAG: vlvgf [[REG]], {{%r[0-5]}}, 0 ++; CHECK-DAG: vlvgf [[REG]], {{%r[0-5]}}, 2 ++; CHECK: br %r14 ++ %ret = sdiv <4 x i32> %val1, %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 division. ++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vlvgp %v24, ++; CHECK: br %r14 ++ %ret = sdiv <2 x i64> %val1, %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test a v2f64 division. ++define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, ++ <2 x double> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vfddb %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = fdiv <2 x double> %val1, %val2 ++ ret <2 x double> %ret ++} ++ ++; Test an f64 division that uses vector registers. ++define double @f6(<2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: wfddb %f0, %v24, %v26 ++; CHECK: br %r14 ++ %scalar1 = extractelement <2 x double> %val1, i32 0 ++ %scalar2 = extractelement <2 x double> %val2, i32 0 ++ %ret = fdiv double %scalar1, %scalar2 ++ ret double %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-extract-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-extract-01.ll +@@ -0,0 +1,13 @@ ++; Verify ReplaceExtractVectorEltOfLoadWithNarrowedLoad fixes ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a memory copy of a v2i32 (via the constant pool). 
++define void @f1(<2 x i32> *%dest) { ++; CHECK-LABEL: f1: ++; CHECK: lgrl [[REG:%r[0-5]]], {{[._A-Za-z0-9]}} ++; CHECK: stg [[REG]], 0(%r2) ++; CHECK: br %r14 ++ store <2 x i32> , <2 x i32> *%dest ++ ret void ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-extract-02.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-extract-02.ll +@@ -0,0 +1,15 @@ ++; Verify ReplaceExtractVectorEltOfLoadWithNarrowedLoad fixes ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a case where a vector extraction can be simplified to a scalar load. ++; The index must be extended from i32 to i64. ++define i32 @f1(<4 x i32> *%ptr, i32 %index) { ++; CHECK-LABEL: f1: ++; CHECK: risbg {{%r[0-5]}}, %r3, 30, 189, 2 ++; CHECK: l %r2, ++; CHECK: br %r14 ++ %vec = load <4 x i32> *%ptr ++ %res = extractelement <4 x i32> %vec, i32 %index ++ ret i32 %res ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-intrinsics.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-intrinsics.ll +@@ -0,0 +1,3335 @@ ++; Test vector intrinsics. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++declare i32 @llvm.s390.lcbb(i8 *, i32) ++declare <16 x i8> @llvm.s390.vlbb(i8 *, i32) ++declare <16 x i8> @llvm.s390.vll(i32, i8 *) ++declare <2 x i64> @llvm.s390.vpdi(<2 x i64>, <2 x i64>, i32) ++declare <16 x i8> @llvm.s390.vperm(<16 x i8>, <16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vpksh(<8 x i16>, <8 x i16>) ++declare <8 x i16> @llvm.s390.vpksf(<4 x i32>, <4 x i32>) ++declare <4 x i32> @llvm.s390.vpksg(<2 x i64>, <2 x i64>) ++declare {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16>, <8 x i16>) ++declare {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32>, <4 x i32>) ++declare {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64>, <2 x i64>) ++declare <16 x i8> @llvm.s390.vpklsh(<8 x i16>, <8 x i16>) ++declare <8 x i16> @llvm.s390.vpklsf(<4 x i32>, <4 x i32>) ++declare <4 x i32> @llvm.s390.vpklsg(<2 x i64>, <2 x i64>) ++declare {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16>, <8 x i16>) ++declare {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32>, <4 x i32>) ++declare {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64>, <2 x i64>) ++declare void @llvm.s390.vstl(<16 x i8>, i32, i8 *) ++declare <8 x i16> @llvm.s390.vuphb(<16 x i8>) ++declare <4 x i32> @llvm.s390.vuphh(<8 x i16>) ++declare <2 x i64> @llvm.s390.vuphf(<4 x i32>) ++declare <8 x i16> @llvm.s390.vuplhb(<16 x i8>) ++declare <4 x i32> @llvm.s390.vuplhh(<8 x i16>) ++declare <2 x i64> @llvm.s390.vuplhf(<4 x i32>) ++declare <8 x i16> @llvm.s390.vuplb(<16 x i8>) ++declare <4 x i32> @llvm.s390.vuplhw(<8 x i16>) ++declare <2 x i64> @llvm.s390.vuplf(<4 x i32>) ++declare <8 x i16> @llvm.s390.vupllb(<16 x i8>) ++declare <4 x i32> @llvm.s390.vupllh(<8 x i16>) ++declare <2 x i64> @llvm.s390.vupllf(<4 x i32>) ++declare <16 x i8> @llvm.s390.vaccb(<16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vacch(<8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vaccf(<4 x i32>, <4 x i32>) ++declare <2 x i64> @llvm.s390.vaccg(<2 x i64>, <2 x i64>) ++declare <16 x i8> @llvm.s390.vaq(<16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vacq(<16 x i8>, <16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vaccq(<16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vacccq(<16 x i8>, <16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vavgb(<16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vavgh(<8 x i16>, <8 x i16>) ++declare <4 x i32> 
@llvm.s390.vavgf(<4 x i32>, <4 x i32>) ++declare <2 x i64> @llvm.s390.vavgg(<2 x i64>, <2 x i64>) ++declare <16 x i8> @llvm.s390.vavglb(<16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vavglh(<8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vavglf(<4 x i32>, <4 x i32>) ++declare <2 x i64> @llvm.s390.vavglg(<2 x i64>, <2 x i64>) ++declare <4 x i32> @llvm.s390.vcksm(<4 x i32>, <4 x i32>) ++declare <8 x i16> @llvm.s390.vgfmb(<16 x i8>, <16 x i8>) ++declare <4 x i32> @llvm.s390.vgfmh(<8 x i16>, <8 x i16>) ++declare <2 x i64> @llvm.s390.vgfmf(<4 x i32>, <4 x i32>) ++declare <16 x i8> @llvm.s390.vgfmg(<2 x i64>, <2 x i64>) ++declare <8 x i16> @llvm.s390.vgfmab(<16 x i8>, <16 x i8>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vgfmah(<8 x i16>, <8 x i16>, <4 x i32>) ++declare <2 x i64> @llvm.s390.vgfmaf(<4 x i32>, <4 x i32>, <2 x i64>) ++declare <16 x i8> @llvm.s390.vgfmag(<2 x i64>, <2 x i64>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vmahb(<16 x i8>, <16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vmahh(<8 x i16>, <8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vmahf(<4 x i32>, <4 x i32>, <4 x i32>) ++declare <16 x i8> @llvm.s390.vmalhb(<16 x i8>, <16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vmalhh(<8 x i16>, <8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vmalhf(<4 x i32>, <4 x i32>, <4 x i32>) ++declare <8 x i16> @llvm.s390.vmaeb(<16 x i8>, <16 x i8>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vmaeh(<8 x i16>, <8 x i16>, <4 x i32>) ++declare <2 x i64> @llvm.s390.vmaef(<4 x i32>, <4 x i32>, <2 x i64>) ++declare <8 x i16> @llvm.s390.vmaleb(<16 x i8>, <16 x i8>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vmaleh(<8 x i16>, <8 x i16>, <4 x i32>) ++declare <2 x i64> @llvm.s390.vmalef(<4 x i32>, <4 x i32>, <2 x i64>) ++declare <8 x i16> @llvm.s390.vmaob(<16 x i8>, <16 x i8>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vmaoh(<8 x i16>, <8 x i16>, <4 x i32>) ++declare <2 x i64> @llvm.s390.vmaof(<4 x i32>, <4 x i32>, <2 x i64>) ++declare <8 x i16> @llvm.s390.vmalob(<16 x i8>, <16 x i8>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vmaloh(<8 x i16>, <8 x i16>, <4 x i32>) ++declare <2 x i64> @llvm.s390.vmalof(<4 x i32>, <4 x i32>, <2 x i64>) ++declare <16 x i8> @llvm.s390.vmhb(<16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vmhh(<8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vmhf(<4 x i32>, <4 x i32>) ++declare <16 x i8> @llvm.s390.vmlhb(<16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vmlhh(<8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vmlhf(<4 x i32>, <4 x i32>) ++declare <8 x i16> @llvm.s390.vmeb(<16 x i8>, <16 x i8>) ++declare <4 x i32> @llvm.s390.vmeh(<8 x i16>, <8 x i16>) ++declare <2 x i64> @llvm.s390.vmef(<4 x i32>, <4 x i32>) ++declare <8 x i16> @llvm.s390.vmleb(<16 x i8>, <16 x i8>) ++declare <4 x i32> @llvm.s390.vmleh(<8 x i16>, <8 x i16>) ++declare <2 x i64> @llvm.s390.vmlef(<4 x i32>, <4 x i32>) ++declare <8 x i16> @llvm.s390.vmob(<16 x i8>, <16 x i8>) ++declare <4 x i32> @llvm.s390.vmoh(<8 x i16>, <8 x i16>) ++declare <2 x i64> @llvm.s390.vmof(<4 x i32>, <4 x i32>) ++declare <8 x i16> @llvm.s390.vmlob(<16 x i8>, <16 x i8>) ++declare <4 x i32> @llvm.s390.vmloh(<8 x i16>, <8 x i16>) ++declare <2 x i64> @llvm.s390.vmlof(<4 x i32>, <4 x i32>) ++declare <16 x i8> @llvm.s390.verllvb(<16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.verllvh(<8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.verllvf(<4 x i32>, <4 x i32>) ++declare <2 x i64> @llvm.s390.verllvg(<2 x i64>, <2 x i64>) ++declare <16 x i8> @llvm.s390.verllb(<16 x i8>, i32) ++declare <8 x 
i16> @llvm.s390.verllh(<8 x i16>, i32) ++declare <4 x i32> @llvm.s390.verllf(<4 x i32>, i32) ++declare <2 x i64> @llvm.s390.verllg(<2 x i64>, i32) ++declare <16 x i8> @llvm.s390.verimb(<16 x i8>, <16 x i8>, <16 x i8>, i32) ++declare <8 x i16> @llvm.s390.verimh(<8 x i16>, <8 x i16>, <8 x i16>, i32) ++declare <4 x i32> @llvm.s390.verimf(<4 x i32>, <4 x i32>, <4 x i32>, i32) ++declare <2 x i64> @llvm.s390.verimg(<2 x i64>, <2 x i64>, <2 x i64>, i32) ++declare <16 x i8> @llvm.s390.vsl(<16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vslb(<16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vsra(<16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vsrab(<16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vsrl(<16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vsrlb(<16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vsldb(<16 x i8>, <16 x i8>, i32) ++declare <16 x i8> @llvm.s390.vscbib(<16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vscbih(<8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vscbif(<4 x i32>, <4 x i32>) ++declare <2 x i64> @llvm.s390.vscbig(<2 x i64>, <2 x i64>) ++declare <16 x i8> @llvm.s390.vsq(<16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vsbiq(<16 x i8>, <16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vscbiq(<16 x i8>, <16 x i8>) ++declare <16 x i8> @llvm.s390.vsbcbiq(<16 x i8>, <16 x i8>, <16 x i8>) ++declare <4 x i32> @llvm.s390.vsumb(<16 x i8>, <16 x i8>) ++declare <4 x i32> @llvm.s390.vsumh(<8 x i16>, <8 x i16>) ++declare <2 x i64> @llvm.s390.vsumgh(<8 x i16>, <8 x i16>) ++declare <2 x i64> @llvm.s390.vsumgf(<4 x i32>, <4 x i32>) ++declare <16 x i8> @llvm.s390.vsumqf(<4 x i32>, <4 x i32>) ++declare <16 x i8> @llvm.s390.vsumqg(<2 x i64>, <2 x i64>) ++declare i32 @llvm.s390.vtm(<16 x i8>, <16 x i8>) ++declare {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8>, <16 x i8>) ++declare {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16>, <8 x i16>) ++declare {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32>, <4 x i32>) ++declare {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64>, <2 x i64>) ++declare {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8>, <16 x i8>) ++declare {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16>, <8 x i16>) ++declare {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32>, <4 x i32>) ++declare {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64>, <2 x i64>) ++declare {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8>, <16 x i8>) ++declare {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16>, <8 x i16>) ++declare {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32>, <4 x i32>) ++declare {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64>, <2 x i64>) ++declare <16 x i8> @llvm.s390.vfaeb(<16 x i8>, <16 x i8>, i32) ++declare <8 x i16> @llvm.s390.vfaeh(<8 x i16>, <8 x i16>, i32) ++declare <4 x i32> @llvm.s390.vfaef(<4 x i32>, <4 x i32>, i32) ++declare {<16 x i8>, i32} @llvm.s390.vfaebs(<16 x i8>, <16 x i8>, i32) ++declare {<8 x i16>, i32} @llvm.s390.vfaehs(<8 x i16>, <8 x i16>, i32) ++declare {<4 x i32>, i32} @llvm.s390.vfaefs(<4 x i32>, <4 x i32>, i32) ++declare <16 x i8> @llvm.s390.vfaezb(<16 x i8>, <16 x i8>, i32) ++declare <8 x i16> @llvm.s390.vfaezh(<8 x i16>, <8 x i16>, i32) ++declare <4 x i32> @llvm.s390.vfaezf(<4 x i32>, <4 x i32>, i32) ++declare {<16 x i8>, i32} @llvm.s390.vfaezbs(<16 x i8>, <16 x i8>, i32) ++declare {<8 x i16>, i32} @llvm.s390.vfaezhs(<8 x i16>, <8 x i16>, i32) ++declare {<4 x i32>, i32} @llvm.s390.vfaezfs(<4 x i32>, <4 x i32>, i32) ++declare <16 x i8> @llvm.s390.vfeeb(<16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vfeeh(<8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vfeef(<4 x 
i32>, <4 x i32>) ++declare {<16 x i8>, i32} @llvm.s390.vfeebs(<16 x i8>, <16 x i8>) ++declare {<8 x i16>, i32} @llvm.s390.vfeehs(<8 x i16>, <8 x i16>) ++declare {<4 x i32>, i32} @llvm.s390.vfeefs(<4 x i32>, <4 x i32>) ++declare <16 x i8> @llvm.s390.vfeezb(<16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vfeezh(<8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vfeezf(<4 x i32>, <4 x i32>) ++declare {<16 x i8>, i32} @llvm.s390.vfeezbs(<16 x i8>, <16 x i8>) ++declare {<8 x i16>, i32} @llvm.s390.vfeezhs(<8 x i16>, <8 x i16>) ++declare {<4 x i32>, i32} @llvm.s390.vfeezfs(<4 x i32>, <4 x i32>) ++declare <16 x i8> @llvm.s390.vfeneb(<16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vfeneh(<8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vfenef(<4 x i32>, <4 x i32>) ++declare {<16 x i8>, i32} @llvm.s390.vfenebs(<16 x i8>, <16 x i8>) ++declare {<8 x i16>, i32} @llvm.s390.vfenehs(<8 x i16>, <8 x i16>) ++declare {<4 x i32>, i32} @llvm.s390.vfenefs(<4 x i32>, <4 x i32>) ++declare <16 x i8> @llvm.s390.vfenezb(<16 x i8>, <16 x i8>) ++declare <8 x i16> @llvm.s390.vfenezh(<8 x i16>, <8 x i16>) ++declare <4 x i32> @llvm.s390.vfenezf(<4 x i32>, <4 x i32>) ++declare {<16 x i8>, i32} @llvm.s390.vfenezbs(<16 x i8>, <16 x i8>) ++declare {<8 x i16>, i32} @llvm.s390.vfenezhs(<8 x i16>, <8 x i16>) ++declare {<4 x i32>, i32} @llvm.s390.vfenezfs(<4 x i32>, <4 x i32>) ++declare <16 x i8> @llvm.s390.vistrb(<16 x i8>) ++declare <8 x i16> @llvm.s390.vistrh(<8 x i16>) ++declare <4 x i32> @llvm.s390.vistrf(<4 x i32>) ++declare {<16 x i8>, i32} @llvm.s390.vistrbs(<16 x i8>) ++declare {<8 x i16>, i32} @llvm.s390.vistrhs(<8 x i16>) ++declare {<4 x i32>, i32} @llvm.s390.vistrfs(<4 x i32>) ++declare <16 x i8> @llvm.s390.vstrcb(<16 x i8>, <16 x i8>, <16 x i8>, i32) ++declare <8 x i16> @llvm.s390.vstrch(<8 x i16>, <8 x i16>, <8 x i16>, i32) ++declare <4 x i32> @llvm.s390.vstrcf(<4 x i32>, <4 x i32>, <4 x i32>, i32) ++declare {<16 x i8>, i32} @llvm.s390.vstrcbs(<16 x i8>, <16 x i8>, <16 x i8>, ++ i32) ++declare {<8 x i16>, i32} @llvm.s390.vstrchs(<8 x i16>, <8 x i16>, <8 x i16>, ++ i32) ++declare {<4 x i32>, i32} @llvm.s390.vstrcfs(<4 x i32>, <4 x i32>, <4 x i32>, ++ i32) ++declare <16 x i8> @llvm.s390.vstrczb(<16 x i8>, <16 x i8>, <16 x i8>, i32) ++declare <8 x i16> @llvm.s390.vstrczh(<8 x i16>, <8 x i16>, <8 x i16>, i32) ++declare <4 x i32> @llvm.s390.vstrczf(<4 x i32>, <4 x i32>, <4 x i32>, i32) ++declare {<16 x i8>, i32} @llvm.s390.vstrczbs(<16 x i8>, <16 x i8>, <16 x i8>, ++ i32) ++declare {<8 x i16>, i32} @llvm.s390.vstrczhs(<8 x i16>, <8 x i16>, <8 x i16>, ++ i32) ++declare {<4 x i32>, i32} @llvm.s390.vstrczfs(<4 x i32>, <4 x i32>, <4 x i32>, ++ i32) ++declare {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double>, <2 x double>) ++declare {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double>, <2 x double>) ++declare {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double>, <2 x double>) ++declare {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double>, i32) ++declare <2 x double> @llvm.s390.vfidb(<2 x double>, i32, i32) ++ ++; LCBB with the lowest M3 operand. ++define i32 @test_lcbb1(i8 *%ptr) { ++; CHECK-LABEL: test_lcbb1: ++; CHECK: lcbb %r2, 0(%r2), 0 ++; CHECK: br %r14 ++ %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 0) ++ ret i32 %res ++} ++ ++; LCBB with the highest M3 operand. ++define i32 @test_lcbb2(i8 *%ptr) { ++; CHECK-LABEL: test_lcbb2: ++; CHECK: lcbb %r2, 0(%r2), 15 ++; CHECK: br %r14 ++ %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 15) ++ ret i32 %res ++} ++ ++; LCBB with a displacement and index. 
++define i32 @test_lcbb3(i8 *%base, i64 %index) {
++; CHECK-LABEL: test_lcbb3:
++; CHECK: lcbb %r2, 4095({{%r2,%r3|%r3,%r2}}), 4
++; CHECK: br %r14
++  %add = add i64 %index, 4095
++  %ptr = getelementptr i8 *%base, i64 %add
++  %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 4)
++  ret i32 %res
++}
++
++; LCBB with an out-of-range displacement.
++define i32 @test_lcbb4(i8 *%base) {
++; CHECK-LABEL: test_lcbb4:
++; CHECK: lcbb %r2, 0({{%r[1-5]}}), 5
++; CHECK: br %r14
++  %ptr = getelementptr i8 *%base, i64 4096
++  %res = call i32 @llvm.s390.lcbb(i8 *%ptr, i32 5)
++  ret i32 %res
++}
++
++; VLBB with the lowest M3 operand.
++define <16 x i8> @test_vlbb1(i8 *%ptr) {
++; CHECK-LABEL: test_vlbb1:
++; CHECK: vlbb %v24, 0(%r2), 0
++; CHECK: br %r14
++  %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 0)
++  ret <16 x i8> %res
++}
++
++; VLBB with the highest M3 operand.
++define <16 x i8> @test_vlbb2(i8 *%ptr) {
++; CHECK-LABEL: test_vlbb2:
++; CHECK: vlbb %v24, 0(%r2), 15
++; CHECK: br %r14
++  %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 15)
++  ret <16 x i8> %res
++}
++
++; VLBB with a displacement and index.
++define <16 x i8> @test_vlbb3(i8 *%base, i64 %index) {
++; CHECK-LABEL: test_vlbb3:
++; CHECK: vlbb %v24, 4095({{%r2,%r3|%r3,%r2}}), 4
++; CHECK: br %r14
++  %add = add i64 %index, 4095
++  %ptr = getelementptr i8 *%base, i64 %add
++  %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 4)
++  ret <16 x i8> %res
++}
++
++; VLBB with an out-of-range displacement.
++define <16 x i8> @test_vlbb4(i8 *%base) {
++; CHECK-LABEL: test_vlbb4:
++; CHECK: vlbb %v24, 0({{%r[1-5]}}), 5
++; CHECK: br %r14
++  %ptr = getelementptr i8 *%base, i64 4096
++  %res = call <16 x i8> @llvm.s390.vlbb(i8 *%ptr, i32 5)
++  ret <16 x i8> %res
++}
++
++; VLL with the lowest in-range displacement.
++define <16 x i8> @test_vll1(i8 *%ptr, i32 %length) {
++; CHECK-LABEL: test_vll1:
++; CHECK: vll %v24, %r3, 0(%r2)
++; CHECK: br %r14
++  %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr)
++  ret <16 x i8> %res
++}
++
++; VLL with the highest in-range displacement.
++define <16 x i8> @test_vll2(i8 *%base, i32 %length) {
++; CHECK-LABEL: test_vll2:
++; CHECK: vll %v24, %r3, 4095(%r2)
++; CHECK: br %r14
++  %ptr = getelementptr i8 *%base, i64 4095
++  %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr)
++  ret <16 x i8> %res
++}
++
++; VLL with an out-of-range displacement.
++define <16 x i8> @test_vll3(i8 *%base, i32 %length) {
++; CHECK-LABEL: test_vll3:
++; CHECK: vll %v24, %r3, 0({{%r[1-5]}})
++; CHECK: br %r14
++  %ptr = getelementptr i8 *%base, i64 4096
++  %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr)
++  ret <16 x i8> %res
++}
++
++; Check that VLL doesn't allow an index.
++define <16 x i8> @test_vll4(i8 *%base, i64 %index, i32 %length) {
++; CHECK-LABEL: test_vll4:
++; CHECK: vll %v24, %r4, 0({{%r[1-5]}})
++; CHECK: br %r14
++  %ptr = getelementptr i8 *%base, i64 %index
++  %res = call <16 x i8> @llvm.s390.vll(i32 %length, i8 *%ptr)
++  ret <16 x i8> %res
++}
++
++; VPDI taking element 0 from each half.
++define <2 x i64> @test_vpdi1(<2 x i64> %a, <2 x i64> %b) {
++; CHECK-LABEL: test_vpdi1:
++; CHECK: vpdi %v24, %v24, %v26, 0
++; CHECK: br %r14
++  %res = call <2 x i64> @llvm.s390.vpdi(<2 x i64> %a, <2 x i64> %b, i32 0)
++  ret <2 x i64> %res
++}
++
++; VPDI taking element 1 from each half.
++define <2 x i64> @test_vpdi2(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vpdi2: ++; CHECK: vpdi %v24, %v24, %v26, 10 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vpdi(<2 x i64> %a, <2 x i64> %b, i32 10) ++ ret <2 x i64> %res ++} ++ ++; VPERM. ++define <16 x i8> @test_vperm(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vperm: ++; CHECK: vperm %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vperm(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c) ++ ret <16 x i8> %res ++} ++ ++; VPKSH. ++define <16 x i8> @test_vpksh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vpksh: ++; CHECK: vpksh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vpksh(<8 x i16> %a, <8 x i16> %b) ++ ret <16 x i8> %res ++} ++ ++; VPKSF. ++define <8 x i16> @test_vpksf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vpksf: ++; CHECK: vpksf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vpksf(<4 x i32> %a, <4 x i32> %b) ++ ret <8 x i16> %res ++} ++ ++; VPKSG. ++define <4 x i32> @test_vpksg(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vpksg: ++; CHECK: vpksg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vpksg(<2 x i64> %a, <2 x i64> %b) ++ ret <4 x i32> %res ++} ++ ++; VPKSHS with no processing of the result. ++define <16 x i8> @test_vpkshs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vpkshs: ++; CHECK: vpkshs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <16 x i8> %res ++} ++ ++; VPKSHS, storing to %ptr if all values were saturated. ++define <16 x i8> @test_vpkshs_all_store(<8 x i16> %a, <8 x i16> %b, i32 *%ptr) { ++; CHECK-LABEL: test_vpkshs_all_store: ++; CHECK: vpkshs %v24, %v24, %v26 ++; CHECK-NEXT: {{jno|jle}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vpkshs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ %cmp = icmp uge i32 %cc, 3 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <16 x i8> %res ++} ++ ++; VPKSFS with no processing of the result. ++define <8 x i16> @test_vpksfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vpksfs: ++; CHECK: vpksfs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <8 x i16> %res ++} ++ ++; VPKSFS, storing to %ptr if any values were saturated. 
++define <8 x i16> @test_vpksfs_any_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) { ++; CHECK-LABEL: test_vpksfs_any_store: ++; CHECK: vpksfs %v24, %v24, %v26 ++; CHECK-NEXT: {{jhe|je}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vpksfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ %cmp = icmp ugt i32 %cc, 0 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <8 x i16> %res ++} ++ ++; VPKSGS with no processing of the result. ++define <4 x i32> @test_vpksgs(<2 x i64> %a, <2 x i64> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vpksgs: ++; CHECK: vpksgs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <4 x i32> %res ++} ++ ++; VPKSGS, storing to %ptr if no elements were saturated ++define <4 x i32> @test_vpksgs_none_store(<2 x i64> %a, <2 x i64> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vpksgs_none_store: ++; CHECK: vpksgs %v24, %v24, %v26 ++; CHECK-NEXT: {{jnhe|jne}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vpksgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ %cmp = icmp sle i32 %cc, 0 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <4 x i32> %res ++} ++ ++; VPKLSH. ++define <16 x i8> @test_vpklsh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vpklsh: ++; CHECK: vpklsh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vpklsh(<8 x i16> %a, <8 x i16> %b) ++ ret <16 x i8> %res ++} ++ ++; VPKLSF. ++define <8 x i16> @test_vpklsf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vpklsf: ++; CHECK: vpklsf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vpklsf(<4 x i32> %a, <4 x i32> %b) ++ ret <8 x i16> %res ++} ++ ++; VPKLSG. ++define <4 x i32> @test_vpklsg(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vpklsg: ++; CHECK: vpklsg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vpklsg(<2 x i64> %a, <2 x i64> %b) ++ ret <4 x i32> %res ++} ++ ++; VPKLSHS with no processing of the result. ++define <16 x i8> @test_vpklshs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vpklshs: ++; CHECK: vpklshs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <16 x i8> %res ++} ++ ++; VPKLSHS, storing to %ptr if all values were saturated. 
++define <16 x i8> @test_vpklshs_all_store(<8 x i16> %a, <8 x i16> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vpklshs_all_store: ++; CHECK: vpklshs %v24, %v24, %v26 ++; CHECK-NEXT: {{jno|jle}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vpklshs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ %cmp = icmp eq i32 %cc, 3 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <16 x i8> %res ++} ++ ++; VPKLSFS with no processing of the result. ++define <8 x i16> @test_vpklsfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vpklsfs: ++; CHECK: vpklsfs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <8 x i16> %res ++} ++ ++; VPKLSFS, storing to %ptr if any values were saturated. ++define <8 x i16> @test_vpklsfs_any_store(<4 x i32> %a, <4 x i32> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vpklsfs_any_store: ++; CHECK: vpklsfs %v24, %v24, %v26 ++; CHECK-NEXT: {{jhe|je}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vpklsfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ %cmp = icmp ne i32 %cc, 0 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <8 x i16> %res ++} ++ ++; VPKLSGS with no processing of the result. ++define <4 x i32> @test_vpklsgs(<2 x i64> %a, <2 x i64> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vpklsgs: ++; CHECK: vpklsgs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <4 x i32> %res ++} ++ ++; VPKLSGS, storing to %ptr if no elements were saturated ++define <4 x i32> @test_vpklsgs_none_store(<2 x i64> %a, <2 x i64> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vpklsgs_none_store: ++; CHECK: vpklsgs %v24, %v24, %v26 ++; CHECK-NEXT: {{jnhe|jne}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vpklsgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ %cmp = icmp eq i32 %cc, 0 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <4 x i32> %res ++} ++ ++; VSTL with the lowest in-range displacement. ++define void @test_vstl1(<16 x i8> %vec, i8 *%ptr, i32 %length) { ++; CHECK-LABEL: test_vstl1: ++; CHECK: vstl %v24, %r3, 0(%r2) ++; CHECK: br %r14 ++ call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) ++ ret void ++} ++ ++; VSTL with the highest in-range displacement. 
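++; (The length operand of VLL and VSTL holds the index of the last byte
++; to transfer, so any value of 15 or more moves the full 16 bytes.)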
++define void @test_vstl2(<16 x i8> %vec, i8 *%base, i32 %length) { ++; CHECK-LABEL: test_vstl2: ++; CHECK: vstl %v24, %r3, 4095(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr i8 *%base, i64 4095 ++ call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) ++ ret void ++} ++ ++; VSTL with an out-of-range displacement. ++define void @test_vstl3(<16 x i8> %vec, i8 *%base, i32 %length) { ++; CHECK-LABEL: test_vstl3: ++; CHECK: vstl %v24, %r3, 0({{%r[1-5]}}) ++; CHECK: br %r14 ++ %ptr = getelementptr i8 *%base, i64 4096 ++ call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) ++ ret void ++} ++ ++; Check that VSTL doesn't allow an index. ++define void @test_vstl4(<16 x i8> %vec, i8 *%base, i64 %index, i32 %length) { ++; CHECK-LABEL: test_vstl4: ++; CHECK: vstl %v24, %r4, 0({{%r[1-5]}}) ++; CHECK: br %r14 ++ %ptr = getelementptr i8 *%base, i64 %index ++ call void @llvm.s390.vstl(<16 x i8> %vec, i32 %length, i8 *%ptr) ++ ret void ++} ++ ++; VUPHB. ++define <8 x i16> @test_vuphb(<16 x i8> %a) { ++; CHECK-LABEL: test_vuphb: ++; CHECK: vuphb %v24, %v24 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vuphb(<16 x i8> %a) ++ ret <8 x i16> %res ++} ++ ++; VUPHH. ++define <4 x i32> @test_vuphh(<8 x i16> %a) { ++; CHECK-LABEL: test_vuphh: ++; CHECK: vuphh %v24, %v24 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vuphh(<8 x i16> %a) ++ ret <4 x i32> %res ++} ++ ++; VUPHF. ++define <2 x i64> @test_vuphf(<4 x i32> %a) { ++; CHECK-LABEL: test_vuphf: ++; CHECK: vuphf %v24, %v24 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vuphf(<4 x i32> %a) ++ ret <2 x i64> %res ++} ++ ++; VUPLHB. ++define <8 x i16> @test_vuplhb(<16 x i8> %a) { ++; CHECK-LABEL: test_vuplhb: ++; CHECK: vuplhb %v24, %v24 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vuplhb(<16 x i8> %a) ++ ret <8 x i16> %res ++} ++ ++; VUPLHH. ++define <4 x i32> @test_vuplhh(<8 x i16> %a) { ++; CHECK-LABEL: test_vuplhh: ++; CHECK: vuplhh %v24, %v24 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vuplhh(<8 x i16> %a) ++ ret <4 x i32> %res ++} ++ ++; VUPLHF. ++define <2 x i64> @test_vuplhf(<4 x i32> %a) { ++; CHECK-LABEL: test_vuplhf: ++; CHECK: vuplhf %v24, %v24 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vuplhf(<4 x i32> %a) ++ ret <2 x i64> %res ++} ++ ++; VUPLB. ++define <8 x i16> @test_vuplb(<16 x i8> %a) { ++; CHECK-LABEL: test_vuplb: ++; CHECK: vuplb %v24, %v24 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vuplb(<16 x i8> %a) ++ ret <8 x i16> %res ++} ++ ++; VUPLHW. ++define <4 x i32> @test_vuplhw(<8 x i16> %a) { ++; CHECK-LABEL: test_vuplhw: ++; CHECK: vuplhw %v24, %v24 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vuplhw(<8 x i16> %a) ++ ret <4 x i32> %res ++} ++ ++; VUPLF. ++define <2 x i64> @test_vuplf(<4 x i32> %a) { ++; CHECK-LABEL: test_vuplf: ++; CHECK: vuplf %v24, %v24 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vuplf(<4 x i32> %a) ++ ret <2 x i64> %res ++} ++ ++; VUPLLB. ++define <8 x i16> @test_vupllb(<16 x i8> %a) { ++; CHECK-LABEL: test_vupllb: ++; CHECK: vupllb %v24, %v24 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vupllb(<16 x i8> %a) ++ ret <8 x i16> %res ++} ++ ++; VUPLLH. ++define <4 x i32> @test_vupllh(<8 x i16> %a) { ++; CHECK-LABEL: test_vupllh: ++; CHECK: vupllh %v24, %v24 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vupllh(<8 x i16> %a) ++ ret <4 x i32> %res ++} ++ ++; VUPLLF. 
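++; (Like VUPLLB and VUPLLH above, this zero-extends the low half of the
++; source elements, here widening <4 x i32> to <2 x i64>.)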
++define <2 x i64> @test_vupllf(<4 x i32> %a) { ++; CHECK-LABEL: test_vupllf: ++; CHECK: vupllf %v24, %v24 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vupllf(<4 x i32> %a) ++ ret <2 x i64> %res ++} ++ ++; VACCB. ++define <16 x i8> @test_vaccb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vaccb: ++; CHECK: vaccb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vaccb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VACCH. ++define <8 x i16> @test_vacch(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vacch: ++; CHECK: vacch %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vacch(<8 x i16> %a, <8 x i16> %b) ++ ret <8 x i16> %res ++} ++ ++; VACCF. ++define <4 x i32> @test_vaccf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vaccf: ++; CHECK: vaccf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vaccf(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VACCG. ++define <2 x i64> @test_vaccg(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vaccg: ++; CHECK: vaccg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vaccg(<2 x i64> %a, <2 x i64> %b) ++ ret <2 x i64> %res ++} ++ ++; VAQ. ++define <16 x i8> @test_vaq(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vaq: ++; CHECK: vaq %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vaq(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VACQ. ++define <16 x i8> @test_vacq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vacq: ++; CHECK: vacq %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vacq(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c) ++ ret <16 x i8> %res ++} ++ ++; VACCQ. ++define <16 x i8> @test_vaccq(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vaccq: ++; CHECK: vaccq %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vaccq(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VACCCQ. ++define <16 x i8> @test_vacccq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vacccq: ++; CHECK: vacccq %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vacccq(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c) ++ ret <16 x i8> %res ++} ++ ++; VAVGB. ++define <16 x i8> @test_vavgb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vavgb: ++; CHECK: vavgb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vavgb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VAVGH. ++define <8 x i16> @test_vavgh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vavgh: ++; CHECK: vavgh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vavgh(<8 x i16> %a, <8 x i16> %b) ++ ret <8 x i16> %res ++} ++ ++; VAVGF. ++define <4 x i32> @test_vavgf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vavgf: ++; CHECK: vavgf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vavgf(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VAVGG. ++define <2 x i64> @test_vavgg(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vavgg: ++; CHECK: vavgg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vavgg(<2 x i64> %a, <2 x i64> %b) ++ ret <2 x i64> %res ++} ++ ++; VAVGLB. 
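++; (The VAVGL* forms below are the unsigned counterparts of the signed
++; VAVG* averages above.)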
++define <16 x i8> @test_vavglb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vavglb: ++; CHECK: vavglb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vavglb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VAVGLH. ++define <8 x i16> @test_vavglh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vavglh: ++; CHECK: vavglh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vavglh(<8 x i16> %a, <8 x i16> %b) ++ ret <8 x i16> %res ++} ++ ++; VAVGLF. ++define <4 x i32> @test_vavglf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vavglf: ++; CHECK: vavglf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vavglf(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VAVGLG. ++define <2 x i64> @test_vavglg(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vavglg: ++; CHECK: vavglg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vavglg(<2 x i64> %a, <2 x i64> %b) ++ ret <2 x i64> %res ++} ++ ++; VCKSM. ++define <4 x i32> @test_vcksm(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vcksm: ++; CHECK: vcksm %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vcksm(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VGFMB. ++define <8 x i16> @test_vgfmb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vgfmb: ++; CHECK: vgfmb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vgfmb(<16 x i8> %a, <16 x i8> %b) ++ ret <8 x i16> %res ++} ++ ++; VGFMH. ++define <4 x i32> @test_vgfmh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vgfmh: ++; CHECK: vgfmh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vgfmh(<8 x i16> %a, <8 x i16> %b) ++ ret <4 x i32> %res ++} ++ ++; VGFMF. ++define <2 x i64> @test_vgfmf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vgfmf: ++; CHECK: vgfmf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vgfmf(<4 x i32> %a, <4 x i32> %b) ++ ret <2 x i64> %res ++} ++ ++; VGFMG. ++define <16 x i8> @test_vgfmg(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vgfmg: ++; CHECK: vgfmg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vgfmg(<2 x i64> %a, <2 x i64> %b) ++ ret <16 x i8> %res ++} ++ ++; VGFMAB. ++define <8 x i16> @test_vgfmab(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { ++; CHECK-LABEL: test_vgfmab: ++; CHECK: vgfmab %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vgfmab(<16 x i8> %a, <16 x i8> %b, ++ <8 x i16> %c) ++ ret <8 x i16> %res ++} ++ ++; VGFMAH. ++define <4 x i32> @test_vgfmah(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { ++; CHECK-LABEL: test_vgfmah: ++; CHECK: vgfmah %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vgfmah(<8 x i16> %a, <8 x i16> %b, ++ <4 x i32> %c) ++ ret <4 x i32> %res ++} ++ ++; VGFMAF. ++define <2 x i64> @test_vgfmaf(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { ++; CHECK-LABEL: test_vgfmaf: ++; CHECK: vgfmaf %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vgfmaf(<4 x i32> %a, <4 x i32> %b, ++ <2 x i64> %c) ++ ret <2 x i64> %res ++} ++ ++; VGFMAG. ++define <16 x i8> @test_vgfmag(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vgfmag: ++; CHECK: vgfmag %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vgfmag(<2 x i64> %a, <2 x i64> %b, ++ <16 x i8> %c) ++ ret <16 x i8> %res ++} ++ ++; VMAHB. 
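++; (VMAH* multiply two operands, add the third, and keep the high half of
++; the widened result; the VMALH* forms below are unsigned.)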
++define <16 x i8> @test_vmahb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vmahb: ++; CHECK: vmahb %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vmahb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c) ++ ret <16 x i8> %res ++} ++ ++; VMAHH. ++define <8 x i16> @test_vmahh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ++; CHECK-LABEL: test_vmahh: ++; CHECK: vmahh %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmahh(<8 x i16> %a, <8 x i16> %b, ++ <8 x i16> %c) ++ ret <8 x i16> %res ++} ++ ++; VMAHF. ++define <4 x i32> @test_vmahf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ++; CHECK-LABEL: test_vmahf: ++; CHECK: vmahf %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmahf(<4 x i32> %a, <4 x i32> %b, ++ <4 x i32> %c) ++ ret <4 x i32> %res ++} ++ ++; VMALHB. ++define <16 x i8> @test_vmalhb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vmalhb: ++; CHECK: vmalhb %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vmalhb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c) ++ ret <16 x i8> %res ++} ++ ++; VMALHH. ++define <8 x i16> @test_vmalhh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ++; CHECK-LABEL: test_vmalhh: ++; CHECK: vmalhh %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmalhh(<8 x i16> %a, <8 x i16> %b, ++ <8 x i16> %c) ++ ret <8 x i16> %res ++} ++ ++; VMALHF. ++define <4 x i32> @test_vmalhf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ++; CHECK-LABEL: test_vmalhf: ++; CHECK: vmalhf %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmalhf(<4 x i32> %a, <4 x i32> %b, ++ <4 x i32> %c) ++ ret <4 x i32> %res ++} ++ ++; VMAEB. ++define <8 x i16> @test_vmaeb(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { ++; CHECK-LABEL: test_vmaeb: ++; CHECK: vmaeb %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmaeb(<16 x i8> %a, <16 x i8> %b, ++ <8 x i16> %c) ++ ret <8 x i16> %res ++} ++ ++; VMAEH. ++define <4 x i32> @test_vmaeh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { ++; CHECK-LABEL: test_vmaeh: ++; CHECK: vmaeh %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmaeh(<8 x i16> %a, <8 x i16> %b, ++ <4 x i32> %c) ++ ret <4 x i32> %res ++} ++ ++; VMAEF. ++define <2 x i64> @test_vmaef(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { ++; CHECK-LABEL: test_vmaef: ++; CHECK: vmaef %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vmaef(<4 x i32> %a, <4 x i32> %b, ++ <2 x i64> %c) ++ ret <2 x i64> %res ++} ++ ++; VMALEB. ++define <8 x i16> @test_vmaleb(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { ++; CHECK-LABEL: test_vmaleb: ++; CHECK: vmaleb %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmaleb(<16 x i8> %a, <16 x i8> %b, ++ <8 x i16> %c) ++ ret <8 x i16> %res ++} ++ ++; VMALEH. ++define <4 x i32> @test_vmaleh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { ++; CHECK-LABEL: test_vmaleh: ++; CHECK: vmaleh %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmaleh(<8 x i16> %a, <8 x i16> %b, ++ <4 x i32> %c) ++ ret <4 x i32> %res ++} ++ ++; VMALEF. ++define <2 x i64> @test_vmalef(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { ++; CHECK-LABEL: test_vmalef: ++; CHECK: vmalef %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vmalef(<4 x i32> %a, <4 x i32> %b, ++ <2 x i64> %c) ++ ret <2 x i64> %res ++} ++ ++; VMAOB. 
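++; (VMAO*, like VMAE* above, widens and multiplies alternate elements and
++; adds the third operand, using the odd- rather than even-numbered ones;
++; VMALO* below are the unsigned forms.)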
++define <8 x i16> @test_vmaob(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { ++; CHECK-LABEL: test_vmaob: ++; CHECK: vmaob %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmaob(<16 x i8> %a, <16 x i8> %b, ++ <8 x i16> %c) ++ ret <8 x i16> %res ++} ++ ++; VMAOH. ++define <4 x i32> @test_vmaoh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { ++; CHECK-LABEL: test_vmaoh: ++; CHECK: vmaoh %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmaoh(<8 x i16> %a, <8 x i16> %b, ++ <4 x i32> %c) ++ ret <4 x i32> %res ++} ++ ++; VMAOF. ++define <2 x i64> @test_vmaof(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { ++; CHECK-LABEL: test_vmaof: ++; CHECK: vmaof %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vmaof(<4 x i32> %a, <4 x i32> %b, ++ <2 x i64> %c) ++ ret <2 x i64> %res ++} ++ ++; VMALOB. ++define <8 x i16> @test_vmalob(<16 x i8> %a, <16 x i8> %b, <8 x i16> %c) { ++; CHECK-LABEL: test_vmalob: ++; CHECK: vmalob %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmalob(<16 x i8> %a, <16 x i8> %b, ++ <8 x i16> %c) ++ ret <8 x i16> %res ++} ++ ++; VMALOH. ++define <4 x i32> @test_vmaloh(<8 x i16> %a, <8 x i16> %b, <4 x i32> %c) { ++; CHECK-LABEL: test_vmaloh: ++; CHECK: vmaloh %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmaloh(<8 x i16> %a, <8 x i16> %b, ++ <4 x i32> %c) ++ ret <4 x i32> %res ++} ++ ++; VMALOF. ++define <2 x i64> @test_vmalof(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { ++; CHECK-LABEL: test_vmalof: ++; CHECK: vmalof %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vmalof(<4 x i32> %a, <4 x i32> %b, ++ <2 x i64> %c) ++ ret <2 x i64> %res ++} ++ ++; VMHB. ++define <16 x i8> @test_vmhb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vmhb: ++; CHECK: vmhb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vmhb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VMHH. ++define <8 x i16> @test_vmhh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vmhh: ++; CHECK: vmhh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmhh(<8 x i16> %a, <8 x i16> %b) ++ ret <8 x i16> %res ++} ++ ++; VMHF. ++define <4 x i32> @test_vmhf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vmhf: ++; CHECK: vmhf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmhf(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VMLHB. ++define <16 x i8> @test_vmlhb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vmlhb: ++; CHECK: vmlhb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vmlhb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VMLHH. ++define <8 x i16> @test_vmlhh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vmlhh: ++; CHECK: vmlhh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmlhh(<8 x i16> %a, <8 x i16> %b) ++ ret <8 x i16> %res ++} ++ ++; VMLHF. ++define <4 x i32> @test_vmlhf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vmlhf: ++; CHECK: vmlhf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmlhf(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VMEB. ++define <8 x i16> @test_vmeb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vmeb: ++; CHECK: vmeb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmeb(<16 x i8> %a, <16 x i8> %b) ++ ret <8 x i16> %res ++} ++ ++; VMEH. 
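++; (VME*/VMO* are the widening even/odd multiplies without an addend; the
++; VMLE*/VMLO* forms are their unsigned counterparts.)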
++define <4 x i32> @test_vmeh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vmeh: ++; CHECK: vmeh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmeh(<8 x i16> %a, <8 x i16> %b) ++ ret <4 x i32> %res ++} ++ ++; VMEF. ++define <2 x i64> @test_vmef(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vmef: ++; CHECK: vmef %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vmef(<4 x i32> %a, <4 x i32> %b) ++ ret <2 x i64> %res ++} ++ ++; VMLEB. ++define <8 x i16> @test_vmleb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vmleb: ++; CHECK: vmleb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmleb(<16 x i8> %a, <16 x i8> %b) ++ ret <8 x i16> %res ++} ++ ++; VMLEH. ++define <4 x i32> @test_vmleh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vmleh: ++; CHECK: vmleh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmleh(<8 x i16> %a, <8 x i16> %b) ++ ret <4 x i32> %res ++} ++ ++; VMLEF. ++define <2 x i64> @test_vmlef(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vmlef: ++; CHECK: vmlef %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vmlef(<4 x i32> %a, <4 x i32> %b) ++ ret <2 x i64> %res ++} ++ ++; VMOB. ++define <8 x i16> @test_vmob(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vmob: ++; CHECK: vmob %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmob(<16 x i8> %a, <16 x i8> %b) ++ ret <8 x i16> %res ++} ++ ++; VMOH. ++define <4 x i32> @test_vmoh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vmoh: ++; CHECK: vmoh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmoh(<8 x i16> %a, <8 x i16> %b) ++ ret <4 x i32> %res ++} ++ ++; VMOF. ++define <2 x i64> @test_vmof(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vmof: ++; CHECK: vmof %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vmof(<4 x i32> %a, <4 x i32> %b) ++ ret <2 x i64> %res ++} ++ ++; VMLOB. ++define <8 x i16> @test_vmlob(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vmlob: ++; CHECK: vmlob %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vmlob(<16 x i8> %a, <16 x i8> %b) ++ ret <8 x i16> %res ++} ++ ++; VMLOH. ++define <4 x i32> @test_vmloh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vmloh: ++; CHECK: vmloh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vmloh(<8 x i16> %a, <8 x i16> %b) ++ ret <4 x i32> %res ++} ++ ++; VMLOF. ++define <2 x i64> @test_vmlof(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vmlof: ++; CHECK: vmlof %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vmlof(<4 x i32> %a, <4 x i32> %b) ++ ret <2 x i64> %res ++} ++ ++; VERLLVB. ++define <16 x i8> @test_verllvb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_verllvb: ++; CHECK: verllvb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.verllvb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VERLLVH. ++define <8 x i16> @test_verllvh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_verllvh: ++; CHECK: verllvh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.verllvh(<8 x i16> %a, <8 x i16> %b) ++ ret <8 x i16> %res ++} ++ ++; VERLLVF. ++define <4 x i32> @test_verllvf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_verllvf: ++; CHECK: verllvf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.verllvf(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VERLLVG. 
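++; (VERLLV* rotate each element by the amount in the corresponding element
++; of the second operand; the VERLL* tests below use a single rotate count
++; given as a base-and-displacement address.)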
++define <2 x i64> @test_verllvg(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_verllvg: ++; CHECK: verllvg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.verllvg(<2 x i64> %a, <2 x i64> %b) ++ ret <2 x i64> %res ++} ++ ++; VERLLB. ++define <16 x i8> @test_verllb(<16 x i8> %a, i32 %b) { ++; CHECK-LABEL: test_verllb: ++; CHECK: verllb %v24, %v24, 0(%r2) ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 %b) ++ ret <16 x i8> %res ++} ++ ++; VERLLH. ++define <8 x i16> @test_verllh(<8 x i16> %a, i32 %b) { ++; CHECK-LABEL: test_verllh: ++; CHECK: verllh %v24, %v24, 0(%r2) ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.verllh(<8 x i16> %a, i32 %b) ++ ret <8 x i16> %res ++} ++ ++; VERLLF. ++define <4 x i32> @test_verllf(<4 x i32> %a, i32 %b) { ++; CHECK-LABEL: test_verllf: ++; CHECK: verllf %v24, %v24, 0(%r2) ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.verllf(<4 x i32> %a, i32 %b) ++ ret <4 x i32> %res ++} ++ ++; VERLLG. ++define <2 x i64> @test_verllg(<2 x i64> %a, i32 %b) { ++; CHECK-LABEL: test_verllg: ++; CHECK: verllg %v24, %v24, 0(%r2) ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.verllg(<2 x i64> %a, i32 %b) ++ ret <2 x i64> %res ++} ++ ++; VERLLB with the smallest count. ++define <16 x i8> @test_verllb_1(<16 x i8> %a) { ++; CHECK-LABEL: test_verllb_1: ++; CHECK: verllb %v24, %v24, 1 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 1) ++ ret <16 x i8> %res ++} ++ ++; VERLLB with the largest count. ++define <16 x i8> @test_verllb_4095(<16 x i8> %a) { ++; CHECK-LABEL: test_verllb_4095: ++; CHECK: verllb %v24, %v24, 4095 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4095) ++ ret <16 x i8> %res ++} ++ ++; VERLLB with the largest count + 1. ++define <16 x i8> @test_verllb_4096(<16 x i8> %a) { ++; CHECK-LABEL: test_verllb_4096: ++; CHECK: lhi [[REG:%r[1-5]]], 4096 ++; CHECK: verllb %v24, %v24, 0([[REG]]) ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4096) ++ ret <16 x i8> %res ++} ++ ++; VERIMB. ++define <16 x i8> @test_verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_verimb: ++; CHECK: verimb %v24, %v26, %v28, 1 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, i32 1) ++ ret <16 x i8> %res ++} ++ ++; VERIMH. ++define <8 x i16> @test_verimh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ++; CHECK-LABEL: test_verimh: ++; CHECK: verimh %v24, %v26, %v28, 1 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.verimh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, i32 1) ++ ret <8 x i16> %res ++} ++ ++; VERIMF. ++define <4 x i32> @test_verimf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ++; CHECK-LABEL: test_verimf: ++; CHECK: verimf %v24, %v26, %v28, 1 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.verimf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 1) ++ ret <4 x i32> %res ++} ++ ++; VERIMG. ++define <2 x i64> @test_verimg(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ++; CHECK-LABEL: test_verimg: ++; CHECK: verimg %v24, %v26, %v28, 1 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.verimg(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, i32 1) ++ ret <2 x i64> %res ++} ++ ++; VERIMB with a different mask. 
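++; (VERIM rotates the elements of the second vector operand left by the
++; immediate count and inserts the rotated bits into the first operand
++; under control of the mask in the third operand.)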
++define <16 x i8> @test_verimb_254(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_verimb_254: ++; CHECK: verimb %v24, %v26, %v28, 254 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, i32 254) ++ ret <16 x i8> %res ++} ++ ++; VSL. ++define <16 x i8> @test_vsl(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vsl: ++; CHECK: vsl %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsl(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VSLB. ++define <16 x i8> @test_vslb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vslb: ++; CHECK: vslb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vslb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VSRA. ++define <16 x i8> @test_vsra(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vsra: ++; CHECK: vsra %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsra(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VSRAB. ++define <16 x i8> @test_vsrab(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vsrab: ++; CHECK: vsrab %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsrab(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VSRL. ++define <16 x i8> @test_vsrl(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vsrl: ++; CHECK: vsrl %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsrl(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VSRLB. ++define <16 x i8> @test_vsrlb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vsrlb: ++; CHECK: vsrlb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsrlb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VSLDB with the minimum useful value. ++define <16 x i8> @test_vsldb_1(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vsldb_1: ++; CHECK: vsldb %v24, %v24, %v26, 1 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsldb(<16 x i8> %a, <16 x i8> %b, i32 1) ++ ret <16 x i8> %res ++} ++ ++; VSLDB with the maximum value. ++define <16 x i8> @test_vsldb_15(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vsldb_15: ++; CHECK: vsldb %v24, %v24, %v26, 15 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsldb(<16 x i8> %a, <16 x i8> %b, i32 15) ++ ret <16 x i8> %res ++} ++ ++; VSCBIB. ++define <16 x i8> @test_vscbib(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vscbib: ++; CHECK: vscbib %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vscbib(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VSCBIH. ++define <8 x i16> @test_vscbih(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vscbih: ++; CHECK: vscbih %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vscbih(<8 x i16> %a, <8 x i16> %b) ++ ret <8 x i16> %res ++} ++ ++; VSCBIF. ++define <4 x i32> @test_vscbif(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vscbif: ++; CHECK: vscbif %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vscbif(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VSCBIG. ++define <2 x i64> @test_vscbig(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vscbig: ++; CHECK: vscbig %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vscbig(<2 x i64> %a, <2 x i64> %b) ++ ret <2 x i64> %res ++} ++ ++; VSQ. 
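++; (The quadword arithmetic instructions treat the vector as a single
++; 128-bit integer; <16 x i8> is simply the container type used for such
++; values here.)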
++define <16 x i8> @test_vsq(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vsq: ++; CHECK: vsq %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsq(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VSBIQ. ++define <16 x i8> @test_vsbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vsbiq: ++; CHECK: vsbiq %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsbiq(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c) ++ ret <16 x i8> %res ++} ++ ++; VSCBIQ. ++define <16 x i8> @test_vscbiq(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vscbiq: ++; CHECK: vscbiq %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vscbiq(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VSBCBIQ. ++define <16 x i8> @test_vsbcbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vsbcbiq: ++; CHECK: vsbcbiq %v24, %v24, %v26, %v28 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsbcbiq(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c) ++ ret <16 x i8> %res ++} ++ ++; VSUMB. ++define <4 x i32> @test_vsumb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vsumb: ++; CHECK: vsumb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vsumb(<16 x i8> %a, <16 x i8> %b) ++ ret <4 x i32> %res ++} ++ ++; VSUMH. ++define <4 x i32> @test_vsumh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vsumh: ++; CHECK: vsumh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vsumh(<8 x i16> %a, <8 x i16> %b) ++ ret <4 x i32> %res ++} ++ ++; VSUMGH. ++define <2 x i64> @test_vsumgh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vsumgh: ++; CHECK: vsumgh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vsumgh(<8 x i16> %a, <8 x i16> %b) ++ ret <2 x i64> %res ++} ++ ++; VSUMGF. ++define <2 x i64> @test_vsumgf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vsumgf: ++; CHECK: vsumgf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <2 x i64> @llvm.s390.vsumgf(<4 x i32> %a, <4 x i32> %b) ++ ret <2 x i64> %res ++} ++ ++; VSUMQF. ++define <16 x i8> @test_vsumqf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vsumqf: ++; CHECK: vsumqf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsumqf(<4 x i32> %a, <4 x i32> %b) ++ ret <16 x i8> %res ++} ++ ++; VSUMQG. ++define <16 x i8> @test_vsumqg(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vsumqg: ++; CHECK: vsumqg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vsumqg(<2 x i64> %a, <2 x i64> %b) ++ ret <16 x i8> %res ++} ++ ++; VTM with no processing of the result. ++define i32 @test_vtm(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vtm: ++; CHECK: vtm %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b) ++ ret i32 %res ++} ++ ++; VTM, storing to %ptr if all bits are set. ++define void @test_vtm_all_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { ++; CHECK-LABEL: test_vtm_all_store: ++; CHECK-NOT: %r ++; CHECK: vtm %v24, %v26 ++; CHECK-NEXT: {{jno|jle}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b) ++ %cmp = icmp sge i32 %res, 3 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret void ++} ++ ++; VCEQBS with no processing of the result. 
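++; (In the CC tests below, IPM places the condition code in bits 2-3 of
++; the top byte of the result, so "srl %r2, 28" leaves the raw CC value
++; 0-3; for the comparisons, CC 0 means all elements matched, CC 1 some,
++; and CC 3 none.)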
++define i32 @test_vceqbs(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vceqbs: ++; CHECK: vceqbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCEQBS, returning 1 if any elements are equal (CC != 3). ++define i32 @test_vceqbs_any_bool(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vceqbs_any_bool: ++; CHECK: vceqbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: afi %r2, -536870912 ++; CHECK: srl %r2, 31 ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 1 ++ %cmp = icmp ne i32 %res, 3 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCEQBS, storing to %ptr if any elements are equal. ++define <16 x i8> @test_vceqbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { ++; CHECK-LABEL: test_vceqbs_any_store: ++; CHECK-NOT: %r ++; CHECK: vceqbs %v24, %v24, %v26 ++; CHECK-NEXT: {{jo|jnle}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ %cmp = icmp ule i32 %cc, 2 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <16 x i8> %res ++} ++ ++; VCEQHS with no processing of the result. ++define i32 @test_vceqhs(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vceqhs: ++; CHECK: vceqhs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCEQHS, returning 1 if not all elements are equal. ++define i32 @test_vceqhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vceqhs_notall_bool: ++; CHECK: vceqhs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: risblg %r2, [[REG]], 31, 159, 36 ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 1 ++ %cmp = icmp sge i32 %res, 1 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCEQHS, storing to %ptr if not all elements are equal. ++define <8 x i16> @test_vceqhs_notall_store(<8 x i16> %a, <8 x i16> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vceqhs_notall_store: ++; CHECK-NOT: %r ++; CHECK: vceqhs %v24, %v24, %v26 ++; CHECK-NEXT: {{jhe|je}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ %cmp = icmp ugt i32 %cc, 0 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <8 x i16> %res ++} ++ ++; VCEQFS with no processing of the result. ++define i32 @test_vceqfs(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vceqfs: ++; CHECK: vceqfs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCEQFS, returning 1 if no elements are equal. 
++define i32 @test_vceqfs_none_bool(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vceqfs_none_bool: ++; CHECK: vceqfs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: risblg %r2, [[REG]], 31, 159, 35 ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 1 ++ %cmp = icmp eq i32 %res, 3 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCEQFS, storing to %ptr if no elements are equal. ++define <4 x i32> @test_vceqfs_none_store(<4 x i32> %a, <4 x i32> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vceqfs_none_store: ++; CHECK-NOT: %r ++; CHECK: vceqfs %v24, %v24, %v26 ++; CHECK-NEXT: {{jno|jle}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vceqfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ %cmp = icmp uge i32 %cc, 3 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <4 x i32> %res ++} ++ ++; VCEQGS with no processing of the result. ++define i32 @test_vceqgs(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vceqgs: ++; CHECK: vceqgs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCEQGS returning 1 if all elements are equal (CC == 0). ++define i32 @test_vceqgs_all_bool(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vceqgs_all_bool: ++; CHECK: vceqgs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: afi %r2, -268435456 ++; CHECK: srl %r2, 31 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp ult i32 %res, 1 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCEQGS, storing to %ptr if all elements are equal. ++define <2 x i64> @test_vceqgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { ++; CHECK-LABEL: test_vceqgs_all_store: ++; CHECK-NOT: %r ++; CHECK: vceqgs %v24, %v24, %v26 ++; CHECK-NEXT: {{jnhe|jne}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 0 ++ %cc = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp sle i32 %cc, 0 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <2 x i64> %res ++} ++ ++; VCHBS with no processing of the result. ++define i32 @test_vchbs(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vchbs: ++; CHECK: vchbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCHBS, returning 1 if any elements are higher (CC != 3). 
++define i32 @test_vchbs_any_bool(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vchbs_any_bool: ++; CHECK: vchbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: afi %r2, -536870912 ++; CHECK: srl %r2, 31 ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 1 ++ %cmp = icmp ne i32 %res, 3 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCHBS, storing to %ptr if any elements are higher. ++define <16 x i8> @test_vchbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { ++; CHECK-LABEL: test_vchbs_any_store: ++; CHECK-NOT: %r ++; CHECK: vchbs %v24, %v24, %v26 ++; CHECK-NEXT: {{jo|jnle}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ %cmp = icmp ule i32 %cc, 2 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <16 x i8> %res ++} ++ ++; VCHHS with no processing of the result. ++define i32 @test_vchhs(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vchhs: ++; CHECK: vchhs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCHHS, returning 1 if not all elements are higher. ++define i32 @test_vchhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vchhs_notall_bool: ++; CHECK: vchhs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: risblg %r2, [[REG]], 31, 159, 36 ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 1 ++ %cmp = icmp sge i32 %res, 1 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCHHS, storing to %ptr if not all elements are higher. ++define <8 x i16> @test_vchhs_notall_store(<8 x i16> %a, <8 x i16> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vchhs_notall_store: ++; CHECK-NOT: %r ++; CHECK: vchhs %v24, %v24, %v26 ++; CHECK-NEXT: {{jhe|je}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vchhs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ %cmp = icmp ugt i32 %cc, 0 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <8 x i16> %res ++} ++ ++; VCHFS with no processing of the result. ++define i32 @test_vchfs(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vchfs: ++; CHECK: vchfs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCHFS, returning 1 if no elements are higher. 
++define i32 @test_vchfs_none_bool(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vchfs_none_bool: ++; CHECK: vchfs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: risblg %r2, [[REG]], 31, 159, 35 ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 1 ++ %cmp = icmp eq i32 %res, 3 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCHFS, storing to %ptr if no elements are higher. ++define <4 x i32> @test_vchfs_none_store(<4 x i32> %a, <4 x i32> %b, i32 *%ptr) { ++; CHECK-LABEL: test_vchfs_none_store: ++; CHECK-NOT: %r ++; CHECK: vchfs %v24, %v24, %v26 ++; CHECK-NEXT: {{jno|jle}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ %cmp = icmp uge i32 %cc, 3 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <4 x i32> %res ++} ++ ++; VCHGS with no processing of the result. ++define i32 @test_vchgs(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vchgs: ++; CHECK: vchgs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCHGS returning 1 if all elements are higher (CC == 0). ++define i32 @test_vchgs_all_bool(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vchgs_all_bool: ++; CHECK: vchgs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: afi %r2, -268435456 ++; CHECK: srl %r2, 31 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp ult i32 %res, 1 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCHGS, storing to %ptr if all elements are higher. ++define <2 x i64> @test_vchgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { ++; CHECK-LABEL: test_vchgs_all_store: ++; CHECK-NOT: %r ++; CHECK: vchgs %v24, %v24, %v26 ++; CHECK-NEXT: {{jnhe|jne}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 0 ++ %cc = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp sle i32 %cc, 0 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <2 x i64> %res ++} ++ ++; VCHLBS with no processing of the result. ++define i32 @test_vchlbs(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vchlbs: ++; CHECK: vchlbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCHLBS, returning 1 if any elements are higher (CC != 3). 
++define i32 @test_vchlbs_any_bool(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vchlbs_any_bool: ++; CHECK: vchlbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: afi %r2, -536870912 ++; CHECK: srl %r2, 31 ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 1 ++ %cmp = icmp ne i32 %res, 3 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCHLBS, storing to %ptr if any elements are higher. ++define <16 x i8> @test_vchlbs_any_store(<16 x i8> %a, <16 x i8> %b, i32 *%ptr) { ++; CHECK-LABEL: test_vchlbs_any_store: ++; CHECK-NOT: %r ++; CHECK: vchlbs %v24, %v24, %v26 ++; CHECK-NEXT: {{jo|jnle}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ %cmp = icmp sle i32 %cc, 2 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <16 x i8> %res ++} ++ ++; VCHLHS with no processing of the result. ++define i32 @test_vchlhs(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vchlhs: ++; CHECK: vchlhs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCHLHS, returning 1 if not all elements are higher. ++define i32 @test_vchlhs_notall_bool(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vchlhs_notall_bool: ++; CHECK: vchlhs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: risblg %r2, [[REG]], 31, 159, 36 ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 1 ++ %cmp = icmp uge i32 %res, 1 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCHLHS, storing to %ptr if not all elements are higher. ++define <8 x i16> @test_vchlhs_notall_store(<8 x i16> %a, <8 x i16> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vchlhs_notall_store: ++; CHECK-NOT: %r ++; CHECK: vchlhs %v24, %v24, %v26 ++; CHECK-NEXT: {{jhe|je}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vchlhs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ %cmp = icmp sgt i32 %cc, 0 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <8 x i16> %res ++} ++ ++; VCHLFS with no processing of the result. ++define i32 @test_vchlfs(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vchlfs: ++; CHECK: vchlfs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCHLFS, returning 1 if no elements are higher. 
++define i32 @test_vchlfs_none_bool(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vchlfs_none_bool: ++; CHECK: vchlfs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: risblg %r2, [[REG]], 31, 159, 35 ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 1 ++ %cmp = icmp eq i32 %res, 3 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCHLFS, storing to %ptr if no elements are higher. ++define <4 x i32> @test_vchlfs_none_store(<4 x i32> %a, <4 x i32> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vchlfs_none_store: ++; CHECK-NOT: %r ++; CHECK: vchlfs %v24, %v24, %v26 ++; CHECK-NEXT: {{jno|jle}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vchlfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ %cmp = icmp sge i32 %cc, 3 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <4 x i32> %res ++} ++ ++; VCHLGS with no processing of the result. ++define i32 @test_vchlgs(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vchlgs: ++; CHECK: vchlgs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VCHLGS returning 1 if all elements are higher (CC == 0). ++define i32 @test_vchlgs_all_bool(<2 x i64> %a, <2 x i64> %b) { ++; CHECK-LABEL: test_vchlgs_all_bool: ++; CHECK: vchlgs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: afi %r2, -268435456 ++; CHECK: srl %r2, 31 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp slt i32 %res, 1 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VCHLGS, storing to %ptr if all elements are higher. ++define <2 x i64> @test_vchlgs_all_store(<2 x i64> %a, <2 x i64> %b, i32 *%ptr) { ++; CHECK-LABEL: test_vchlgs_all_store: ++; CHECK-NOT: %r ++; CHECK: vchlgs %v24, %v24, %v26 ++; CHECK-NEXT: {{jnhe|jne}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 0 ++ %cc = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp ule i32 %cc, 0 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <2 x i64> %res ++} ++ ++; VFAEB with !IN !RT. ++define <16 x i8> @test_vfaeb_0(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfaeb_0: ++; CHECK: vfaeb %v24, %v24, %v26, 0 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 0) ++ ret <16 x i8> %res ++} ++ ++; VFAEB with !IN RT. ++define <16 x i8> @test_vfaeb_4(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfaeb_4: ++; CHECK: vfaeb %v24, %v24, %v26, 4 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 4) ++ ret <16 x i8> %res ++} ++ ++; VFAEB with IN !RT. ++define <16 x i8> @test_vfaeb_8(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfaeb_8: ++; CHECK: vfaeb %v24, %v24, %v26, 8 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 8) ++ ret <16 x i8> %res ++} ++ ++; VFAEB with IN RT. 
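++; (The M5 flag operand combines IN = 8 and RT = 4. The CS bit, value 1,
++; is instead represented by the separate CC-setting intrinsics, so
++; test_vfaeb_1 below expects it to be masked off.)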
++define <16 x i8> @test_vfaeb_12(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfaeb_12: ++; CHECK: vfaeb %v24, %v24, %v26, 12 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 12) ++ ret <16 x i8> %res ++} ++ ++; VFAEB with CS -- should be ignored. ++define <16 x i8> @test_vfaeb_1(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfaeb_1: ++; CHECK: vfaeb %v24, %v24, %v26, 0 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfaeb(<16 x i8> %a, <16 x i8> %b, i32 1) ++ ret <16 x i8> %res ++} ++ ++; VFAEH. ++define <8 x i16> @test_vfaeh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vfaeh: ++; CHECK: vfaeh %v24, %v24, %v26, 4 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vfaeh(<8 x i16> %a, <8 x i16> %b, i32 4) ++ ret <8 x i16> %res ++} ++ ++; VFAEF. ++define <4 x i32> @test_vfaef(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vfaef: ++; CHECK: vfaef %v24, %v24, %v26, 8 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vfaef(<4 x i32> %a, <4 x i32> %b, i32 8) ++ ret <4 x i32> %res ++} ++ ++; VFAEBS. ++define <16 x i8> @test_vfaebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfaebs: ++; CHECK: vfaebs %v24, %v24, %v26, 0 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vfaebs(<16 x i8> %a, <16 x i8> %b, ++ i32 0) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <16 x i8> %res ++} ++ ++; VFAEHS. ++define <8 x i16> @test_vfaehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfaehs: ++; CHECK: vfaehs %v24, %v24, %v26, 4 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vfaehs(<8 x i16> %a, <8 x i16> %b, ++ i32 4) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <8 x i16> %res ++} ++ ++; VFAEFS. ++define <4 x i32> @test_vfaefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfaefs: ++; CHECK: vfaefs %v24, %v24, %v26, 8 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vfaefs(<4 x i32> %a, <4 x i32> %b, ++ i32 8) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <4 x i32> %res ++} ++ ++; VFAEZB with !IN !RT. ++define <16 x i8> @test_vfaezb_0(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfaezb_0: ++; CHECK: vfaezb %v24, %v24, %v26, 0 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 0) ++ ret <16 x i8> %res ++} ++ ++; VFAEZB with !IN RT. ++define <16 x i8> @test_vfaezb_4(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfaezb_4: ++; CHECK: vfaezb %v24, %v24, %v26, 4 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 4) ++ ret <16 x i8> %res ++} ++ ++; VFAEZB with IN !RT. ++define <16 x i8> @test_vfaezb_8(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfaezb_8: ++; CHECK: vfaezb %v24, %v24, %v26, 8 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 8) ++ ret <16 x i8> %res ++} ++ ++; VFAEZB with IN RT. 
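++; (The VFAEZ* forms additionally search for a zero element; the IN, RT
++; and CS flags are encoded as for VFAE* above.)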
++define <16 x i8> @test_vfaezb_12(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfaezb_12: ++; CHECK: vfaezb %v24, %v24, %v26, 12 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 12) ++ ret <16 x i8> %res ++} ++ ++; VFAEZB with CS -- should be ignored. ++define <16 x i8> @test_vfaezb_1(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfaezb_1: ++; CHECK: vfaezb %v24, %v24, %v26, 0 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfaezb(<16 x i8> %a, <16 x i8> %b, i32 1) ++ ret <16 x i8> %res ++} ++ ++; VFAEZH. ++define <8 x i16> @test_vfaezh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vfaezh: ++; CHECK: vfaezh %v24, %v24, %v26, 4 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vfaezh(<8 x i16> %a, <8 x i16> %b, i32 4) ++ ret <8 x i16> %res ++} ++ ++; VFAEZF. ++define <4 x i32> @test_vfaezf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vfaezf: ++; CHECK: vfaezf %v24, %v24, %v26, 8 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vfaezf(<4 x i32> %a, <4 x i32> %b, i32 8) ++ ret <4 x i32> %res ++} ++ ++; VFAEZBS. ++define <16 x i8> @test_vfaezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfaezbs: ++; CHECK: vfaezbs %v24, %v24, %v26, 0 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vfaezbs(<16 x i8> %a, <16 x i8> %b, ++ i32 0) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <16 x i8> %res ++} ++ ++; VFAEZHS. ++define <8 x i16> @test_vfaezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfaezhs: ++; CHECK: vfaezhs %v24, %v24, %v26, 4 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vfaezhs(<8 x i16> %a, <8 x i16> %b, ++ i32 4) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <8 x i16> %res ++} ++ ++; VFAEZFS. ++define <4 x i32> @test_vfaezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfaezfs: ++; CHECK: vfaezfs %v24, %v24, %v26, 8 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vfaezfs(<4 x i32> %a, <4 x i32> %b, ++ i32 8) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <4 x i32> %res ++} ++ ++; VFEEB. ++define <16 x i8> @test_vfeeb_0(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfeeb_0: ++; CHECK: vfeeb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfeeb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VFEEH. ++define <8 x i16> @test_vfeeh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vfeeh: ++; CHECK: vfeeh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vfeeh(<8 x i16> %a, <8 x i16> %b) ++ ret <8 x i16> %res ++} ++ ++; VFEEF. ++define <4 x i32> @test_vfeef(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vfeef: ++; CHECK: vfeef %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vfeef(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VFEEBS. 
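++; The CC-setting forms return {result, i32}. The i32 condition code is
++; recovered with the usual idiom: IPM copies CC into bits 28-29 of a GPR
++; and SRL by 28 right-justifies it as a value in the range 0-3.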
++define <16 x i8> @test_vfeebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfeebs: ++; CHECK: vfeebs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vfeebs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <16 x i8> %res ++} ++ ++; VFEEHS. ++define <8 x i16> @test_vfeehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfeehs: ++; CHECK: vfeehs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vfeehs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <8 x i16> %res ++} ++ ++; VFEEFS. ++define <4 x i32> @test_vfeefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfeefs: ++; CHECK: vfeefs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vfeefs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <4 x i32> %res ++} ++ ++; VFEEZB. ++define <16 x i8> @test_vfeezb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfeezb: ++; CHECK: vfeezb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfeezb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VFEEZH. ++define <8 x i16> @test_vfeezh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vfeezh: ++; CHECK: vfeezh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vfeezh(<8 x i16> %a, <8 x i16> %b) ++ ret <8 x i16> %res ++} ++ ++; VFEEZF. ++define <4 x i32> @test_vfeezf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vfeezf: ++; CHECK: vfeezf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vfeezf(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VFEEZBS. ++define <16 x i8> @test_vfeezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfeezbs: ++; CHECK: vfeezbs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vfeezbs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <16 x i8> %res ++} ++ ++; VFEEZHS. ++define <8 x i16> @test_vfeezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfeezhs: ++; CHECK: vfeezhs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vfeezhs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <8 x i16> %res ++} ++ ++; VFEEZFS. 
++define <4 x i32> @test_vfeezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfeezfs: ++; CHECK: vfeezfs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vfeezfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <4 x i32> %res ++} ++ ++; VFENEB. ++define <16 x i8> @test_vfeneb_0(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfeneb_0: ++; CHECK: vfeneb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfeneb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VFENEH. ++define <8 x i16> @test_vfeneh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vfeneh: ++; CHECK: vfeneh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vfeneh(<8 x i16> %a, <8 x i16> %b) ++ ret <8 x i16> %res ++} ++ ++; VFENEF. ++define <4 x i32> @test_vfenef(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vfenef: ++; CHECK: vfenef %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vfenef(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VFENEBS. ++define <16 x i8> @test_vfenebs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfenebs: ++; CHECK: vfenebs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vfenebs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <16 x i8> %res ++} ++ ++; VFENEHS. ++define <8 x i16> @test_vfenehs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfenehs: ++; CHECK: vfenehs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vfenehs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <8 x i16> %res ++} ++ ++; VFENEFS. ++define <4 x i32> @test_vfenefs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfenefs: ++; CHECK: vfenefs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vfenefs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <4 x i32> %res ++} ++ ++; VFENEZB. ++define <16 x i8> @test_vfenezb(<16 x i8> %a, <16 x i8> %b) { ++; CHECK-LABEL: test_vfenezb: ++; CHECK: vfenezb %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vfenezb(<16 x i8> %a, <16 x i8> %b) ++ ret <16 x i8> %res ++} ++ ++; VFENEZH. ++define <8 x i16> @test_vfenezh(<8 x i16> %a, <8 x i16> %b) { ++; CHECK-LABEL: test_vfenezh: ++; CHECK: vfenezh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vfenezh(<8 x i16> %a, <8 x i16> %b) ++ ret <8 x i16> %res ++} ++ ++; VFENEZF. 
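++; VFENE is the counterpart of VFEE, finding the first mismatch rather
++; than the first match; the Z forms again stop early at a zero element.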
++define <4 x i32> @test_vfenezf(<4 x i32> %a, <4 x i32> %b) { ++; CHECK-LABEL: test_vfenezf: ++; CHECK: vfenezf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vfenezf(<4 x i32> %a, <4 x i32> %b) ++ ret <4 x i32> %res ++} ++ ++; VFENEZBS. ++define <16 x i8> @test_vfenezbs(<16 x i8> %a, <16 x i8> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfenezbs: ++; CHECK: vfenezbs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vfenezbs(<16 x i8> %a, <16 x i8> %b) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <16 x i8> %res ++} ++ ++; VFENEZHS. ++define <8 x i16> @test_vfenezhs(<8 x i16> %a, <8 x i16> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfenezhs: ++; CHECK: vfenezhs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vfenezhs(<8 x i16> %a, <8 x i16> %b) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <8 x i16> %res ++} ++ ++; VFENEZFS. ++define <4 x i32> @test_vfenezfs(<4 x i32> %a, <4 x i32> %b, i32 *%ccptr) { ++; CHECK-LABEL: test_vfenezfs: ++; CHECK: vfenezfs %v24, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vfenezfs(<4 x i32> %a, <4 x i32> %b) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <4 x i32> %res ++} ++ ++; VISTRB. ++define <16 x i8> @test_vistrb(<16 x i8> %a) { ++; CHECK-LABEL: test_vistrb: ++; CHECK: vistrb %v24, %v24 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vistrb(<16 x i8> %a) ++ ret <16 x i8> %res ++} ++ ++; VISTRH. ++define <8 x i16> @test_vistrh(<8 x i16> %a) { ++; CHECK-LABEL: test_vistrh: ++; CHECK: vistrh %v24, %v24 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vistrh(<8 x i16> %a) ++ ret <8 x i16> %res ++} ++ ++; VISTRF. ++define <4 x i32> @test_vistrf(<4 x i32> %a) { ++; CHECK-LABEL: test_vistrf: ++; CHECK: vistrf %v24, %v24 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vistrf(<4 x i32> %a) ++ ret <4 x i32> %res ++} ++ ++; VISTRBS. ++define <16 x i8> @test_vistrbs(<16 x i8> %a, i32 *%ccptr) { ++; CHECK-LABEL: test_vistrbs: ++; CHECK: vistrbs %v24, %v24 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vistrbs(<16 x i8> %a) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <16 x i8> %res ++} ++ ++; VISTRHS. ++define <8 x i16> @test_vistrhs(<8 x i16> %a, i32 *%ccptr) { ++; CHECK-LABEL: test_vistrhs: ++; CHECK: vistrhs %v24, %v24 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vistrhs(<8 x i16> %a) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <8 x i16> %res ++} ++ ++; VISTRFS. 
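++; VISTR copies its operand while forcing every element after the first
++; zero element to zero, isolating a NUL-terminated string. The S forms
++; additionally set CC to report whether a zero element was present.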
++define <4 x i32> @test_vistrfs(<4 x i32> %a, i32 *%ccptr) { ++; CHECK-LABEL: test_vistrfs: ++; CHECK: vistrfs %v24, %v24 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vistrfs(<4 x i32> %a) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <4 x i32> %res ++} ++ ++; VSTRCB with !IN !RT. ++define <16 x i8> @test_vstrcb_0(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vstrcb_0: ++; CHECK: vstrcb %v24, %v24, %v26, %v28, 0 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 0) ++ ret <16 x i8> %res ++} ++ ++; VSTRCB with !IN RT. ++define <16 x i8> @test_vstrcb_4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vstrcb_4: ++; CHECK: vstrcb %v24, %v24, %v26, %v28, 4 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 4) ++ ret <16 x i8> %res ++} ++ ++; VSTRCB with IN !RT. ++define <16 x i8> @test_vstrcb_8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vstrcb_8: ++; CHECK: vstrcb %v24, %v24, %v26, %v28, 8 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 8) ++ ret <16 x i8> %res ++} ++ ++; VSTRCB with IN RT. ++define <16 x i8> @test_vstrcb_12(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vstrcb_12: ++; CHECK: vstrcb %v24, %v24, %v26, %v28, 12 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 12) ++ ret <16 x i8> %res ++} ++ ++; VSTRCB with CS -- should be ignored. ++define <16 x i8> @test_vstrcb_1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vstrcb_1: ++; CHECK: vstrcb %v24, %v24, %v26, %v28, 0 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vstrcb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 1) ++ ret <16 x i8> %res ++} ++ ++; VSTRCH. ++define <8 x i16> @test_vstrch(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ++; CHECK-LABEL: test_vstrch: ++; CHECK: vstrch %v24, %v24, %v26, %v28, 4 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vstrch(<8 x i16> %a, <8 x i16> %b, ++ <8 x i16> %c, i32 4) ++ ret <8 x i16> %res ++} ++ ++; VSTRCF. ++define <4 x i32> @test_vstrcf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ++; CHECK-LABEL: test_vstrcf: ++; CHECK: vstrcf %v24, %v24, %v26, %v28, 8 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vstrcf(<4 x i32> %a, <4 x i32> %b, ++ <4 x i32> %c, i32 8) ++ ret <4 x i32> %res ++} ++ ++; VSTRCBS. ++define <16 x i8> @test_vstrcbs(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, ++ i32 *%ccptr) { ++; CHECK-LABEL: test_vstrcbs: ++; CHECK: vstrcbs %v24, %v24, %v26, %v28, 0 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vstrcbs(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 0) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <16 x i8> %res ++} ++ ++; VSTRCHS. 
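++; The VSTRC family is a range compare: in effect, each element of the
++; first operand is tested against ranges formed from the second operand
++; under the per-element control bits of the third. The flag operand
++; reuses the VFAE encoding, and the S forms return CC as above.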
++define <8 x i16> @test_vstrchs(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, ++ i32 *%ccptr) { ++; CHECK-LABEL: test_vstrchs: ++; CHECK: vstrchs %v24, %v24, %v26, %v28, 4 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vstrchs(<8 x i16> %a, <8 x i16> %b, ++ <8 x i16> %c, i32 4) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <8 x i16> %res ++} ++ ++; VSTRCFS. ++define <4 x i32> @test_vstrcfs(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, ++ i32 *%ccptr) { ++; CHECK-LABEL: test_vstrcfs: ++; CHECK: vstrcfs %v24, %v24, %v26, %v28, 8 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vstrcfs(<4 x i32> %a, <4 x i32> %b, ++ <4 x i32> %c, i32 8) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <4 x i32> %res ++} ++ ++; VSTRCZB with !IN !RT. ++define <16 x i8> @test_vstrczb_0(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vstrczb_0: ++; CHECK: vstrczb %v24, %v24, %v26, %v28, 0 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 0) ++ ret <16 x i8> %res ++} ++ ++; VSTRCZB with !IN RT. ++define <16 x i8> @test_vstrczb_4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vstrczb_4: ++; CHECK: vstrczb %v24, %v24, %v26, %v28, 4 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 4) ++ ret <16 x i8> %res ++} ++ ++; VSTRCZB with IN !RT. ++define <16 x i8> @test_vstrczb_8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vstrczb_8: ++; CHECK: vstrczb %v24, %v24, %v26, %v28, 8 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 8) ++ ret <16 x i8> %res ++} ++ ++; VSTRCZB with IN RT. ++define <16 x i8> @test_vstrczb_12(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vstrczb_12: ++; CHECK: vstrczb %v24, %v24, %v26, %v28, 12 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 12) ++ ret <16 x i8> %res ++} ++ ++; VSTRCZB with CS -- should be ignored. ++define <16 x i8> @test_vstrczb_1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ++; CHECK-LABEL: test_vstrczb_1: ++; CHECK: vstrczb %v24, %v24, %v26, %v28, 0 ++; CHECK: br %r14 ++ %res = call <16 x i8> @llvm.s390.vstrczb(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 1) ++ ret <16 x i8> %res ++} ++ ++; VSTRCZH. ++define <8 x i16> @test_vstrczh(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ++; CHECK-LABEL: test_vstrczh: ++; CHECK: vstrczh %v24, %v24, %v26, %v28, 4 ++; CHECK: br %r14 ++ %res = call <8 x i16> @llvm.s390.vstrczh(<8 x i16> %a, <8 x i16> %b, ++ <8 x i16> %c, i32 4) ++ ret <8 x i16> %res ++} ++ ++; VSTRCZF. ++define <4 x i32> @test_vstrczf(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ++; CHECK-LABEL: test_vstrczf: ++; CHECK: vstrczf %v24, %v24, %v26, %v28, 8 ++; CHECK: br %r14 ++ %res = call <4 x i32> @llvm.s390.vstrczf(<4 x i32> %a, <4 x i32> %b, ++ <4 x i32> %c, i32 8) ++ ret <4 x i32> %res ++} ++ ++; VSTRCZBS. 
++define <16 x i8> @test_vstrczbs(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, ++ i32 *%ccptr) { ++; CHECK-LABEL: test_vstrczbs: ++; CHECK: vstrczbs %v24, %v24, %v26, %v28, 0 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<16 x i8>, i32} @llvm.s390.vstrczbs(<16 x i8> %a, <16 x i8> %b, ++ <16 x i8> %c, i32 0) ++ %res = extractvalue {<16 x i8>, i32} %call, 0 ++ %cc = extractvalue {<16 x i8>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <16 x i8> %res ++} ++ ++; VSTRCZHS. ++define <8 x i16> @test_vstrczhs(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, ++ i32 *%ccptr) { ++; CHECK-LABEL: test_vstrczhs: ++; CHECK: vstrczhs %v24, %v24, %v26, %v28, 4 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<8 x i16>, i32} @llvm.s390.vstrczhs(<8 x i16> %a, <8 x i16> %b, ++ <8 x i16> %c, i32 4) ++ %res = extractvalue {<8 x i16>, i32} %call, 0 ++ %cc = extractvalue {<8 x i16>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <8 x i16> %res ++} ++ ++; VSTRCZFS. ++define <4 x i32> @test_vstrczfs(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, ++ i32 *%ccptr) { ++; CHECK-LABEL: test_vstrczfs: ++; CHECK: vstrczfs %v24, %v24, %v26, %v28, 8 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: srl [[REG]], 28 ++; CHECK: st [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %call = call {<4 x i32>, i32} @llvm.s390.vstrczfs(<4 x i32> %a, <4 x i32> %b, ++ <4 x i32> %c, i32 8) ++ %res = extractvalue {<4 x i32>, i32} %call, 0 ++ %cc = extractvalue {<4 x i32>, i32} %call, 1 ++ store i32 %cc, i32 *%ccptr ++ ret <4 x i32> %res ++} ++ ++; VFCEDBS with no processing of the result. ++define i32 @test_vfcedbs(<2 x double> %a, <2 x double> %b) { ++; CHECK-LABEL: test_vfcedbs: ++; CHECK: vfcedbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, ++ <2 x double> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VFCEDBS, returning 1 if any elements are equal (CC != 3). ++define i32 @test_vfcedbs_any_bool(<2 x double> %a, <2 x double> %b) { ++; CHECK-LABEL: test_vfcedbs_any_bool: ++; CHECK: vfcedbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: afi %r2, -536870912 ++; CHECK: srl %r2, 31 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, ++ <2 x double> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp ne i32 %res, 3 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VFCEDBS, storing to %ptr if any elements are equal. ++define <2 x i64> @test_vfcedbs_any_store(<2 x double> %a, <2 x double> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vfcedbs_any_store: ++; CHECK-NOT: %r ++; CHECK: vfcedbs %v24, %v24, %v26 ++; CHECK-NEXT: {{jo|jnle}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vfcedbs(<2 x double> %a, ++ <2 x double> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 0 ++ %cc = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp ule i32 %cc, 2 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <2 x i64> %res ++} ++ ++; VFCHDBS with no processing of the result. 
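++; For the CC-setting vector compares, CC0 means the comparison held for
++; all elements, CC1 for some and CC3 for none; CC2 is never produced.
++; A boolean test therefore needs only one extracted bit (RISBLG), or an
++; add-and-shift of the raw IPM value: AFI with -0x10000000 then SRL 31
++; tests CC == 0, while AFI with -0x20000000 tests CC < 2, i.e. CC != 3.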
++define i32 @test_vfchdbs(<2 x double> %a, <2 x double> %b) { ++; CHECK-LABEL: test_vfchdbs: ++; CHECK: vfchdbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, ++ <2 x double> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VFCHDBS, returning 1 if not all elements are higher. ++define i32 @test_vfchdbs_notall_bool(<2 x double> %a, <2 x double> %b) { ++; CHECK-LABEL: test_vfchdbs_notall_bool: ++; CHECK: vfchdbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: risblg %r2, [[REG]], 31, 159, 36 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, ++ <2 x double> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp sge i32 %res, 1 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VFCHDBS, storing to %ptr if not all elements are higher. ++define <2 x i64> @test_vfchdbs_notall_store(<2 x double> %a, <2 x double> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vfchdbs_notall_store: ++; CHECK-NOT: %r ++; CHECK: vfchdbs %v24, %v24, %v26 ++; CHECK-NEXT: {{jhe|je}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vfchdbs(<2 x double> %a, ++ <2 x double> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 0 ++ %cc = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp ugt i32 %cc, 0 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <2 x i64> %res ++} ++ ++; VFCHEDBS with no processing of the result. ++define i32 @test_vfchedbs(<2 x double> %a, <2 x double> %b) { ++; CHECK-LABEL: test_vfchedbs: ++; CHECK: vfchedbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, ++ <2 x double> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VFCHEDBS, returning 1 if neither element is higher or equal. ++define i32 @test_vfchedbs_none_bool(<2 x double> %a, <2 x double> %b) { ++; CHECK-LABEL: test_vfchedbs_none_bool: ++; CHECK: vfchedbs {{%v[0-9]+}}, %v24, %v26 ++; CHECK: ipm [[REG:%r[0-5]]] ++; CHECK: risblg %r2, [[REG]], 31, 159, 35 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, ++ <2 x double> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp eq i32 %res, 3 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VFCHEDBS, storing to %ptr if neither element is higher or equal. ++define <2 x i64> @test_vfchedbs_none_store(<2 x double> %a, <2 x double> %b, ++ i32 *%ptr) { ++; CHECK-LABEL: test_vfchedbs_none_store: ++; CHECK-NOT: %r ++; CHECK: vfchedbs %v24, %v24, %v26 ++; CHECK-NEXT: {{jno|jle}} {{\.L*}} ++; CHECK: mvhi 0(%r2), 0 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vfchedbs(<2 x double> %a, ++ <2 x double> %b) ++ %res = extractvalue {<2 x i64>, i32} %call, 0 ++ %cc = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp uge i32 %cc, 3 ++ br i1 %cmp, label %store, label %exit ++ ++store: ++ store i32 0, i32 *%ptr ++ br label %exit ++ ++exit: ++ ret <2 x i64> %res ++} ++ ++; VFTCIDB with the lowest useful class selector and no processing of the result. 
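++; VFTCI tests each element against a 12-bit class mask, one bit per
++; sign/class combination (zero, normal, subnormal, infinity, QNaN, SNaN),
++; with CC again reporting all/some/none; hence 1 and 4094 are the extreme
++; useful selectors.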
++define i32 @test_vftcidb(<2 x double> %a) { ++; CHECK-LABEL: test_vftcidb: ++; CHECK: vftcidb {{%v[0-9]+}}, %v24, 1 ++; CHECK: ipm %r2 ++; CHECK: srl %r2, 28 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double> %a, i32 1) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ ret i32 %res ++} ++ ++; VFTCIDB with the highest useful class selector, returning 1 if all elements ++; have the right class (CC == 0). ++define i32 @test_vftcidb_all_bool(<2 x double> %a) { ++; CHECK-LABEL: test_vftcidb_all_bool: ++; CHECK: vftcidb {{%v[0-9]+}}, %v24, 4094 ++; CHECK: afi %r2, -268435456 ++; CHECK: srl %r2, 31 ++; CHECK: br %r14 ++ %call = call {<2 x i64>, i32} @llvm.s390.vftcidb(<2 x double> %a, i32 4094) ++ %res = extractvalue {<2 x i64>, i32} %call, 1 ++ %cmp = icmp eq i32 %res, 0 ++ %ext = zext i1 %cmp to i32 ++ ret i32 %ext ++} ++ ++; VFIDB with a rounding mode not usable via standard intrinsics. ++define <2 x double> @test_vfidb_0_4(<2 x double> %a) { ++; CHECK-LABEL: test_vfidb_0_4: ++; CHECK: vfidb %v24, %v24, 0, 4 ++; CHECK: br %r14 ++ %res = call <2 x double> @llvm.s390.vfidb(<2 x double> %a, i32 0, i32 4) ++ ret <2 x double> %res ++} ++ ++; VFIDB with IEEE-inexact exception suppressed. ++define <2 x double> @test_vfidb_4_0(<2 x double> %a) { ++; CHECK-LABEL: test_vfidb_4_0: ++; CHECK: vfidb %v24, %v24, 4, 0 ++; CHECK: br %r14 ++ %res = call <2 x double> @llvm.s390.vfidb(<2 x double> %a, i32 4, i32 0) ++ ret <2 x double> %res ++} ++ +Index: llvm-36/test/CodeGen/SystemZ/vec-log-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-log-01.ll +@@ -0,0 +1,15 @@ ++; Test v2f64 logarithm. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++declare <2 x double> @llvm.log.v2f64(<2 x double>) ++ ++define <2 x double> @f1(<2 x double> %val) { ++; CHECK-LABEL: f1: ++; CHECK: brasl %r14, log@PLT ++; CHECK: brasl %r14, log@PLT ++; CHECK: vmrhg %v24, ++; CHECK: br %r14 ++ %ret = call <2 x double> @llvm.log.v2f64(<2 x double> %val) ++ ret <2 x double> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-max-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-max-01.ll +@@ -0,0 +1,83 @@ ++; Test v16i8 maximum. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test with slt. ++define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp slt <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 ++ ret <16 x i8> %ret ++} ++ ++; Test with sle. ++define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sle <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 ++ ret <16 x i8> %ret ++} ++ ++; Test with sgt. ++define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sgt <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test with sge. 
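++; Each of the eight predicate/operand spellings in this file reduces to a
++; single vector max: signed comparisons select VMX and unsigned ones
++; VMXL. Max is commutative, so the CHECK patterns accept either operand
++; order.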
++define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vmxb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sge <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test with ult. ++define <16 x i8> @f5(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ult <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 ++ ret <16 x i8> %ret ++} ++ ++; Test with ule. ++define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ule <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 ++ ret <16 x i8> %ret ++} ++ ++; Test with ugt. ++define <16 x i8> @f7(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ugt <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test with uge. ++define <16 x i8> @f8(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vmxlb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp uge <16 x i8> %val1, %val2 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 ++ ret <16 x i8> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-max-02.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-max-02.ll +@@ -0,0 +1,83 @@ ++; Test v8i16 maximum. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test with slt. ++define <8 x i16> @f1(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp slt <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 ++ ret <8 x i16> %ret ++} ++ ++; Test with sle. ++define <8 x i16> @f2(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sle <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 ++ ret <8 x i16> %ret ++} ++ ++; Test with sgt. ++define <8 x i16> @f3(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sgt <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test with sge. ++define <8 x i16> @f4(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vmxh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sge <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test with ult. ++define <8 x i16> @f5(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ult <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 ++ ret <8 x i16> %ret ++} ++ ++; Test with ule. 
++define <8 x i16> @f6(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ule <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 ++ ret <8 x i16> %ret ++} ++ ++; Test with ugt. ++define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ugt <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test with uge. ++define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vmxlh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp uge <8 x i16> %val1, %val2 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 ++ ret <8 x i16> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-max-03.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-max-03.ll +@@ -0,0 +1,83 @@ ++; Test v4i32 maximum. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test with slt. ++define <4 x i32> @f1(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp slt <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 ++ ret <4 x i32> %ret ++} ++ ++; Test with sle. ++define <4 x i32> @f2(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sle <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 ++ ret <4 x i32> %ret ++} ++ ++; Test with sgt. ++define <4 x i32> @f3(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sgt <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test with sge. ++define <4 x i32> @f4(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vmxf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sge <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test with ult. ++define <4 x i32> @f5(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ult <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 ++ ret <4 x i32> %ret ++} ++ ++; Test with ule. ++define <4 x i32> @f6(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ule <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 ++ ret <4 x i32> %ret ++} ++ ++; Test with ugt. ++define <4 x i32> @f7(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ugt <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test with uge. 
++define <4 x i32> @f8(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vmxlf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp uge <4 x i32> %val1, %val2 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 ++ ret <4 x i32> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-max-04.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-max-04.ll +@@ -0,0 +1,83 @@ ++; Test v2i64 maximum. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test with slt. ++define <2 x i64> @f1(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp slt <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 ++ ret <2 x i64> %ret ++} ++ ++; Test with sle. ++define <2 x i64> @f2(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sle <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 ++ ret <2 x i64> %ret ++} ++ ++; Test with sgt. ++define <2 x i64> @f3(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sgt <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test with sge. ++define <2 x i64> @f4(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vmxg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sge <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test with ult. ++define <2 x i64> @f5(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ult <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 ++ ret <2 x i64> %ret ++} ++ ++; Test with ule. ++define <2 x i64> @f6(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ule <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 ++ ret <2 x i64> %ret ++} ++ ++; Test with ugt. ++define <2 x i64> @f7(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ugt <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test with uge. ++define <2 x i64> @f8(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vmxlg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp uge <2 x i64> %val1, %val2 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-min-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-min-01.ll +@@ -0,0 +1,83 @@ ++; Test v16i8 minimum. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test with slt. 
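++; These mirror the max tests with the icmp operands swapped, so the same
++; select idiom now matches the signed VMN and unsigned VMNL minimums.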
++define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp slt <16 x i8> %val2, %val1 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 ++ ret <16 x i8> %ret ++} ++ ++; Test with sle. ++define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sle <16 x i8> %val2, %val1 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 ++ ret <16 x i8> %ret ++} ++ ++; Test with sgt. ++define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sgt <16 x i8> %val2, %val1 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test with sge. ++define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vmnb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sge <16 x i8> %val2, %val1 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test with ult. ++define <16 x i8> @f5(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ult <16 x i8> %val2, %val1 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 ++ ret <16 x i8> %ret ++} ++ ++; Test with ule. ++define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ule <16 x i8> %val2, %val1 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val2, <16 x i8> %val1 ++ ret <16 x i8> %ret ++} ++ ++; Test with ugt. ++define <16 x i8> @f7(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ugt <16 x i8> %val2, %val1 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test with uge. ++define <16 x i8> @f8(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vmnlb %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp uge <16 x i8> %val2, %val1 ++ %ret = select <16 x i1> %cmp, <16 x i8> %val1, <16 x i8> %val2 ++ ret <16 x i8> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-min-02.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-min-02.ll +@@ -0,0 +1,83 @@ ++; Test v8i16 minimum. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test with slt. ++define <8 x i16> @f1(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp slt <8 x i16> %val2, %val1 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 ++ ret <8 x i16> %ret ++} ++ ++; Test with sle. ++define <8 x i16> @f2(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sle <8 x i16> %val2, %val1 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 ++ ret <8 x i16> %ret ++} ++ ++; Test with sgt. 
++define <8 x i16> @f3(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sgt <8 x i16> %val2, %val1 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test with sge. ++define <8 x i16> @f4(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vmnh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sge <8 x i16> %val2, %val1 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test with ult. ++define <8 x i16> @f5(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ult <8 x i16> %val2, %val1 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 ++ ret <8 x i16> %ret ++} ++ ++; Test with ule. ++define <8 x i16> @f6(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ule <8 x i16> %val2, %val1 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val2, <8 x i16> %val1 ++ ret <8 x i16> %ret ++} ++ ++; Test with ugt. ++define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ugt <8 x i16> %val2, %val1 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test with uge. ++define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vmnlh %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp uge <8 x i16> %val2, %val1 ++ %ret = select <8 x i1> %cmp, <8 x i16> %val1, <8 x i16> %val2 ++ ret <8 x i16> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-min-03.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-min-03.ll +@@ -0,0 +1,83 @@ ++; Test v4i32 minimum. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test with slt. ++define <4 x i32> @f1(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp slt <4 x i32> %val2, %val1 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 ++ ret <4 x i32> %ret ++} ++ ++; Test with sle. ++define <4 x i32> @f2(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sle <4 x i32> %val2, %val1 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 ++ ret <4 x i32> %ret ++} ++ ++; Test with sgt. ++define <4 x i32> @f3(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sgt <4 x i32> %val2, %val1 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test with sge. ++define <4 x i32> @f4(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vmnf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sge <4 x i32> %val2, %val1 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test with ult. 
++define <4 x i32> @f5(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ult <4 x i32> %val2, %val1 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 ++ ret <4 x i32> %ret ++} ++ ++; Test with ule. ++define <4 x i32> @f6(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ule <4 x i32> %val2, %val1 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val2, <4 x i32> %val1 ++ ret <4 x i32> %ret ++} ++ ++; Test with ugt. ++define <4 x i32> @f7(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ugt <4 x i32> %val2, %val1 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test with uge. ++define <4 x i32> @f8(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vmnlf %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp uge <4 x i32> %val2, %val1 ++ %ret = select <4 x i1> %cmp, <4 x i32> %val1, <4 x i32> %val2 ++ ret <4 x i32> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-min-04.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-min-04.ll +@@ -0,0 +1,83 @@ ++; Test v2i64 minimum. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test with slt. ++define <2 x i64> @f1(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp slt <2 x i64> %val2, %val1 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 ++ ret <2 x i64> %ret ++} ++ ++; Test with sle. ++define <2 x i64> @f2(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sle <2 x i64> %val2, %val1 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 ++ ret <2 x i64> %ret ++} ++ ++; Test with sgt. ++define <2 x i64> @f3(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sgt <2 x i64> %val2, %val1 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test with sge. ++define <2 x i64> @f4(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vmng %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp sge <2 x i64> %val2, %val1 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test with ult. ++define <2 x i64> @f5(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ult <2 x i64> %val2, %val1 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 ++ ret <2 x i64> %ret ++} ++ ++; Test with ule. ++define <2 x i64> @f6(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ule <2 x i64> %val2, %val1 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val2, <2 x i64> %val1 ++ ret <2 x i64> %ret ++} ++ ++; Test with ugt. 
++define <2 x i64> @f7(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp ugt <2 x i64> %val2, %val1 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test with uge. ++define <2 x i64> @f8(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vmnlg %v24, {{%v24, %v26|%v26, %v24}} ++; CHECK: br %r14 ++ %cmp = icmp uge <2 x i64> %val2, %val1 ++ %ret = select <2 x i1> %cmp, <2 x i64> %val1, <2 x i64> %val2 ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-01.ll +@@ -0,0 +1,107 @@ ++; Test vector register moves. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test v16i8 moves. ++define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <16 x i8> %val2 ++} ++ ++; Test v8i16 moves. ++define <8 x i16> @f2(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <8 x i16> %val2 ++} ++ ++; Test v4i32 moves. ++define <4 x i32> @f3(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <4 x i32> %val2 ++} ++ ++; Test v2i64 moves. ++define <2 x i64> @f4(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <2 x i64> %val2 ++} ++ ++; Test v4f32 moves. ++define <4 x float> @f5(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <4 x float> %val2 ++} ++ ++; Test v2f64 moves. ++define <2 x double> @f6(<2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <2 x double> %val2 ++} ++ ++; Test v2i8 moves. ++define <2 x i8> @f7(<2 x i8> %val1, <2 x i8> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <2 x i8> %val2 ++} ++ ++; Test v4i8 moves. ++define <4 x i8> @f8(<4 x i8> %val1, <4 x i8> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <4 x i8> %val2 ++} ++ ++; Test v8i8 moves. ++define <8 x i8> @f9(<8 x i8> %val1, <8 x i8> %val2) { ++; CHECK-LABEL: f9: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <8 x i8> %val2 ++} ++ ++; Test v2i16 moves. ++define <2 x i16> @f10(<2 x i16> %val1, <2 x i16> %val2) { ++; CHECK-LABEL: f10: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <2 x i16> %val2 ++} ++ ++; Test v4i16 moves. ++define <4 x i16> @f11(<4 x i16> %val1, <4 x i16> %val2) { ++; CHECK-LABEL: f11: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <4 x i16> %val2 ++} ++ ++; Test v2i32 moves. ++define <2 x i32> @f12(<2 x i32> %val1, <2 x i32> %val2) { ++; CHECK-LABEL: f12: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <2 x i32> %val2 ++} ++ ++; Test v2f32 moves. ++define <2 x float> @f13(<2 x float> %val1, <2 x float> %val2) { ++; CHECK-LABEL: f13: ++; CHECK: vlr %v24, %v26 ++; CHECK: br %r14 ++ ret <2 x float> %val2 ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-02.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-02.ll +@@ -0,0 +1,174 @@ ++; Test vector loads. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test v16i8 loads. 
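++; VL encodes a 12-bit unsigned displacement, so 4080 (255 * 16) is the
++; highest aligned offset and 4095 the highest byte offset reachable
++; directly; anything larger or negative needs separate address arithmetic
++; (AGHI), and an index register is available. Vectors narrower than 128
++; bits are loaded here with the replicating VLREP loads.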
++define <16 x i8> @f1(<16 x i8> *%ptr) { ++; CHECK-LABEL: f1: ++; CHECK: vl %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <16 x i8> *%ptr ++ ret <16 x i8> %ret ++} ++ ++; Test v8i16 loads. ++define <8 x i16> @f2(<8 x i16> *%ptr) { ++; CHECK-LABEL: f2: ++; CHECK: vl %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <8 x i16> *%ptr ++ ret <8 x i16> %ret ++} ++ ++; Test v4i32 loads. ++define <4 x i32> @f3(<4 x i32> *%ptr) { ++; CHECK-LABEL: f3: ++; CHECK: vl %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <4 x i32> *%ptr ++ ret <4 x i32> %ret ++} ++ ++; Test v2i64 loads. ++define <2 x i64> @f4(<2 x i64> *%ptr) { ++; CHECK-LABEL: f4: ++; CHECK: vl %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <2 x i64> *%ptr ++ ret <2 x i64> %ret ++} ++ ++; Test v4f32 loads. ++define <4 x float> @f5(<4 x float> *%ptr) { ++; CHECK-LABEL: f5: ++; CHECK: vl %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <4 x float> *%ptr ++ ret <4 x float> %ret ++} ++ ++; Test v2f64 loads. ++define <2 x double> @f6(<2 x double> *%ptr) { ++; CHECK-LABEL: f6: ++; CHECK: vl %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <2 x double> *%ptr ++ ret <2 x double> %ret ++} ++ ++; Test the highest aligned in-range offset. ++define <16 x i8> @f7(<16 x i8> *%base) { ++; CHECK-LABEL: f7: ++; CHECK: vl %v24, 4080(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr <16 x i8> *%base, i64 255 ++ %ret = load <16 x i8> *%ptr ++ ret <16 x i8> %ret ++} ++ ++; Test the highest unaligned in-range offset. ++define <16 x i8> @f8(i8 *%base) { ++; CHECK-LABEL: f8: ++; CHECK: vl %v24, 4095(%r2) ++; CHECK: br %r14 ++ %addr = getelementptr i8 *%base, i64 4095 ++ %ptr = bitcast i8 *%addr to <16 x i8> * ++ %ret = load <16 x i8> *%ptr, align 1 ++ ret <16 x i8> %ret ++} ++ ++; Test the next offset up, which requires separate address logic, ++define <16 x i8> @f9(<16 x i8> *%base) { ++; CHECK-LABEL: f9: ++; CHECK: aghi %r2, 4096 ++; CHECK: vl %v24, 0(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr <16 x i8> *%base, i64 256 ++ %ret = load <16 x i8> *%ptr ++ ret <16 x i8> %ret ++} ++ ++; Test negative offsets, which also require separate address logic, ++define <16 x i8> @f10(<16 x i8> *%base) { ++; CHECK-LABEL: f10: ++; CHECK: aghi %r2, -16 ++; CHECK: vl %v24, 0(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr <16 x i8> *%base, i64 -1 ++ %ret = load <16 x i8> *%ptr ++ ret <16 x i8> %ret ++} ++ ++; Check that indexes are allowed. ++define <16 x i8> @f11(i8 *%base, i64 %index) { ++; CHECK-LABEL: f11: ++; CHECK: vl %v24, 0(%r3,%r2) ++; CHECK: br %r14 ++ %addr = getelementptr i8 *%base, i64 %index ++ %ptr = bitcast i8 *%addr to <16 x i8> * ++ %ret = load <16 x i8> *%ptr, align 1 ++ ret <16 x i8> %ret ++} ++ ++; Test v2i8 loads. ++define <2 x i8> @f12(<2 x i8> *%ptr) { ++; CHECK-LABEL: f12: ++; CHECK: vlreph %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <2 x i8> *%ptr ++ ret <2 x i8> %ret ++} ++ ++; Test v4i8 loads. ++define <4 x i8> @f13(<4 x i8> *%ptr) { ++; CHECK-LABEL: f13: ++; CHECK: vlrepf %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <4 x i8> *%ptr ++ ret <4 x i8> %ret ++} ++ ++; Test v8i8 loads. ++define <8 x i8> @f14(<8 x i8> *%ptr) { ++; CHECK-LABEL: f14: ++; CHECK: vlrepg %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <8 x i8> *%ptr ++ ret <8 x i8> %ret ++} ++ ++; Test v2i16 loads. ++define <2 x i16> @f15(<2 x i16> *%ptr) { ++; CHECK-LABEL: f15: ++; CHECK: vlrepf %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <2 x i16> *%ptr ++ ret <2 x i16> %ret ++} ++ ++; Test v4i16 loads. 
++define <4 x i16> @f16(<4 x i16> *%ptr) { ++; CHECK-LABEL: f16: ++; CHECK: vlrepg %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <4 x i16> *%ptr ++ ret <4 x i16> %ret ++} ++ ++; Test v2i32 loads. ++define <2 x i32> @f17(<2 x i32> *%ptr) { ++; CHECK-LABEL: f17: ++; CHECK: vlrepg %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <2 x i32> *%ptr ++ ret <2 x i32> %ret ++} ++ ++; Test v2f32 loads. ++define <2 x float> @f18(<2 x float> *%ptr) { ++; CHECK-LABEL: f18: ++; CHECK: vlrepg %v24, 0(%r2) ++; CHECK: br %r14 ++ %ret = load <2 x float> *%ptr ++ ret <2 x float> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-03.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-03.ll +@@ -0,0 +1,174 @@ ++; Test vector stores. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test v16i8 stores. ++define void @f1(<16 x i8> %val, <16 x i8> *%ptr) { ++; CHECK-LABEL: f1: ++; CHECK: vst %v24, 0(%r2) ++; CHECK: br %r14 ++ store <16 x i8> %val, <16 x i8> *%ptr ++ ret void ++} ++ ++; Test v8i16 stores. ++define void @f2(<8 x i16> %val, <8 x i16> *%ptr) { ++; CHECK-LABEL: f2: ++; CHECK: vst %v24, 0(%r2) ++; CHECK: br %r14 ++ store <8 x i16> %val, <8 x i16> *%ptr ++ ret void ++} ++ ++; Test v4i32 stores. ++define void @f3(<4 x i32> %val, <4 x i32> *%ptr) { ++; CHECK-LABEL: f3: ++; CHECK: vst %v24, 0(%r2) ++; CHECK: br %r14 ++ store <4 x i32> %val, <4 x i32> *%ptr ++ ret void ++} ++ ++; Test v2i64 stores. ++define void @f4(<2 x i64> %val, <2 x i64> *%ptr) { ++; CHECK-LABEL: f4: ++; CHECK: vst %v24, 0(%r2) ++; CHECK: br %r14 ++ store <2 x i64> %val, <2 x i64> *%ptr ++ ret void ++} ++ ++; Test v4f32 stores. ++define void @f5(<4 x float> %val, <4 x float> *%ptr) { ++; CHECK-LABEL: f5: ++; CHECK: vst %v24, 0(%r2) ++; CHECK: br %r14 ++ store <4 x float> %val, <4 x float> *%ptr ++ ret void ++} ++ ++; Test v2f64 stores. ++define void @f6(<2 x double> %val, <2 x double> *%ptr) { ++; CHECK-LABEL: f6: ++; CHECK: vst %v24, 0(%r2) ++; CHECK: br %r14 ++ store <2 x double> %val, <2 x double> *%ptr ++ ret void ++} ++ ++; Test the highest aligned in-range offset. ++define void @f7(<16 x i8> %val, <16 x i8> *%base) { ++; CHECK-LABEL: f7: ++; CHECK: vst %v24, 4080(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr <16 x i8> *%base, i64 255 ++ store <16 x i8> %val, <16 x i8> *%ptr ++ ret void ++} ++ ++; Test the highest unaligned in-range offset. ++define void @f8(<16 x i8> %val, i8 *%base) { ++; CHECK-LABEL: f8: ++; CHECK: vst %v24, 4095(%r2) ++; CHECK: br %r14 ++ %addr = getelementptr i8 *%base, i64 4095 ++ %ptr = bitcast i8 *%addr to <16 x i8> * ++ store <16 x i8> %val, <16 x i8> *%ptr, align 1 ++ ret void ++} ++ ++; Test the next offset up, which requires separate address logic, ++define void @f9(<16 x i8> %val, <16 x i8> *%base) { ++; CHECK-LABEL: f9: ++; CHECK: aghi %r2, 4096 ++; CHECK: vst %v24, 0(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr <16 x i8> *%base, i64 256 ++ store <16 x i8> %val, <16 x i8> *%ptr ++ ret void ++} ++ ++; Test negative offsets, which also require separate address logic, ++define void @f10(<16 x i8> %val, <16 x i8> *%base) { ++; CHECK-LABEL: f10: ++; CHECK: aghi %r2, -16 ++; CHECK: vst %v24, 0(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr <16 x i8> *%base, i64 -1 ++ store <16 x i8> %val, <16 x i8> *%ptr ++ ret void ++} ++ ++; Check that indexes are allowed. 
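++; (The store tests mirror the loads; vectors narrower than 128 bits are
++; stored with the VSTEH/VSTEF/VSTEG element stores further below.)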
++define void @f11(<16 x i8> %val, i8 *%base, i64 %index) { ++; CHECK-LABEL: f11: ++; CHECK: vst %v24, 0(%r3,%r2) ++; CHECK: br %r14 ++ %addr = getelementptr i8 *%base, i64 %index ++ %ptr = bitcast i8 *%addr to <16 x i8> * ++ store <16 x i8> %val, <16 x i8> *%ptr, align 1 ++ ret void ++} ++ ++; Test v2i8 stores. ++define void @f12(<2 x i8> %val, <2 x i8> *%ptr) { ++; CHECK-LABEL: f12: ++; CHECK: vsteh %v24, 0(%r2), 0 ++; CHECK: br %r14 ++ store <2 x i8> %val, <2 x i8> *%ptr ++ ret void ++} ++ ++; Test v4i8 stores. ++define void @f13(<4 x i8> %val, <4 x i8> *%ptr) { ++; CHECK-LABEL: f13: ++; CHECK: vstef %v24, 0(%r2) ++; CHECK: br %r14 ++ store <4 x i8> %val, <4 x i8> *%ptr ++ ret void ++} ++ ++; Test v8i8 stores. ++define void @f14(<8 x i8> %val, <8 x i8> *%ptr) { ++; CHECK-LABEL: f14: ++; CHECK: vsteg %v24, 0(%r2) ++; CHECK: br %r14 ++ store <8 x i8> %val, <8 x i8> *%ptr ++ ret void ++} ++ ++; Test v2i16 stores. ++define void @f15(<2 x i16> %val, <2 x i16> *%ptr) { ++; CHECK-LABEL: f15: ++; CHECK: vstef %v24, 0(%r2), 0 ++; CHECK: br %r14 ++ store <2 x i16> %val, <2 x i16> *%ptr ++ ret void ++} ++ ++; Test v4i16 stores. ++define void @f16(<4 x i16> %val, <4 x i16> *%ptr) { ++; CHECK-LABEL: f16: ++; CHECK: vsteg %v24, 0(%r2) ++; CHECK: br %r14 ++ store <4 x i16> %val, <4 x i16> *%ptr ++ ret void ++} ++ ++; Test v2i32 stores. ++define void @f17(<2 x i32> %val, <2 x i32> *%ptr) { ++; CHECK-LABEL: f17: ++; CHECK: vsteg %v24, 0(%r2), 0 ++; CHECK: br %r14 ++ store <2 x i32> %val, <2 x i32> *%ptr ++ ret void ++} ++ ++; Test v2f32 stores. ++define void @f18(<2 x float> %val, <2 x float> *%ptr) { ++; CHECK-LABEL: f18: ++; CHECK: vsteg %v24, 0(%r2), 0 ++; CHECK: br %r14 ++ store <2 x float> %val, <2 x float> *%ptr ++ ret void ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-04.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-04.ll +@@ -0,0 +1,179 @@ ++; Test vector insertion of register variables. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test v16i8 insertion into the first element. ++define <16 x i8> @f1(<16 x i8> %val, i8 %element) { ++; CHECK-LABEL: f1: ++; CHECK: vlvgb %v24, %r2, 0 ++; CHECK: br %r14 ++ %ret = insertelement <16 x i8> %val, i8 %element, i32 0 ++ ret <16 x i8> %ret ++} ++ ++; Test v16i8 insertion into the last element. ++define <16 x i8> @f2(<16 x i8> %val, i8 %element) { ++; CHECK-LABEL: f2: ++; CHECK: vlvgb %v24, %r2, 15 ++; CHECK: br %r14 ++ %ret = insertelement <16 x i8> %val, i8 %element, i32 15 ++ ret <16 x i8> %ret ++} ++ ++; Test v16i8 insertion into a variable element. ++define <16 x i8> @f3(<16 x i8> %val, i8 %element, i32 %index) { ++; CHECK-LABEL: f3: ++; CHECK: vlvgb %v24, %r2, 0(%r3) ++; CHECK: br %r14 ++ %ret = insertelement <16 x i8> %val, i8 %element, i32 %index ++ ret <16 x i8> %ret ++} ++ ++; Test v8i16 insertion into the first element. ++define <8 x i16> @f4(<8 x i16> %val, i16 %element) { ++; CHECK-LABEL: f4: ++; CHECK: vlvgh %v24, %r2, 0 ++; CHECK: br %r14 ++ %ret = insertelement <8 x i16> %val, i16 %element, i32 0 ++ ret <8 x i16> %ret ++} ++ ++; Test v8i16 insertion into the last element. ++define <8 x i16> @f5(<8 x i16> %val, i16 %element) { ++; CHECK-LABEL: f5: ++; CHECK: vlvgh %v24, %r2, 7 ++; CHECK: br %r14 ++ %ret = insertelement <8 x i16> %val, i16 %element, i32 7 ++ ret <8 x i16> %ret ++} ++ ++; Test v8i16 insertion into a variable element. 
++define <8 x i16> @f6(<8 x i16> %val, i16 %element, i32 %index) { ++; CHECK-LABEL: f6: ++; CHECK: vlvgh %v24, %r2, 0(%r3) ++; CHECK: br %r14 ++ %ret = insertelement <8 x i16> %val, i16 %element, i32 %index ++ ret <8 x i16> %ret ++} ++ ++; Test v4i32 insertion into the first element. ++define <4 x i32> @f7(<4 x i32> %val, i32 %element) { ++; CHECK-LABEL: f7: ++; CHECK: vlvgf %v24, %r2, 0 ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> %val, i32 %element, i32 0 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion into the last element. ++define <4 x i32> @f8(<4 x i32> %val, i32 %element) { ++; CHECK-LABEL: f8: ++; CHECK: vlvgf %v24, %r2, 3 ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> %val, i32 %element, i32 3 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion into a variable element. ++define <4 x i32> @f9(<4 x i32> %val, i32 %element, i32 %index) { ++; CHECK-LABEL: f9: ++; CHECK: vlvgf %v24, %r2, 0(%r3) ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> %val, i32 %element, i32 %index ++ ret <4 x i32> %ret ++} ++ ++; Test v2i64 insertion into the first element. ++define <2 x i64> @f10(<2 x i64> %val, i64 %element) { ++; CHECK-LABEL: f10: ++; CHECK: vlvgg %v24, %r2, 0 ++; CHECK: br %r14 ++ %ret = insertelement <2 x i64> %val, i64 %element, i32 0 ++ ret <2 x i64> %ret ++} ++ ++; Test v2i64 insertion into the last element. ++define <2 x i64> @f11(<2 x i64> %val, i64 %element) { ++; CHECK-LABEL: f11: ++; CHECK: vlvgg %v24, %r2, 1 ++; CHECK: br %r14 ++ %ret = insertelement <2 x i64> %val, i64 %element, i32 1 ++ ret <2 x i64> %ret ++} ++ ++; Test v2i64 insertion into a variable element. ++define <2 x i64> @f12(<2 x i64> %val, i64 %element, i32 %index) { ++; CHECK-LABEL: f12: ++; CHECK: vlvgg %v24, %r2, 0(%r3) ++; CHECK: br %r14 ++ %ret = insertelement <2 x i64> %val, i64 %element, i32 %index ++ ret <2 x i64> %ret ++} ++ ++; Test v4f32 insertion into the first element. ++define <4 x float> @f13(<4 x float> %val, float %element) { ++; CHECK-LABEL: f13: ++; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0 ++; CHECK: vlvgf %v24, [[REG]], 0 ++; CHECK: br %r14 ++ %ret = insertelement <4 x float> %val, float %element, i32 0 ++ ret <4 x float> %ret ++} ++ ++; Test v4f32 insertion into the last element. ++define <4 x float> @f14(<4 x float> %val, float %element) { ++; CHECK-LABEL: f14: ++; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0 ++; CHECK: vlvgf %v24, [[REG]], 3 ++; CHECK: br %r14 ++ %ret = insertelement <4 x float> %val, float %element, i32 3 ++ ret <4 x float> %ret ++} ++ ++; Test v4f32 insertion into a variable element. ++define <4 x float> @f15(<4 x float> %val, float %element, i32 %index) { ++; CHECK-LABEL: f15: ++; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0 ++; CHECK: vlvgf %v24, [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %ret = insertelement <4 x float> %val, float %element, i32 %index ++ ret <4 x float> %ret ++} ++ ++; Test v2f64 insertion into the first element. ++define <2 x double> @f16(<2 x double> %val, double %element) { ++; CHECK-LABEL: f16: ++; CHECK: vpdi %v24, %v0, %v24, 1 ++; CHECK: br %r14 ++ %ret = insertelement <2 x double> %val, double %element, i32 0 ++ ret <2 x double> %ret ++} ++ ++; Test v2f64 insertion into the last element. ++define <2 x double> @f17(<2 x double> %val, double %element) { ++; CHECK-LABEL: f17: ++; CHECK: vpdi %v24, %v24, %v0, 0 ++; CHECK: br %r14 ++ %ret = insertelement <2 x double> %val, double %element, i32 1 ++ ret <2 x double> %ret ++} ++ ++; Test v2f64 insertion into a variable element. 
++define <2 x double> @f18(<2 x double> %val, double %element, i32 %index) {
++; CHECK-LABEL: f18:
++; CHECK: lgdr [[REG:%r[0-5]]], %f0
++; CHECK: vlvgg %v24, [[REG]], 0(%r2)
++; CHECK: br %r14
++  %ret = insertelement <2 x double> %val, double %element, i32 %index
++  ret <2 x double> %ret
++}
++
++; Test v16i8 insertion into a variable element plus one.
++define <16 x i8> @f19(<16 x i8> %val, i8 %element, i32 %index) {
++; CHECK-LABEL: f19:
++; CHECK: vlvgb %v24, %r2, 1(%r3)
++; CHECK: br %r14
++  %add = add i32 %index, 1
++  %ret = insertelement <16 x i8> %val, i8 %element, i32 %add
++  ret <16 x i8> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-move-05.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-move-05.ll
+@@ -0,0 +1,249 @@
++; Test vector extraction.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test v16i8 extraction of the first element.
++define i8 @f1(<16 x i8> %val) {
++; CHECK-LABEL: f1:
++; CHECK: vlgvb %r2, %v24, 0
++; CHECK: br %r14
++  %ret = extractelement <16 x i8> %val, i32 0
++  ret i8 %ret
++}
++
++; Test v16i8 extraction of the last element.
++define i8 @f2(<16 x i8> %val) {
++; CHECK-LABEL: f2:
++; CHECK: vlgvb %r2, %v24, 15
++; CHECK: br %r14
++  %ret = extractelement <16 x i8> %val, i32 15
++  ret i8 %ret
++}
++
++; Test v16i8 extraction of an absurd element number. This must compile,
++; but we don't care what it does.
++define i8 @f3(<16 x i8> %val) {
++; CHECK-LABEL: f3:
++; CHECK-NOT: vlgvb %r2, %v24, 100000
++; CHECK: br %r14
++  %ret = extractelement <16 x i8> %val, i32 100000
++  ret i8 %ret
++}
++
++; Test v16i8 extraction of a variable element.
++define i8 @f4(<16 x i8> %val, i32 %index) {
++; CHECK-LABEL: f4:
++; CHECK: vlgvb %r2, %v24, 0(%r2)
++; CHECK: br %r14
++  %ret = extractelement <16 x i8> %val, i32 %index
++  ret i8 %ret
++}
++
++; Test v8i16 extraction of the first element.
++define i16 @f5(<8 x i16> %val) {
++; CHECK-LABEL: f5:
++; CHECK: vlgvh %r2, %v24, 0
++; CHECK: br %r14
++  %ret = extractelement <8 x i16> %val, i32 0
++  ret i16 %ret
++}
++
++; Test v8i16 extraction of the last element.
++define i16 @f6(<8 x i16> %val) {
++; CHECK-LABEL: f6:
++; CHECK: vlgvh %r2, %v24, 7
++; CHECK: br %r14
++  %ret = extractelement <8 x i16> %val, i32 7
++  ret i16 %ret
++}
++
++; Test v8i16 extraction of an absurd element number. This must compile,
++; but we don't care what it does.
++define i16 @f7(<8 x i16> %val) {
++; CHECK-LABEL: f7:
++; CHECK-NOT: vlgvh %r2, %v24, 100000
++; CHECK: br %r14
++  %ret = extractelement <8 x i16> %val, i32 100000
++  ret i16 %ret
++}
++
++; Test v8i16 extraction of a variable element.
++define i16 @f8(<8 x i16> %val, i32 %index) {
++; CHECK-LABEL: f8:
++; CHECK: vlgvh %r2, %v24, 0(%r2)
++; CHECK: br %r14
++  %ret = extractelement <8 x i16> %val, i32 %index
++  ret i16 %ret
++}
++
++; Test v4i32 extraction of the first element.
++define i32 @f9(<4 x i32> %val) {
++; CHECK-LABEL: f9:
++; CHECK: vlgvf %r2, %v24, 0
++; CHECK: br %r14
++  %ret = extractelement <4 x i32> %val, i32 0
++  ret i32 %ret
++}
++
++; Test v4i32 extraction of the last element.
++define i32 @f10(<4 x i32> %val) {
++; CHECK-LABEL: f10:
++; CHECK: vlgvf %r2, %v24, 3
++; CHECK: br %r14
++  %ret = extractelement <4 x i32> %val, i32 3
++  ret i32 %ret
++}
++
++; Test v4i32 extraction of an absurd element number. This must compile,
++; but we don't care what it does.
++define i32 @f11(<4 x i32> %val) {
++; CHECK-LABEL: f11:
++; CHECK-NOT: vlgvf %r2, %v24, 100000
++; CHECK: br %r14
++  %ret = extractelement <4 x i32> %val, i32 100000
++  ret i32 %ret
++}
++
++; Test v4i32 extraction of a variable element.
++define i32 @f12(<4 x i32> %val, i32 %index) {
++; CHECK-LABEL: f12:
++; CHECK: vlgvf %r2, %v24, 0(%r2)
++; CHECK: br %r14
++  %ret = extractelement <4 x i32> %val, i32 %index
++  ret i32 %ret
++}
++
++; Test v2i64 extraction of the first element.
++define i64 @f13(<2 x i64> %val) {
++; CHECK-LABEL: f13:
++; CHECK: vlgvg %r2, %v24, 0
++; CHECK: br %r14
++  %ret = extractelement <2 x i64> %val, i32 0
++  ret i64 %ret
++}
++
++; Test v2i64 extraction of the last element.
++define i64 @f14(<2 x i64> %val) {
++; CHECK-LABEL: f14:
++; CHECK: vlgvg %r2, %v24, 1
++; CHECK: br %r14
++  %ret = extractelement <2 x i64> %val, i32 1
++  ret i64 %ret
++}
++
++; Test v2i64 extraction of an absurd element number. This must compile,
++; but we don't care what it does.
++define i64 @f15(<2 x i64> %val) {
++; CHECK-LABEL: f15:
++; CHECK-NOT: vlgvg %r2, %v24, 100000
++; CHECK: br %r14
++  %ret = extractelement <2 x i64> %val, i32 100000
++  ret i64 %ret
++}
++
++; Test v2i64 extraction of a variable element.
++define i64 @f16(<2 x i64> %val, i32 %index) {
++; CHECK-LABEL: f16:
++; CHECK: vlgvg %r2, %v24, 0(%r2)
++; CHECK: br %r14
++  %ret = extractelement <2 x i64> %val, i32 %index
++  ret i64 %ret
++}
++
++; Test v4f32 extraction of element 0.
++define float @f17(<4 x float> %val) {
++; CHECK-LABEL: f17:
++; CHECK: vlr %v0, %v24
++; CHECK: br %r14
++  %ret = extractelement <4 x float> %val, i32 0
++  ret float %ret
++}
++
++; Test v4f32 extraction of element 1.
++define float @f18(<4 x float> %val) {
++; CHECK-LABEL: f18:
++; CHECK: vrepf %v0, %v24, 1
++; CHECK: br %r14
++  %ret = extractelement <4 x float> %val, i32 1
++  ret float %ret
++}
++
++; Test v4f32 extraction of element 2.
++define float @f19(<4 x float> %val) {
++; CHECK-LABEL: f19:
++; CHECK: vrepf %v0, %v24, 2
++; CHECK: br %r14
++  %ret = extractelement <4 x float> %val, i32 2
++  ret float %ret
++}
++
++; Test v4f32 extraction of element 3.
++define float @f20(<4 x float> %val) {
++; CHECK-LABEL: f20:
++; CHECK: vrepf %v0, %v24, 3
++; CHECK: br %r14
++  %ret = extractelement <4 x float> %val, i32 3
++  ret float %ret
++}
++
++; Test v4f32 extraction of an absurd element number. This must compile,
++; but we don't care what it does.
++define float @f21(<4 x float> %val) {
++  %ret = extractelement <4 x float> %val, i32 100000
++  ret float %ret
++}
++
++; Test v4f32 extraction of a variable element.
++define float @f22(<4 x float> %val, i32 %index) {
++; CHECK-LABEL: f22:
++; CHECK: vlgvf [[REG:%r[0-5]]], %v24, 0(%r2)
++; CHECK: vlvgf %v0, [[REG]], 0
++; CHECK: br %r14
++  %ret = extractelement <4 x float> %val, i32 %index
++  ret float %ret
++}
++
++; Test v2f64 extraction of the first element.
++define double @f23(<2 x double> %val) {
++; CHECK-LABEL: f23:
++; CHECK: vlr %v0, %v24
++; CHECK: br %r14
++  %ret = extractelement <2 x double> %val, i32 0
++  ret double %ret
++}
++
++; Test v2f64 extraction of the last element.
++define double @f24(<2 x double> %val) {
++; CHECK-LABEL: f24:
++; CHECK: vrepg %v0, %v24, 1
++; CHECK: br %r14
++  %ret = extractelement <2 x double> %val, i32 1
++  ret double %ret
++}
++
++; Test v2f64 extraction of an absurd element number. This must compile,
++; but we don't care what it does.
++define double @f25(<2 x double> %val) { ++ %ret = extractelement <2 x double> %val, i32 100000 ++ ret double %ret ++} ++ ++; Test v2f64 extraction of a variable element. ++define double @f26(<2 x double> %val, i32 %index) { ++; CHECK-LABEL: f26: ++; CHECK: vlgvg [[REG:%r[0-5]]], %v24, 0(%r2) ++; CHECK: ldgr %f0, [[REG]] ++; CHECK: br %r14 ++ %ret = extractelement <2 x double> %val, i32 %index ++ ret double %ret ++} ++ ++; Test v16i8 extraction of a variable element with an offset. ++define i8 @f27(<16 x i8> %val, i32 %index) { ++; CHECK-LABEL: f27: ++; CHECK: vlgvb %r2, %v24, 1(%r2) ++; CHECK: br %r14 ++ %add = add i32 %index, 1 ++ %ret = extractelement <16 x i8> %val, i32 %add ++ ret i8 %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-06.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-06.ll +@@ -0,0 +1,13 @@ ++; Test vector builds using VLVGP. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test the basic v2i64 usage. ++define <2 x i64> @f1(i64 %a, i64 %b) { ++; CHECK-LABEL: f1: ++; CHECK: vlvgp %v24, %r2, %r3 ++; CHECK: br %r14 ++ %veca = insertelement <2 x i64> undef, i64 %a, i32 0 ++ %vecb = insertelement <2 x i64> %veca, i64 %b, i32 1 ++ ret <2 x i64> %vecb ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-07.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-07.ll +@@ -0,0 +1,57 @@ ++; Test scalar_to_vector expansion. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test v16i8. ++define <16 x i8> @f1(i8 %val) { ++; CHECK-LABEL: f1: ++; CHECK: vlvgb %v24, %r2, 0 ++; CHECK: br %r14 ++ %ret = insertelement <16 x i8> undef, i8 %val, i32 0 ++ ret <16 x i8> %ret ++} ++ ++; Test v8i16. ++define <8 x i16> @f2(i16 %val) { ++; CHECK-LABEL: f2: ++; CHECK: vlvgh %v24, %r2, 0 ++; CHECK: br %r14 ++ %ret = insertelement <8 x i16> undef, i16 %val, i32 0 ++ ret <8 x i16> %ret ++} ++ ++; Test v4i32. ++define <4 x i32> @f3(i32 %val) { ++; CHECK-LABEL: f3: ++; CHECK: vlvgf %v24, %r2, 0 ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> undef, i32 %val, i32 0 ++ ret <4 x i32> %ret ++} ++ ++; Test v2i64. Here we load %val into both halves. ++define <2 x i64> @f4(i64 %val) { ++; CHECK-LABEL: f4: ++; CHECK: vlvgp %v24, %r2, %r2 ++; CHECK: br %r14 ++ %ret = insertelement <2 x i64> undef, i64 %val, i32 0 ++ ret <2 x i64> %ret ++} ++ ++; Test v4f32, which is just a move. ++define <4 x float> @f5(float %val) { ++; CHECK-LABEL: f5: ++; CHECK: vlr %v24, %v0 ++; CHECK: br %r14 ++ %ret = insertelement <4 x float> undef, float %val, i32 0 ++ ret <4 x float> %ret ++} ++ ++; Likewise v2f64. ++define <2 x double> @f6(double %val) { ++; CHECK-LABEL: f6: ++; CHECK: vlr %v24, %v0 ++; CHECK: br %r14 ++ %ret = insertelement <2 x double> undef, double %val, i32 0 ++ ret <2 x double> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-08.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-08.ll +@@ -0,0 +1,444 @@ ++; Test vector insertion of memory values. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test v16i8 insertion into the first element. 
++define <16 x i8> @f1(<16 x i8> %val, i8 *%ptr) {
++; CHECK-LABEL: f1:
++; CHECK: vleb %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = load i8 *%ptr
++  %ret = insertelement <16 x i8> %val, i8 %element, i32 0
++  ret <16 x i8> %ret
++}
++
++; Test v16i8 insertion into the last element.
++define <16 x i8> @f2(<16 x i8> %val, i8 *%ptr) {
++; CHECK-LABEL: f2:
++; CHECK: vleb %v24, 0(%r2), 15
++; CHECK: br %r14
++  %element = load i8 *%ptr
++  %ret = insertelement <16 x i8> %val, i8 %element, i32 15
++  ret <16 x i8> %ret
++}
++
++; Test v16i8 insertion with the highest in-range offset.
++define <16 x i8> @f3(<16 x i8> %val, i8 *%base) {
++; CHECK-LABEL: f3:
++; CHECK: vleb %v24, 4095(%r2), 10
++; CHECK: br %r14
++  %ptr = getelementptr i8 *%base, i32 4095
++  %element = load i8 *%ptr
++  %ret = insertelement <16 x i8> %val, i8 %element, i32 10
++  ret <16 x i8> %ret
++}
++
++; Test v16i8 insertion with the first out-of-range offset.
++define <16 x i8> @f4(<16 x i8> %val, i8 *%base) {
++; CHECK-LABEL: f4:
++; CHECK: aghi %r2, 4096
++; CHECK: vleb %v24, 0(%r2), 5
++; CHECK: br %r14
++  %ptr = getelementptr i8 *%base, i32 4096
++  %element = load i8 *%ptr
++  %ret = insertelement <16 x i8> %val, i8 %element, i32 5
++  ret <16 x i8> %ret
++}
++
++; Test v16i8 insertion into a variable element.
++define <16 x i8> @f5(<16 x i8> %val, i8 *%ptr, i32 %index) {
++; CHECK-LABEL: f5:
++; CHECK-NOT: vleb
++; CHECK: br %r14
++  %element = load i8 *%ptr
++  %ret = insertelement <16 x i8> %val, i8 %element, i32 %index
++  ret <16 x i8> %ret
++}
++
++; Test v8i16 insertion into the first element.
++define <8 x i16> @f6(<8 x i16> %val, i16 *%ptr) {
++; CHECK-LABEL: f6:
++; CHECK: vleh %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = load i16 *%ptr
++  %ret = insertelement <8 x i16> %val, i16 %element, i32 0
++  ret <8 x i16> %ret
++}
++
++; Test v8i16 insertion into the last element.
++define <8 x i16> @f7(<8 x i16> %val, i16 *%ptr) {
++; CHECK-LABEL: f7:
++; CHECK: vleh %v24, 0(%r2), 7
++; CHECK: br %r14
++  %element = load i16 *%ptr
++  %ret = insertelement <8 x i16> %val, i16 %element, i32 7
++  ret <8 x i16> %ret
++}
++
++; Test v8i16 insertion with the highest in-range offset.
++define <8 x i16> @f8(<8 x i16> %val, i16 *%base) {
++; CHECK-LABEL: f8:
++; CHECK: vleh %v24, 4094(%r2), 5
++; CHECK: br %r14
++  %ptr = getelementptr i16 *%base, i32 2047
++  %element = load i16 *%ptr
++  %ret = insertelement <8 x i16> %val, i16 %element, i32 5
++  ret <8 x i16> %ret
++}
++
++; Test v8i16 insertion with the first out-of-range offset.
++define <8 x i16> @f9(<8 x i16> %val, i16 *%base) {
++; CHECK-LABEL: f9:
++; CHECK: aghi %r2, 4096
++; CHECK: vleh %v24, 0(%r2), 1
++; CHECK: br %r14
++  %ptr = getelementptr i16 *%base, i32 2048
++  %element = load i16 *%ptr
++  %ret = insertelement <8 x i16> %val, i16 %element, i32 1
++  ret <8 x i16> %ret
++}
++
++; Test v8i16 insertion into a variable element.
++define <8 x i16> @f10(<8 x i16> %val, i16 *%ptr, i32 %index) {
++; CHECK-LABEL: f10:
++; CHECK-NOT: vleh
++; CHECK: br %r14
++  %element = load i16 *%ptr
++  %ret = insertelement <8 x i16> %val, i16 %element, i32 %index
++  ret <8 x i16> %ret
++}
++
++; Test v4i32 insertion into the first element.
++define <4 x i32> @f11(<4 x i32> %val, i32 *%ptr) {
++; CHECK-LABEL: f11:
++; CHECK: vlef %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = load i32 *%ptr
++  %ret = insertelement <4 x i32> %val, i32 %element, i32 0
++  ret <4 x i32> %ret
++}
++
++; Test v4i32 insertion into the last element.
++define <4 x i32> @f12(<4 x i32> %val, i32 *%ptr) {
++; CHECK-LABEL: f12:
++; CHECK: vlef %v24, 0(%r2), 3
++; CHECK: br %r14
++  %element = load i32 *%ptr
++  %ret = insertelement <4 x i32> %val, i32 %element, i32 3
++  ret <4 x i32> %ret
++}
++
++; Test v4i32 insertion with the highest in-range offset.
++define <4 x i32> @f13(<4 x i32> %val, i32 *%base) {
++; CHECK-LABEL: f13:
++; CHECK: vlef %v24, 4092(%r2), 2
++; CHECK: br %r14
++  %ptr = getelementptr i32 *%base, i32 1023
++  %element = load i32 *%ptr
++  %ret = insertelement <4 x i32> %val, i32 %element, i32 2
++  ret <4 x i32> %ret
++}
++
++; Test v4i32 insertion with the first out-of-range offset.
++define <4 x i32> @f14(<4 x i32> %val, i32 *%base) {
++; CHECK-LABEL: f14:
++; CHECK: aghi %r2, 4096
++; CHECK: vlef %v24, 0(%r2), 1
++; CHECK: br %r14
++  %ptr = getelementptr i32 *%base, i32 1024
++  %element = load i32 *%ptr
++  %ret = insertelement <4 x i32> %val, i32 %element, i32 1
++  ret <4 x i32> %ret
++}
++
++; Test v4i32 insertion into a variable element.
++define <4 x i32> @f15(<4 x i32> %val, i32 *%ptr, i32 %index) {
++; CHECK-LABEL: f15:
++; CHECK-NOT: vlef
++; CHECK: br %r14
++  %element = load i32 *%ptr
++  %ret = insertelement <4 x i32> %val, i32 %element, i32 %index
++  ret <4 x i32> %ret
++}
++
++; Test v2i64 insertion into the first element.
++define <2 x i64> @f16(<2 x i64> %val, i64 *%ptr) {
++; CHECK-LABEL: f16:
++; CHECK: vleg %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = load i64 *%ptr
++  %ret = insertelement <2 x i64> %val, i64 %element, i32 0
++  ret <2 x i64> %ret
++}
++
++; Test v2i64 insertion into the last element.
++define <2 x i64> @f17(<2 x i64> %val, i64 *%ptr) {
++; CHECK-LABEL: f17:
++; CHECK: vleg %v24, 0(%r2), 1
++; CHECK: br %r14
++  %element = load i64 *%ptr
++  %ret = insertelement <2 x i64> %val, i64 %element, i32 1
++  ret <2 x i64> %ret
++}
++
++; Test v2i64 insertion with the highest in-range offset.
++define <2 x i64> @f18(<2 x i64> %val, i64 *%base) {
++; CHECK-LABEL: f18:
++; CHECK: vleg %v24, 4088(%r2), 1
++; CHECK: br %r14
++  %ptr = getelementptr i64 *%base, i32 511
++  %element = load i64 *%ptr
++  %ret = insertelement <2 x i64> %val, i64 %element, i32 1
++  ret <2 x i64> %ret
++}
++
++; Test v2i64 insertion with the first out-of-range offset.
++define <2 x i64> @f19(<2 x i64> %val, i64 *%base) {
++; CHECK-LABEL: f19:
++; CHECK: aghi %r2, 4096
++; CHECK: vleg %v24, 0(%r2), 0
++; CHECK: br %r14
++  %ptr = getelementptr i64 *%base, i32 512
++  %element = load i64 *%ptr
++  %ret = insertelement <2 x i64> %val, i64 %element, i32 0
++  ret <2 x i64> %ret
++}
++
++; Test v2i64 insertion into a variable element.
++define <2 x i64> @f20(<2 x i64> %val, i64 *%ptr, i32 %index) {
++; CHECK-LABEL: f20:
++; CHECK-NOT: vleg
++; CHECK: br %r14
++  %element = load i64 *%ptr
++  %ret = insertelement <2 x i64> %val, i64 %element, i32 %index
++  ret <2 x i64> %ret
++}
++
++; Test v4f32 insertion into the first element.
++define <4 x float> @f21(<4 x float> %val, float *%ptr) {
++; CHECK-LABEL: f21:
++; CHECK: vlef %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = load float *%ptr
++  %ret = insertelement <4 x float> %val, float %element, i32 0
++  ret <4 x float> %ret
++}
++
++; Test v4f32 insertion into the last element.
++define <4 x float> @f22(<4 x float> %val, float *%ptr) {
++; CHECK-LABEL: f22:
++; CHECK: vlef %v24, 0(%r2), 3
++; CHECK: br %r14
++  %element = load float *%ptr
++  %ret = insertelement <4 x float> %val, float %element, i32 3
++  ret <4 x float> %ret
++}
++
++; Test v4f32 insertion with the highest in-range offset.
++define <4 x float> @f23(<4 x float> %val, float *%base) {
++; CHECK-LABEL: f23:
++; CHECK: vlef %v24, 4092(%r2), 2
++; CHECK: br %r14
++  %ptr = getelementptr float *%base, i32 1023
++  %element = load float *%ptr
++  %ret = insertelement <4 x float> %val, float %element, i32 2
++  ret <4 x float> %ret
++}
++
++; Test v4f32 insertion with the first out-of-range offset.
++define <4 x float> @f24(<4 x float> %val, float *%base) {
++; CHECK-LABEL: f24:
++; CHECK: aghi %r2, 4096
++; CHECK: vlef %v24, 0(%r2), 1
++; CHECK: br %r14
++  %ptr = getelementptr float *%base, i32 1024
++  %element = load float *%ptr
++  %ret = insertelement <4 x float> %val, float %element, i32 1
++  ret <4 x float> %ret
++}
++
++; Test v4f32 insertion into a variable element.
++define <4 x float> @f25(<4 x float> %val, float *%ptr, i32 %index) {
++; CHECK-LABEL: f25:
++; CHECK-NOT: vlef
++; CHECK: br %r14
++  %element = load float *%ptr
++  %ret = insertelement <4 x float> %val, float %element, i32 %index
++  ret <4 x float> %ret
++}
++
++; Test v2f64 insertion into the first element.
++define <2 x double> @f26(<2 x double> %val, double *%ptr) {
++; CHECK-LABEL: f26:
++; CHECK: vleg %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = load double *%ptr
++  %ret = insertelement <2 x double> %val, double %element, i32 0
++  ret <2 x double> %ret
++}
++
++; Test v2f64 insertion into the last element.
++define <2 x double> @f27(<2 x double> %val, double *%ptr) {
++; CHECK-LABEL: f27:
++; CHECK: vleg %v24, 0(%r2), 1
++; CHECK: br %r14
++  %element = load double *%ptr
++  %ret = insertelement <2 x double> %val, double %element, i32 1
++  ret <2 x double> %ret
++}
++
++; Test v2f64 insertion with the highest in-range offset.
++define <2 x double> @f28(<2 x double> %val, double *%base) {
++; CHECK-LABEL: f28:
++; CHECK: vleg %v24, 4088(%r2), 1
++; CHECK: br %r14
++  %ptr = getelementptr double *%base, i32 511
++  %element = load double *%ptr
++  %ret = insertelement <2 x double> %val, double %element, i32 1
++  ret <2 x double> %ret
++}
++
++; Test v2f64 insertion with the first out-of-range offset.
++define <2 x double> @f29(<2 x double> %val, double *%base) {
++; CHECK-LABEL: f29:
++; CHECK: aghi %r2, 4096
++; CHECK: vleg %v24, 0(%r2), 0
++; CHECK: br %r14
++  %ptr = getelementptr double *%base, i32 512
++  %element = load double *%ptr
++  %ret = insertelement <2 x double> %val, double %element, i32 0
++  ret <2 x double> %ret
++}
++
++; Test v2f64 insertion into a variable element.
++define <2 x double> @f30(<2 x double> %val, double *%ptr, i32 %index) {
++; CHECK-LABEL: f30:
++; CHECK-NOT: vleg
++; CHECK: br %r14
++  %element = load double *%ptr
++  %ret = insertelement <2 x double> %val, double %element, i32 %index
++  ret <2 x double> %ret
++}
++
++; Test a v4i32 gather of the first element.
++define <4 x i32> @f31(<4 x i32> %val, <4 x i32> %index, i64 %base) {
++; CHECK-LABEL: f31:
++; CHECK: vgef %v24, 0(%v26,%r2), 0
++; CHECK: br %r14
++  %elem = extractelement <4 x i32> %index, i32 0
++  %ext = zext i32 %elem to i64
++  %add = add i64 %base, %ext
++  %ptr = inttoptr i64 %add to i32 *
++  %element = load i32 *%ptr
++  %ret = insertelement <4 x i32> %val, i32 %element, i32 0
++  ret <4 x i32> %ret
++}
++
++; Test a v4i32 gather of the last element.
++define <4 x i32> @f32(<4 x i32> %val, <4 x i32> %index, i64 %base) { ++; CHECK-LABEL: f32: ++; CHECK: vgef %v24, 0(%v26,%r2), 3 ++; CHECK: br %r14 ++ %elem = extractelement <4 x i32> %index, i32 3 ++ %ext = zext i32 %elem to i64 ++ %add = add i64 %base, %ext ++ %ptr = inttoptr i64 %add to i32 * ++ %element = load i32 *%ptr ++ %ret = insertelement <4 x i32> %val, i32 %element, i32 3 ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i32 gather with the highest in-range offset. ++define <4 x i32> @f33(<4 x i32> %val, <4 x i32> %index, i64 %base) { ++; CHECK-LABEL: f33: ++; CHECK: vgef %v24, 4095(%v26,%r2), 1 ++; CHECK: br %r14 ++ %elem = extractelement <4 x i32> %index, i32 1 ++ %ext = zext i32 %elem to i64 ++ %add1 = add i64 %base, %ext ++ %add2 = add i64 %add1, 4095 ++ %ptr = inttoptr i64 %add2 to i32 * ++ %element = load i32 *%ptr ++ %ret = insertelement <4 x i32> %val, i32 %element, i32 1 ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 gather of the first element. ++define <2 x i64> @f34(<2 x i64> %val, <2 x i64> %index, i64 %base) { ++; CHECK-LABEL: f34: ++; CHECK: vgeg %v24, 0(%v26,%r2), 0 ++; CHECK: br %r14 ++ %elem = extractelement <2 x i64> %index, i32 0 ++ %add = add i64 %base, %elem ++ %ptr = inttoptr i64 %add to i64 * ++ %element = load i64 *%ptr ++ %ret = insertelement <2 x i64> %val, i64 %element, i32 0 ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i64 gather of the last element. ++define <2 x i64> @f35(<2 x i64> %val, <2 x i64> %index, i64 %base) { ++; CHECK-LABEL: f35: ++; CHECK: vgeg %v24, 0(%v26,%r2), 1 ++; CHECK: br %r14 ++ %elem = extractelement <2 x i64> %index, i32 1 ++ %add = add i64 %base, %elem ++ %ptr = inttoptr i64 %add to i64 * ++ %element = load i64 *%ptr ++ %ret = insertelement <2 x i64> %val, i64 %element, i32 1 ++ ret <2 x i64> %ret ++} ++ ++; Test a v4f32 gather of the first element. ++define <4 x float> @f36(<4 x float> %val, <4 x i32> %index, i64 %base) { ++; CHECK-LABEL: f36: ++; CHECK: vgef %v24, 0(%v26,%r2), 0 ++; CHECK: br %r14 ++ %elem = extractelement <4 x i32> %index, i32 0 ++ %ext = zext i32 %elem to i64 ++ %add = add i64 %base, %ext ++ %ptr = inttoptr i64 %add to float * ++ %element = load float *%ptr ++ %ret = insertelement <4 x float> %val, float %element, i32 0 ++ ret <4 x float> %ret ++} ++ ++; Test a v4f32 gather of the last element. ++define <4 x float> @f37(<4 x float> %val, <4 x i32> %index, i64 %base) { ++; CHECK-LABEL: f37: ++; CHECK: vgef %v24, 0(%v26,%r2), 3 ++; CHECK: br %r14 ++ %elem = extractelement <4 x i32> %index, i32 3 ++ %ext = zext i32 %elem to i64 ++ %add = add i64 %base, %ext ++ %ptr = inttoptr i64 %add to float * ++ %element = load float *%ptr ++ %ret = insertelement <4 x float> %val, float %element, i32 3 ++ ret <4 x float> %ret ++} ++ ++; Test a v2f64 gather of the first element. ++define <2 x double> @f38(<2 x double> %val, <2 x i64> %index, i64 %base) { ++; CHECK-LABEL: f38: ++; CHECK: vgeg %v24, 0(%v26,%r2), 0 ++; CHECK: br %r14 ++ %elem = extractelement <2 x i64> %index, i32 0 ++ %add = add i64 %base, %elem ++ %ptr = inttoptr i64 %add to double * ++ %element = load double *%ptr ++ %ret = insertelement <2 x double> %val, double %element, i32 0 ++ ret <2 x double> %ret ++} ++ ++; Test a v2f64 gather of the last element. 
++define <2 x double> @f39(<2 x double> %val, <2 x i64> %index, i64 %base) { ++; CHECK-LABEL: f39: ++; CHECK: vgeg %v24, 0(%v26,%r2), 1 ++; CHECK: br %r14 ++ %elem = extractelement <2 x i64> %index, i32 1 ++ %add = add i64 %base, %elem ++ %ptr = inttoptr i64 %add to double * ++ %element = load double *%ptr ++ %ret = insertelement <2 x double> %val, double %element, i32 1 ++ ret <2 x double> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-09.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-09.ll +@@ -0,0 +1,291 @@ ++; Test vector insertion of constants. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test v16i8 insertion into the first element. ++define <16 x i8> @f1(<16 x i8> %val) { ++; CHECK-LABEL: f1: ++; CHECK: vleib %v24, 0, 0 ++; CHECK: br %r14 ++ %ret = insertelement <16 x i8> %val, i8 0, i32 0 ++ ret <16 x i8> %ret ++} ++ ++; Test v16i8 insertion into the last element. ++define <16 x i8> @f2(<16 x i8> %val) { ++; CHECK-LABEL: f2: ++; CHECK: vleib %v24, 100, 15 ++; CHECK: br %r14 ++ %ret = insertelement <16 x i8> %val, i8 100, i32 15 ++ ret <16 x i8> %ret ++} ++ ++; Test v16i8 insertion with the maximum signed value. ++define <16 x i8> @f3(<16 x i8> %val) { ++; CHECK-LABEL: f3: ++; CHECK: vleib %v24, 127, 10 ++; CHECK: br %r14 ++ %ret = insertelement <16 x i8> %val, i8 127, i32 10 ++ ret <16 x i8> %ret ++} ++ ++; Test v16i8 insertion with the minimum signed value. ++define <16 x i8> @f4(<16 x i8> %val) { ++; CHECK-LABEL: f4: ++; CHECK: vleib %v24, -128, 11 ++; CHECK: br %r14 ++ %ret = insertelement <16 x i8> %val, i8 128, i32 11 ++ ret <16 x i8> %ret ++} ++ ++; Test v16i8 insertion with the maximum unsigned value. ++define <16 x i8> @f5(<16 x i8> %val) { ++; CHECK-LABEL: f5: ++; CHECK: vleib %v24, -1, 12 ++; CHECK: br %r14 ++ %ret = insertelement <16 x i8> %val, i8 255, i32 12 ++ ret <16 x i8> %ret ++} ++ ++; Test v16i8 insertion into a variable element. ++define <16 x i8> @f6(<16 x i8> %val, i32 %index) { ++; CHECK-LABEL: f6: ++; CHECK-NOT: vleib ++; CHECK: br %r14 ++ %ret = insertelement <16 x i8> %val, i8 0, i32 %index ++ ret <16 x i8> %ret ++} ++ ++; Test v8i16 insertion into the first element. ++define <8 x i16> @f7(<8 x i16> %val) { ++; CHECK-LABEL: f7: ++; CHECK: vleih %v24, 0, 0 ++; CHECK: br %r14 ++ %ret = insertelement <8 x i16> %val, i16 0, i32 0 ++ ret <8 x i16> %ret ++} ++ ++; Test v8i16 insertion into the last element. ++define <8 x i16> @f8(<8 x i16> %val) { ++; CHECK-LABEL: f8: ++; CHECK: vleih %v24, 0, 7 ++; CHECK: br %r14 ++ %ret = insertelement <8 x i16> %val, i16 0, i32 7 ++ ret <8 x i16> %ret ++} ++ ++; Test v8i16 insertion with the maximum signed value. ++define <8 x i16> @f9(<8 x i16> %val) { ++; CHECK-LABEL: f9: ++; CHECK: vleih %v24, 32767, 4 ++; CHECK: br %r14 ++ %ret = insertelement <8 x i16> %val, i16 32767, i32 4 ++ ret <8 x i16> %ret ++} ++ ++; Test v8i16 insertion with the minimum signed value. ++define <8 x i16> @f10(<8 x i16> %val) { ++; CHECK-LABEL: f10: ++; CHECK: vleih %v24, -32768, 5 ++; CHECK: br %r14 ++ %ret = insertelement <8 x i16> %val, i16 32768, i32 5 ++ ret <8 x i16> %ret ++} ++ ++; Test v8i16 insertion with the maximum unsigned value. ++define <8 x i16> @f11(<8 x i16> %val) { ++; CHECK-LABEL: f11: ++; CHECK: vleih %v24, -1, 6 ++; CHECK: br %r14 ++ %ret = insertelement <8 x i16> %val, i16 65535, i32 6 ++ ret <8 x i16> %ret ++} ++ ++; Test v8i16 insertion into a variable element. 
++define <8 x i16> @f12(<8 x i16> %val, i32 %index) { ++; CHECK-LABEL: f12: ++; CHECK-NOT: vleih ++; CHECK: br %r14 ++ %ret = insertelement <8 x i16> %val, i16 0, i32 %index ++ ret <8 x i16> %ret ++} ++ ++; Test v4i32 insertion into the first element. ++define <4 x i32> @f13(<4 x i32> %val) { ++; CHECK-LABEL: f13: ++; CHECK: vleif %v24, 0, 0 ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> %val, i32 0, i32 0 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion into the last element. ++define <4 x i32> @f14(<4 x i32> %val) { ++; CHECK-LABEL: f14: ++; CHECK: vleif %v24, 0, 3 ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> %val, i32 0, i32 3 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion with the maximum value allowed by VLEIF. ++define <4 x i32> @f15(<4 x i32> %val) { ++; CHECK-LABEL: f15: ++; CHECK: vleif %v24, 32767, 1 ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> %val, i32 32767, i32 1 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion with the next value up. ++define <4 x i32> @f16(<4 x i32> %val) { ++; CHECK-LABEL: f16: ++; CHECK-NOT: vleif ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> %val, i32 32768, i32 1 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion with the minimum value allowed by VLEIF. ++define <4 x i32> @f17(<4 x i32> %val) { ++; CHECK-LABEL: f17: ++; CHECK: vleif %v24, -32768, 2 ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> %val, i32 -32768, i32 2 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion with the next value down. ++define <4 x i32> @f18(<4 x i32> %val) { ++; CHECK-LABEL: f18: ++; CHECK-NOT: vleif ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> %val, i32 -32769, i32 2 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion into a variable element. ++define <4 x i32> @f19(<4 x i32> %val, i32 %index) { ++; CHECK-LABEL: f19: ++; CHECK-NOT: vleif ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> %val, i32 0, i32 %index ++ ret <4 x i32> %ret ++} ++ ++; Test v2i64 insertion into the first element. ++define <2 x i64> @f20(<2 x i64> %val) { ++; CHECK-LABEL: f20: ++; CHECK: vleig %v24, 0, 0 ++; CHECK: br %r14 ++ %ret = insertelement <2 x i64> %val, i64 0, i32 0 ++ ret <2 x i64> %ret ++} ++ ++; Test v2i64 insertion into the last element. ++define <2 x i64> @f21(<2 x i64> %val) { ++; CHECK-LABEL: f21: ++; CHECK: vleig %v24, 0, 1 ++; CHECK: br %r14 ++ %ret = insertelement <2 x i64> %val, i64 0, i32 1 ++ ret <2 x i64> %ret ++} ++ ++; Test v2i64 insertion with the maximum value allowed by VLEIG. ++define <2 x i64> @f22(<2 x i64> %val) { ++; CHECK-LABEL: f22: ++; CHECK: vleig %v24, 32767, 1 ++; CHECK: br %r14 ++ %ret = insertelement <2 x i64> %val, i64 32767, i32 1 ++ ret <2 x i64> %ret ++} ++ ++; Test v2i64 insertion with the next value up. ++define <2 x i64> @f23(<2 x i64> %val) { ++; CHECK-LABEL: f23: ++; CHECK-NOT: vleig ++; CHECK: br %r14 ++ %ret = insertelement <2 x i64> %val, i64 32768, i32 1 ++ ret <2 x i64> %ret ++} ++ ++; Test v2i64 insertion with the minimum value allowed by VLEIG. ++define <2 x i64> @f24(<2 x i64> %val) { ++; CHECK-LABEL: f24: ++; CHECK: vleig %v24, -32768, 0 ++; CHECK: br %r14 ++ %ret = insertelement <2 x i64> %val, i64 -32768, i32 0 ++ ret <2 x i64> %ret ++} ++ ++; Test v2i64 insertion with the next value down. ++define <2 x i64> @f25(<2 x i64> %val) { ++; CHECK-LABEL: f25: ++; CHECK-NOT: vleig ++; CHECK: br %r14 ++ %ret = insertelement <2 x i64> %val, i64 -32769, i32 0 ++ ret <2 x i64> %ret ++} ++ ++; Test v2i64 insertion into a variable element. 
++define <2 x i64> @f26(<2 x i64> %val, i32 %index) {
++; CHECK-LABEL: f26:
++; CHECK-NOT: vleig
++; CHECK: br %r14
++  %ret = insertelement <2 x i64> %val, i64 0, i32 %index
++  ret <2 x i64> %ret
++}
++
++; Test v4f32 insertion of 0 into the first element.
++define <4 x float> @f27(<4 x float> %val) {
++; CHECK-LABEL: f27:
++; CHECK: vleif %v24, 0, 0
++; CHECK: br %r14
++  %ret = insertelement <4 x float> %val, float 0.0, i32 0
++  ret <4 x float> %ret
++}
++
++; Test v4f32 insertion of 0 into the last element.
++define <4 x float> @f28(<4 x float> %val) {
++; CHECK-LABEL: f28:
++; CHECK: vleif %v24, 0, 3
++; CHECK: br %r14
++  %ret = insertelement <4 x float> %val, float 0.0, i32 3
++  ret <4 x float> %ret
++}
++
++; Test v4f32 insertion of a nonzero value.
++define <4 x float> @f29(<4 x float> %val) {
++; CHECK-LABEL: f29:
++; CHECK-NOT: vleif
++; CHECK: br %r14
++  %ret = insertelement <4 x float> %val, float 1.0, i32 1
++  ret <4 x float> %ret
++}
++
++; Test v2f64 insertion of 0 into the first element.
++define <2 x double> @f30(<2 x double> %val) {
++; CHECK-LABEL: f30:
++; CHECK: vleig %v24, 0, 0
++; CHECK: br %r14
++  %ret = insertelement <2 x double> %val, double 0.0, i32 0
++  ret <2 x double> %ret
++}
++
++; Test v2f64 insertion of 0 into the last element.
++define <2 x double> @f31(<2 x double> %val) {
++; CHECK-LABEL: f31:
++; CHECK: vleig %v24, 0, 1
++; CHECK: br %r14
++  %ret = insertelement <2 x double> %val, double 0.0, i32 1
++  ret <2 x double> %ret
++}
++
++; Test v2f64 insertion of a nonzero value.
++define <2 x double> @f32(<2 x double> %val) {
++; CHECK-LABEL: f32:
++; CHECK-NOT: vleig
++; CHECK: br %r14
++  %ret = insertelement <2 x double> %val, double 1.0, i32 1
++  ret <2 x double> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-move-10.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-move-10.ll
+@@ -0,0 +1,499 @@
++; Test vector extraction to memory.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test v16i8 extraction from the first element.
++define void @f1(<16 x i8> %val, i8 *%ptr) {
++; CHECK-LABEL: f1:
++; CHECK: vsteb %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = extractelement <16 x i8> %val, i32 0
++  store i8 %element, i8 *%ptr
++  ret void
++}
++
++; Test v16i8 extraction from the last element.
++define void @f2(<16 x i8> %val, i8 *%ptr) {
++; CHECK-LABEL: f2:
++; CHECK: vsteb %v24, 0(%r2), 15
++; CHECK: br %r14
++  %element = extractelement <16 x i8> %val, i32 15
++  store i8 %element, i8 *%ptr
++  ret void
++}
++
++; Test v16i8 extraction of an invalid element. This must compile,
++; but we don't care what it does.
++define void @f3(<16 x i8> %val, i8 *%ptr) {
++; CHECK-LABEL: f3:
++; CHECK-NOT: vsteb %v24, 0(%r2), 16
++; CHECK: br %r14
++  %element = extractelement <16 x i8> %val, i32 16
++  store i8 %element, i8 *%ptr
++  ret void
++}
++
++; Test v16i8 extraction with the highest in-range offset.
++define void @f4(<16 x i8> %val, i8 *%base) {
++; CHECK-LABEL: f4:
++; CHECK: vsteb %v24, 4095(%r2), 10
++; CHECK: br %r14
++  %ptr = getelementptr i8 *%base, i32 4095
++  %element = extractelement <16 x i8> %val, i32 10
++  store i8 %element, i8 *%ptr
++  ret void
++}
++
++; Test v16i8 extraction with the first out-of-range offset.
++define void @f5(<16 x i8> %val, i8 *%base) {
++; CHECK-LABEL: f5:
++; CHECK: aghi %r2, 4096
++; CHECK: vsteb %v24, 0(%r2), 5
++; CHECK: br %r14
++  %ptr = getelementptr i8 *%base, i32 4096
++  %element = extractelement <16 x i8> %val, i32 5
++  store i8 %element, i8 *%ptr
++  ret void
++}
++
++; Test v16i8 extraction from a variable element.
++define void @f6(<16 x i8> %val, i8 *%ptr, i32 %index) {
++; CHECK-LABEL: f6:
++; CHECK-NOT: vsteb
++; CHECK: br %r14
++  %element = extractelement <16 x i8> %val, i32 %index
++  store i8 %element, i8 *%ptr
++  ret void
++}
++
++; Test v8i16 extraction from the first element.
++define void @f7(<8 x i16> %val, i16 *%ptr) {
++; CHECK-LABEL: f7:
++; CHECK: vsteh %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = extractelement <8 x i16> %val, i32 0
++  store i16 %element, i16 *%ptr
++  ret void
++}
++
++; Test v8i16 extraction from the last element.
++define void @f8(<8 x i16> %val, i16 *%ptr) {
++; CHECK-LABEL: f8:
++; CHECK: vsteh %v24, 0(%r2), 7
++; CHECK: br %r14
++  %element = extractelement <8 x i16> %val, i32 7
++  store i16 %element, i16 *%ptr
++  ret void
++}
++
++; Test v8i16 extraction of an invalid element. This must compile,
++; but we don't care what it does.
++define void @f9(<8 x i16> %val, i16 *%ptr) {
++; CHECK-LABEL: f9:
++; CHECK-NOT: vsteh %v24, 0(%r2), 8
++; CHECK: br %r14
++  %element = extractelement <8 x i16> %val, i32 8
++  store i16 %element, i16 *%ptr
++  ret void
++}
++
++; Test v8i16 extraction with the highest in-range offset.
++define void @f10(<8 x i16> %val, i16 *%base) {
++; CHECK-LABEL: f10:
++; CHECK: vsteh %v24, 4094(%r2), 5
++; CHECK: br %r14
++  %ptr = getelementptr i16 *%base, i32 2047
++  %element = extractelement <8 x i16> %val, i32 5
++  store i16 %element, i16 *%ptr
++  ret void
++}
++
++; Test v8i16 extraction with the first out-of-range offset.
++define void @f11(<8 x i16> %val, i16 *%base) {
++; CHECK-LABEL: f11:
++; CHECK: aghi %r2, 4096
++; CHECK: vsteh %v24, 0(%r2), 1
++; CHECK: br %r14
++  %ptr = getelementptr i16 *%base, i32 2048
++  %element = extractelement <8 x i16> %val, i32 1
++  store i16 %element, i16 *%ptr
++  ret void
++}
++
++; Test v8i16 extraction from a variable element.
++define void @f12(<8 x i16> %val, i16 *%ptr, i32 %index) {
++; CHECK-LABEL: f12:
++; CHECK-NOT: vsteh
++; CHECK: br %r14
++  %element = extractelement <8 x i16> %val, i32 %index
++  store i16 %element, i16 *%ptr
++  ret void
++}
++
++; Test v4i32 extraction from the first element.
++define void @f13(<4 x i32> %val, i32 *%ptr) {
++; CHECK-LABEL: f13:
++; CHECK: vstef %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = extractelement <4 x i32> %val, i32 0
++  store i32 %element, i32 *%ptr
++  ret void
++}
++
++; Test v4i32 extraction from the last element.
++define void @f14(<4 x i32> %val, i32 *%ptr) {
++; CHECK-LABEL: f14:
++; CHECK: vstef %v24, 0(%r2), 3
++; CHECK: br %r14
++  %element = extractelement <4 x i32> %val, i32 3
++  store i32 %element, i32 *%ptr
++  ret void
++}
++
++; Test v4i32 extraction of an invalid element. This must compile,
++; but we don't care what it does.
++define void @f15(<4 x i32> %val, i32 *%ptr) {
++; CHECK-LABEL: f15:
++; CHECK-NOT: vstef %v24, 0(%r2), 4
++; CHECK: br %r14
++  %element = extractelement <4 x i32> %val, i32 4
++  store i32 %element, i32 *%ptr
++  ret void
++}
++
++; Test v4i32 extraction with the highest in-range offset.
++define void @f16(<4 x i32> %val, i32 *%base) {
++; CHECK-LABEL: f16:
++; CHECK: vstef %v24, 4092(%r2), 2
++; CHECK: br %r14
++  %ptr = getelementptr i32 *%base, i32 1023
++  %element = extractelement <4 x i32> %val, i32 2
++  store i32 %element, i32 *%ptr
++  ret void
++}
++
++; Test v4i32 extraction with the first out-of-range offset.
++define void @f17(<4 x i32> %val, i32 *%base) {
++; CHECK-LABEL: f17:
++; CHECK: aghi %r2, 4096
++; CHECK: vstef %v24, 0(%r2), 1
++; CHECK: br %r14
++  %ptr = getelementptr i32 *%base, i32 1024
++  %element = extractelement <4 x i32> %val, i32 1
++  store i32 %element, i32 *%ptr
++  ret void
++}
++
++; Test v4i32 extraction from a variable element.
++define void @f18(<4 x i32> %val, i32 *%ptr, i32 %index) {
++; CHECK-LABEL: f18:
++; CHECK-NOT: vstef
++; CHECK: br %r14
++  %element = extractelement <4 x i32> %val, i32 %index
++  store i32 %element, i32 *%ptr
++  ret void
++}
++
++; Test v2i64 extraction from the first element.
++define void @f19(<2 x i64> %val, i64 *%ptr) {
++; CHECK-LABEL: f19:
++; CHECK: vsteg %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = extractelement <2 x i64> %val, i32 0
++  store i64 %element, i64 *%ptr
++  ret void
++}
++
++; Test v2i64 extraction from the last element.
++define void @f20(<2 x i64> %val, i64 *%ptr) {
++; CHECK-LABEL: f20:
++; CHECK: vsteg %v24, 0(%r2), 1
++; CHECK: br %r14
++  %element = extractelement <2 x i64> %val, i32 1
++  store i64 %element, i64 *%ptr
++  ret void
++}
++
++; Test v2i64 extraction of an invalid element. This must compile,
++; but we don't care what it does.
++define void @f21(<2 x i64> %val, i64 *%ptr) {
++; CHECK-LABEL: f21:
++; CHECK-NOT: vsteg %v24, 0(%r2), 2
++; CHECK: br %r14
++  %element = extractelement <2 x i64> %val, i32 2
++  store i64 %element, i64 *%ptr
++  ret void
++}
++
++; Test v2i64 extraction with the highest in-range offset.
++define void @f22(<2 x i64> %val, i64 *%base) {
++; CHECK-LABEL: f22:
++; CHECK: vsteg %v24, 4088(%r2), 1
++; CHECK: br %r14
++  %ptr = getelementptr i64 *%base, i32 511
++  %element = extractelement <2 x i64> %val, i32 1
++  store i64 %element, i64 *%ptr
++  ret void
++}
++
++; Test v2i64 extraction with the first out-of-range offset.
++define void @f23(<2 x i64> %val, i64 *%base) {
++; CHECK-LABEL: f23:
++; CHECK: aghi %r2, 4096
++; CHECK: vsteg %v24, 0(%r2), 0
++; CHECK: br %r14
++  %ptr = getelementptr i64 *%base, i32 512
++  %element = extractelement <2 x i64> %val, i32 0
++  store i64 %element, i64 *%ptr
++  ret void
++}
++
++; Test v2i64 extraction from a variable element.
++define void @f24(<2 x i64> %val, i64 *%ptr, i32 %index) {
++; CHECK-LABEL: f24:
++; CHECK-NOT: vsteg
++; CHECK: br %r14
++  %element = extractelement <2 x i64> %val, i32 %index
++  store i64 %element, i64 *%ptr
++  ret void
++}
++
++; Test v4f32 extraction from the first element.
++define void @f25(<4 x float> %val, float *%ptr) {
++; CHECK-LABEL: f25:
++; CHECK: vstef %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = extractelement <4 x float> %val, i32 0
++  store float %element, float *%ptr
++  ret void
++}
++
++; Test v4f32 extraction from the last element.
++define void @f26(<4 x float> %val, float *%ptr) {
++; CHECK-LABEL: f26:
++; CHECK: vstef %v24, 0(%r2), 3
++; CHECK: br %r14
++  %element = extractelement <4 x float> %val, i32 3
++  store float %element, float *%ptr
++  ret void
++}
++
++; Test v4f32 extraction of an invalid element. This must compile,
++; but we don't care what it does.
++define void @f27(<4 x float> %val, float *%ptr) {
++; CHECK-LABEL: f27:
++; CHECK-NOT: vstef %v24, 0(%r2), 4
++; CHECK: br %r14
++  %element = extractelement <4 x float> %val, i32 4
++  store float %element, float *%ptr
++  ret void
++}
++
++; Test v4f32 extraction with the highest in-range offset.
++define void @f28(<4 x float> %val, float *%base) {
++; CHECK-LABEL: f28:
++; CHECK: vstef %v24, 4092(%r2), 2
++; CHECK: br %r14
++  %ptr = getelementptr float *%base, i32 1023
++  %element = extractelement <4 x float> %val, i32 2
++  store float %element, float *%ptr
++  ret void
++}
++
++; Test v4f32 extraction with the first out-of-range offset.
++define void @f29(<4 x float> %val, float *%base) {
++; CHECK-LABEL: f29:
++; CHECK: aghi %r2, 4096
++; CHECK: vstef %v24, 0(%r2), 1
++; CHECK: br %r14
++  %ptr = getelementptr float *%base, i32 1024
++  %element = extractelement <4 x float> %val, i32 1
++  store float %element, float *%ptr
++  ret void
++}
++
++; Test v4f32 extraction from a variable element.
++define void @f30(<4 x float> %val, float *%ptr, i32 %index) {
++; CHECK-LABEL: f30:
++; CHECK-NOT: vstef
++; CHECK: br %r14
++  %element = extractelement <4 x float> %val, i32 %index
++  store float %element, float *%ptr
++  ret void
++}
++
++; Test v2f64 extraction from the first element.
++define void @f32(<2 x double> %val, double *%ptr) {
++; CHECK-LABEL: f32:
++; CHECK: vsteg %v24, 0(%r2), 0
++; CHECK: br %r14
++  %element = extractelement <2 x double> %val, i32 0
++  store double %element, double *%ptr
++  ret void
++}
++
++; Test v2f64 extraction from the last element.
++define void @f33(<2 x double> %val, double *%ptr) {
++; CHECK-LABEL: f33:
++; CHECK: vsteg %v24, 0(%r2), 1
++; CHECK: br %r14
++  %element = extractelement <2 x double> %val, i32 1
++  store double %element, double *%ptr
++  ret void
++}
++
++; Test v2f64 extraction with the highest in-range offset.
++define void @f34(<2 x double> %val, double *%base) {
++; CHECK-LABEL: f34:
++; CHECK: vsteg %v24, 4088(%r2), 1
++; CHECK: br %r14
++  %ptr = getelementptr double *%base, i32 511
++  %element = extractelement <2 x double> %val, i32 1
++  store double %element, double *%ptr
++  ret void
++}
++
++; Test v2f64 extraction with the first out-of-range offset.
++define void @f35(<2 x double> %val, double *%base) {
++; CHECK-LABEL: f35:
++; CHECK: aghi %r2, 4096
++; CHECK: vsteg %v24, 0(%r2), 0
++; CHECK: br %r14
++  %ptr = getelementptr double *%base, i32 512
++  %element = extractelement <2 x double> %val, i32 0
++  store double %element, double *%ptr
++  ret void
++}
++
++; Test v2f64 extraction from a variable element.
++define void @f36(<2 x double> %val, double *%ptr, i32 %index) {
++; CHECK-LABEL: f36:
++; CHECK-NOT: vsteg
++; CHECK: br %r14
++  %element = extractelement <2 x double> %val, i32 %index
++  store double %element, double *%ptr
++  ret void
++}
++
++; Test a v4i32 scatter of the first element.
++define void @f37(<4 x i32> %val, <4 x i32> %index, i64 %base) {
++; CHECK-LABEL: f37:
++; CHECK: vscef %v24, 0(%v26,%r2), 0
++; CHECK: br %r14
++  %elem = extractelement <4 x i32> %index, i32 0
++  %ext = zext i32 %elem to i64
++  %add = add i64 %base, %ext
++  %ptr = inttoptr i64 %add to i32 *
++  %element = extractelement <4 x i32> %val, i32 0
++  store i32 %element, i32 *%ptr
++  ret void
++}
++
++; Test a v4i32 scatter of the last element.
++define void @f38(<4 x i32> %val, <4 x i32> %index, i64 %base) { ++; CHECK-LABEL: f38: ++; CHECK: vscef %v24, 0(%v26,%r2), 3 ++; CHECK: br %r14 ++ %elem = extractelement <4 x i32> %index, i32 3 ++ %ext = zext i32 %elem to i64 ++ %add = add i64 %base, %ext ++ %ptr = inttoptr i64 %add to i32 * ++ %element = extractelement <4 x i32> %val, i32 3 ++ store i32 %element, i32 *%ptr ++ ret void ++} ++ ++; Test a v4i32 scatter with the highest in-range offset. ++define void @f39(<4 x i32> %val, <4 x i32> %index, i64 %base) { ++; CHECK-LABEL: f39: ++; CHECK: vscef %v24, 4095(%v26,%r2), 1 ++; CHECK: br %r14 ++ %elem = extractelement <4 x i32> %index, i32 1 ++ %ext = zext i32 %elem to i64 ++ %add1 = add i64 %base, %ext ++ %add2 = add i64 %add1, 4095 ++ %ptr = inttoptr i64 %add2 to i32 * ++ %element = extractelement <4 x i32> %val, i32 1 ++ store i32 %element, i32 *%ptr ++ ret void ++} ++ ++; Test a v2i64 scatter of the first element. ++define void @f40(<2 x i64> %val, <2 x i64> %index, i64 %base) { ++; CHECK-LABEL: f40: ++; CHECK: vsceg %v24, 0(%v26,%r2), 0 ++; CHECK: br %r14 ++ %elem = extractelement <2 x i64> %index, i32 0 ++ %add = add i64 %base, %elem ++ %ptr = inttoptr i64 %add to i64 * ++ %element = extractelement <2 x i64> %val, i32 0 ++ store i64 %element, i64 *%ptr ++ ret void ++} ++ ++; Test a v2i64 scatter of the last element. ++define void @f41(<2 x i64> %val, <2 x i64> %index, i64 %base) { ++; CHECK-LABEL: f41: ++; CHECK: vsceg %v24, 0(%v26,%r2), 1 ++; CHECK: br %r14 ++ %elem = extractelement <2 x i64> %index, i32 1 ++ %add = add i64 %base, %elem ++ %ptr = inttoptr i64 %add to i64 * ++ %element = extractelement <2 x i64> %val, i32 1 ++ store i64 %element, i64 *%ptr ++ ret void ++} ++ ++; Test a v4f32 scatter of the first element. ++define void @f42(<4 x float> %val, <4 x i32> %index, i64 %base) { ++; CHECK-LABEL: f42: ++; CHECK: vscef %v24, 0(%v26,%r2), 0 ++; CHECK: br %r14 ++ %elem = extractelement <4 x i32> %index, i32 0 ++ %ext = zext i32 %elem to i64 ++ %add = add i64 %base, %ext ++ %ptr = inttoptr i64 %add to float * ++ %element = extractelement <4 x float> %val, i32 0 ++ store float %element, float *%ptr ++ ret void ++} ++ ++; Test a v4f32 scatter of the last element. ++define void @f43(<4 x float> %val, <4 x i32> %index, i64 %base) { ++; CHECK-LABEL: f43: ++; CHECK: vscef %v24, 0(%v26,%r2), 3 ++; CHECK: br %r14 ++ %elem = extractelement <4 x i32> %index, i32 3 ++ %ext = zext i32 %elem to i64 ++ %add = add i64 %base, %ext ++ %ptr = inttoptr i64 %add to float * ++ %element = extractelement <4 x float> %val, i32 3 ++ store float %element, float *%ptr ++ ret void ++} ++ ++; Test a v2f64 scatter of the first element. ++define void @f44(<2 x double> %val, <2 x i64> %index, i64 %base) { ++; CHECK-LABEL: f44: ++; CHECK: vsceg %v24, 0(%v26,%r2), 0 ++; CHECK: br %r14 ++ %elem = extractelement <2 x i64> %index, i32 0 ++ %add = add i64 %base, %elem ++ %ptr = inttoptr i64 %add to double * ++ %element = extractelement <2 x double> %val, i32 0 ++ store double %element, double *%ptr ++ ret void ++} ++ ++; Test a v2f64 scatter of the last element. 
++define void @f45(<2 x double> %val, <2 x i64> %index, i64 %base) { ++; CHECK-LABEL: f45: ++; CHECK: vsceg %v24, 0(%v26,%r2), 1 ++; CHECK: br %r14 ++ %elem = extractelement <2 x i64> %index, i32 1 ++ %add = add i64 %base, %elem ++ %ptr = inttoptr i64 %add to double * ++ %element = extractelement <2 x double> %val, i32 1 ++ store double %element, double *%ptr ++ ret void ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-11.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-11.ll +@@ -0,0 +1,111 @@ ++; Test insertions of register values into a nonzero index of an undef. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test v16i8 insertion into an undef, with an arbitrary index. ++define <16 x i8> @f1(i8 %val) { ++; CHECK-LABEL: f1: ++; CHECK: vlvgb %v24, %r2, 12 ++; CHECK-NEXT: br %r14 ++ %ret = insertelement <16 x i8> undef, i8 %val, i32 12 ++ ret <16 x i8> %ret ++} ++ ++; Test v16i8 insertion into an undef, with the first good index for VLVGP. ++define <16 x i8> @f2(i8 %val) { ++; CHECK-LABEL: f2: ++; CHECK: vlvgp %v24, %r2, %r2 ++; CHECK-NEXT: br %r14 ++ %ret = insertelement <16 x i8> undef, i8 %val, i32 7 ++ ret <16 x i8> %ret ++} ++ ++; Test v16i8 insertion into an undef, with the second good index for VLVGP. ++define <16 x i8> @f3(i8 %val) { ++; CHECK-LABEL: f3: ++; CHECK: vlvgp %v24, %r2, %r2 ++; CHECK-NEXT: br %r14 ++ %ret = insertelement <16 x i8> undef, i8 %val, i32 15 ++ ret <16 x i8> %ret ++} ++ ++; Test v8i16 insertion into an undef, with an arbitrary index. ++define <8 x i16> @f4(i16 %val) { ++; CHECK-LABEL: f4: ++; CHECK: vlvgh %v24, %r2, 5 ++; CHECK-NEXT: br %r14 ++ %ret = insertelement <8 x i16> undef, i16 %val, i32 5 ++ ret <8 x i16> %ret ++} ++ ++; Test v8i16 insertion into an undef, with the first good index for VLVGP. ++define <8 x i16> @f5(i16 %val) { ++; CHECK-LABEL: f5: ++; CHECK: vlvgp %v24, %r2, %r2 ++; CHECK-NEXT: br %r14 ++ %ret = insertelement <8 x i16> undef, i16 %val, i32 3 ++ ret <8 x i16> %ret ++} ++ ++; Test v8i16 insertion into an undef, with the second good index for VLVGP. ++define <8 x i16> @f6(i16 %val) { ++; CHECK-LABEL: f6: ++; CHECK: vlvgp %v24, %r2, %r2 ++; CHECK-NEXT: br %r14 ++ %ret = insertelement <8 x i16> undef, i16 %val, i32 7 ++ ret <8 x i16> %ret ++} ++ ++; Test v4i32 insertion into an undef, with an arbitrary index. ++define <4 x i32> @f7(i32 %val) { ++; CHECK-LABEL: f7: ++; CHECK: vlvgf %v24, %r2, 2 ++; CHECK-NEXT: br %r14 ++ %ret = insertelement <4 x i32> undef, i32 %val, i32 2 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion into an undef, with the first good index for VLVGP. ++define <4 x i32> @f8(i32 %val) { ++; CHECK-LABEL: f8: ++; CHECK: vlvgp %v24, %r2, %r2 ++; CHECK-NEXT: br %r14 ++ %ret = insertelement <4 x i32> undef, i32 %val, i32 1 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion into an undef, with the second good index for VLVGP. ++define <4 x i32> @f9(i32 %val) { ++; CHECK-LABEL: f9: ++; CHECK: vlvgp %v24, %r2, %r2 ++; CHECK-NEXT: br %r14 ++ %ret = insertelement <4 x i32> undef, i32 %val, i32 3 ++ ret <4 x i32> %ret ++} ++ ++; Test v2i64 insertion into an undef. ++define <2 x i64> @f10(i64 %val) { ++; CHECK-LABEL: f10: ++; CHECK: vlvgp %v24, %r2, %r2 ++; CHECK-NEXT: br %r14 ++ %ret = insertelement <2 x i64> undef, i64 %val, i32 1 ++ ret <2 x i64> %ret ++} ++ ++; Test v4f32 insertion into an undef. 
++define <4 x float> @f11(float %val) { ++; CHECK-LABEL: f11: ++; CHECK: vrepf %v24, %v0, 0 ++; CHECK: br %r14 ++ %ret = insertelement <4 x float> undef, float %val, i32 2 ++ ret <4 x float> %ret ++} ++ ++; Test v2f64 insertion into an undef. ++define <2 x double> @f12(double %val) { ++; CHECK-LABEL: f12: ++; CHECK: vrepg %v24, %v0, 0 ++; CHECK: br %r14 ++ %ret = insertelement <2 x double> undef, double %val, i32 1 ++ ret <2 x double> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-12.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-12.ll +@@ -0,0 +1,123 @@ ++; Test insertions of memory values into a nonzero index of an undef. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test v16i8 insertion into an undef, with an arbitrary index. ++define <16 x i8> @f1(i8 *%ptr) { ++; CHECK-LABEL: f1: ++; CHECK: vlrepb %v24, 0(%r2) ++; CHECK-NEXT: br %r14 ++ %val = load i8 *%ptr ++ %ret = insertelement <16 x i8> undef, i8 %val, i32 12 ++ ret <16 x i8> %ret ++} ++ ++; Test v16i8 insertion into an undef, with the first good index for VLVGP. ++define <16 x i8> @f2(i8 *%ptr) { ++; CHECK-LABEL: f2: ++; CHECK: {{vlrepb|vllezb}} %v24, 0(%r2) ++; CHECK-NEXT: br %r14 ++ %val = load i8 *%ptr ++ %ret = insertelement <16 x i8> undef, i8 %val, i32 7 ++ ret <16 x i8> %ret ++} ++ ++; Test v16i8 insertion into an undef, with the second good index for VLVGP. ++define <16 x i8> @f3(i8 *%ptr) { ++; CHECK-LABEL: f3: ++; CHECK: vlrepb %v24, 0(%r2) ++; CHECK-NEXT: br %r14 ++ %val = load i8 *%ptr ++ %ret = insertelement <16 x i8> undef, i8 %val, i32 15 ++ ret <16 x i8> %ret ++} ++ ++; Test v8i16 insertion into an undef, with an arbitrary index. ++define <8 x i16> @f4(i16 *%ptr) { ++; CHECK-LABEL: f4: ++; CHECK: vlreph %v24, 0(%r2) ++; CHECK-NEXT: br %r14 ++ %val = load i16 *%ptr ++ %ret = insertelement <8 x i16> undef, i16 %val, i32 5 ++ ret <8 x i16> %ret ++} ++ ++; Test v8i16 insertion into an undef, with the first good index for VLVGP. ++define <8 x i16> @f5(i16 *%ptr) { ++; CHECK-LABEL: f5: ++; CHECK: {{vlreph|vllezh}} %v24, 0(%r2) ++; CHECK-NEXT: br %r14 ++ %val = load i16 *%ptr ++ %ret = insertelement <8 x i16> undef, i16 %val, i32 3 ++ ret <8 x i16> %ret ++} ++ ++; Test v8i16 insertion into an undef, with the second good index for VLVGP. ++define <8 x i16> @f6(i16 *%ptr) { ++; CHECK-LABEL: f6: ++; CHECK: vlreph %v24, 0(%r2) ++; CHECK-NEXT: br %r14 ++ %val = load i16 *%ptr ++ %ret = insertelement <8 x i16> undef, i16 %val, i32 7 ++ ret <8 x i16> %ret ++} ++ ++; Test v4i32 insertion into an undef, with an arbitrary index. ++define <4 x i32> @f7(i32 *%ptr) { ++; CHECK-LABEL: f7: ++; CHECK: vlrepf %v24, 0(%r2) ++; CHECK-NEXT: br %r14 ++ %val = load i32 *%ptr ++ %ret = insertelement <4 x i32> undef, i32 %val, i32 2 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion into an undef, with the first good index for VLVGP. ++define <4 x i32> @f8(i32 *%ptr) { ++; CHECK-LABEL: f8: ++; CHECK: {{vlrepf|vllezf}} %v24, 0(%r2) ++; CHECK-NEXT: br %r14 ++ %val = load i32 *%ptr ++ %ret = insertelement <4 x i32> undef, i32 %val, i32 1 ++ ret <4 x i32> %ret ++} ++ ++; Test v4i32 insertion into an undef, with the second good index for VLVGP. ++define <4 x i32> @f9(i32 *%ptr) { ++; CHECK-LABEL: f9: ++; CHECK: vlrepf %v24, 0(%r2) ++; CHECK-NEXT: br %r14 ++ %val = load i32 *%ptr ++ %ret = insertelement <4 x i32> undef, i32 %val, i32 3 ++ ret <4 x i32> %ret ++} ++ ++; Test v2i64 insertion into an undef. 
++define <2 x i64> @f10(i64 *%ptr) { ++; CHECK-LABEL: f10: ++; CHECK: vlrepg %v24, 0(%r2) ++; CHECK-NEXT: br %r14 ++ %val = load i64 *%ptr ++ %ret = insertelement <2 x i64> undef, i64 %val, i32 1 ++ ret <2 x i64> %ret ++} ++ ++; Test v4f32 insertion into an undef. ++define <4 x float> @f11(float *%ptr) { ++; CHECK-LABEL: f11: ++; CHECK: vlrepf %v24, 0(%r2) ++; CHECK: br %r14 ++ %val = load float *%ptr ++ %ret = insertelement <4 x float> undef, float %val, i32 2 ++ ret <4 x float> %ret ++} ++ ++; Test v2f64 insertion into an undef. ++define <2 x double> @f12(double *%ptr) { ++; CHECK-LABEL: f12: ++; CHECK: vlrepg %v24, 0(%r2) ++; CHECK: br %r14 ++ %val = load double *%ptr ++ %ret = insertelement <2 x double> undef, double %val, i32 1 ++ ret <2 x double> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-13.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-13.ll +@@ -0,0 +1,69 @@ ++; Test insertions of register values into 0. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test v16i8 insertion into 0. ++define <16 x i8> @f1(i8 %val1, i8 %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vgbm %v24, 0 ++; CHECK-DAG: vlvgb %v24, %r2, 2 ++; CHECK-DAG: vlvgb %v24, %r3, 12 ++; CHECK: br %r14 ++ %vec1 = insertelement <16 x i8> zeroinitializer, i8 %val1, i32 2 ++ %vec2 = insertelement <16 x i8> %vec1, i8 %val2, i32 12 ++ ret <16 x i8> %vec2 ++} ++ ++; Test v8i16 insertion into 0. ++define <8 x i16> @f2(i16 %val1, i16 %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vgbm %v24, 0 ++; CHECK-DAG: vlvgh %v24, %r2, 3 ++; CHECK-DAG: vlvgh %v24, %r3, 5 ++; CHECK: br %r14 ++ %vec1 = insertelement <8 x i16> zeroinitializer, i16 %val1, i32 3 ++ %vec2 = insertelement <8 x i16> %vec1, i16 %val2, i32 5 ++ ret <8 x i16> %vec2 ++} ++ ++; Test v4i32 insertion into 0. ++define <4 x i32> @f3(i32 %val) { ++; CHECK-LABEL: f3: ++; CHECK: vgbm %v24, 0 ++; CHECK: vlvgf %v24, %r2, 3 ++; CHECK: br %r14 ++ %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 3 ++ ret <4 x i32> %ret ++} ++ ++; Test v2i64 insertion into 0. ++define <2 x i64> @f4(i64 %val) { ++; CHECK-LABEL: f4: ++; CHECK: lghi [[REG:%r[0-5]]], 0 ++; CHECK: vlvgp %v24, [[REG]], %r2 ++; CHECK: br %r14 ++ %ret = insertelement <2 x i64> zeroinitializer, i64 %val, i32 1 ++ ret <2 x i64> %ret ++} ++ ++; Test v4f32 insertion into 0. ++define <4 x float> @f5(float %val) { ++; CHECK-LABEL: f5: ++; CHECK-DAG: vuplhf [[REG:%v[0-9]+]], %v0 ++; CHECK-DAG: vgbm [[ZERO:%v[0-9]+]], 0 ++; CHECK: vmrhg %v24, [[ZERO]], [[REG]] ++; CHECK: br %r14 ++ %ret = insertelement <4 x float> zeroinitializer, float %val, i32 3 ++ ret <4 x float> %ret ++} ++ ++; Test v2f64 insertion into 0. ++define <2 x double> @f6(double %val) { ++; CHECK-LABEL: f6: ++; CHECK: vgbm [[REG:%v[0-9]+]], 0 ++; CHECK: vmrhg %v24, [[REG]], %v0 ++; CHECK: br %r14 ++ %ret = insertelement <2 x double> zeroinitializer, double %val, i32 1 ++ ret <2 x double> %ret ++} ++ +Index: llvm-36/test/CodeGen/SystemZ/vec-move-14.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-14.ll +@@ -0,0 +1,96 @@ ++; Test insertions of memory values into 0. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test VLLEZB. 
++define <16 x i8> @f1(i8 *%ptr) { ++; CHECK-LABEL: f1: ++; CHECK: vllezb %v24, 0(%r2) ++; CHECK: br %r14 ++ %val = load i8 *%ptr ++ %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 ++ ret <16 x i8> %ret ++} ++ ++; Test VLLEZB with the highest in-range offset. ++define <16 x i8> @f2(i8 *%base) { ++; CHECK-LABEL: f2: ++; CHECK: vllezb %v24, 4095(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr i8 *%base, i64 4095 ++ %val = load i8 *%ptr ++ %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 ++ ret <16 x i8> %ret ++} ++ ++; Test VLLEZB with the next highest offset. ++define <16 x i8> @f3(i8 *%base) { ++; CHECK-LABEL: f3: ++; CHECK-NOT: vllezb %v24, 4096(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr i8 *%base, i64 4096 ++ %val = load i8 *%ptr ++ %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 ++ ret <16 x i8> %ret ++} ++ ++; Test that VLLEZB allows an index. ++define <16 x i8> @f4(i8 *%base, i64 %index) { ++; CHECK-LABEL: f4: ++; CHECK: vllezb %v24, 0({{%r2,%r3|%r3,%r2}}) ++; CHECK: br %r14 ++ %ptr = getelementptr i8 *%base, i64 %index ++ %val = load i8 *%ptr ++ %ret = insertelement <16 x i8> zeroinitializer, i8 %val, i32 7 ++ ret <16 x i8> %ret ++} ++ ++; Test VLLEZH. ++define <8 x i16> @f5(i16 *%ptr) { ++; CHECK-LABEL: f5: ++; CHECK: vllezh %v24, 0(%r2) ++; CHECK: br %r14 ++ %val = load i16 *%ptr ++ %ret = insertelement <8 x i16> zeroinitializer, i16 %val, i32 3 ++ ret <8 x i16> %ret ++} ++ ++; Test VLLEZF. ++define <4 x i32> @f6(i32 *%ptr) { ++; CHECK-LABEL: f6: ++; CHECK: vllezf %v24, 0(%r2) ++; CHECK: br %r14 ++ %val = load i32 *%ptr ++ %ret = insertelement <4 x i32> zeroinitializer, i32 %val, i32 1 ++ ret <4 x i32> %ret ++} ++ ++; Test VLLEZG. ++define <2 x i64> @f7(i64 *%ptr) { ++; CHECK-LABEL: f7: ++; CHECK: vllezg %v24, 0(%r2) ++; CHECK: br %r14 ++ %val = load i64 *%ptr ++ %ret = insertelement <2 x i64> zeroinitializer, i64 %val, i32 0 ++ ret <2 x i64> %ret ++} ++ ++; Test VLLEZF with a float. ++define <4 x float> @f8(float *%ptr) { ++; CHECK-LABEL: f8: ++; CHECK: vllezf %v24, 0(%r2) ++; CHECK: br %r14 ++ %val = load float *%ptr ++ %ret = insertelement <4 x float> zeroinitializer, float %val, i32 1 ++ ret <4 x float> %ret ++} ++ ++; Test VLLEZG with a double. ++define <2 x double> @f9(double *%ptr) { ++; CHECK-LABEL: f9: ++; CHECK: vllezg %v24, 0(%r2) ++; CHECK: br %r14 ++ %val = load double *%ptr ++ %ret = insertelement <2 x double> zeroinitializer, double %val, i32 0 ++ ret <2 x double> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-15.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-15.ll +@@ -0,0 +1,105 @@ ++; Test vector sign-extending loads. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i1->v16i8 extension. ++define <16 x i8> @f1(<16 x i1> *%ptr) { ++; No expected output, but must compile. ++ %val = load <16 x i1> *%ptr ++ %ret = sext <16 x i1> %val to <16 x i8> ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i1->v8i16 extension. ++define <8 x i16> @f2(<8 x i1> *%ptr) { ++; No expected output, but must compile. ++ %val = load <8 x i1> *%ptr ++ %ret = sext <8 x i1> %val to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i8->v8i16 extension. 
++define <8 x i16> @f3(<8 x i8> *%ptr) { ++; CHECK-LABEL: f3: ++; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuphb %v24, [[REG1]] ++; CHECK: br %r14 ++ %val = load <8 x i8> *%ptr ++ %ret = sext <8 x i8> %val to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i1->v4i32 extension. ++define <4 x i32> @f4(<4 x i1> *%ptr) { ++; No expected output, but must compile. ++ %val = load <4 x i1> *%ptr ++ %ret = sext <4 x i1> %val to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i8->v4i32 extension. ++define <4 x i32> @f5(<4 x i8> *%ptr) { ++; CHECK-LABEL: f5: ++; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuphb [[REG2:%v[0-9]+]], [[REG1]] ++; CHECK: vuphh %v24, [[REG2]] ++; CHECK: br %r14 ++ %val = load <4 x i8> *%ptr ++ %ret = sext <4 x i8> %val to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i16->v4i32 extension. ++define <4 x i32> @f6(<4 x i16> *%ptr) { ++; CHECK-LABEL: f6: ++; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuphh %v24, [[REG1]] ++; CHECK: br %r14 ++ %val = load <4 x i16> *%ptr ++ %ret = sext <4 x i16> %val to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i1->v2i64 extension. ++define <2 x i64> @f7(<2 x i1> *%ptr) { ++; No expected output, but must compile. ++ %val = load <2 x i1> *%ptr ++ %ret = sext <2 x i1> %val to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i8->v2i64 extension. ++define <2 x i64> @f8(<2 x i8> *%ptr) { ++; CHECK-LABEL: f8: ++; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuphb [[REG2:%v[0-9]+]], [[REG1]] ++; CHECK: vuphh [[REG3:%v[0-9]+]], [[REG2]] ++; CHECK: vuphf %v24, [[REG3]] ++; CHECK: br %r14 ++ %val = load <2 x i8> *%ptr ++ %ret = sext <2 x i8> %val to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i16->v2i64 extension. ++define <2 x i64> @f9(<2 x i16> *%ptr) { ++; CHECK-LABEL: f9: ++; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuphh [[REG2:%v[0-9]+]], [[REG1]] ++; CHECK: vuphf %v24, [[REG2]] ++; CHECK: br %r14 ++ %val = load <2 x i16> *%ptr ++ %ret = sext <2 x i16> %val to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i32->v2i64 extension. ++define <2 x i64> @f10(<2 x i32> *%ptr) { ++; CHECK-LABEL: f10: ++; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuphf %v24, [[REG1]] ++; CHECK: br %r14 ++ %val = load <2 x i32> *%ptr ++ %ret = sext <2 x i32> %val to <2 x i64> ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-16.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-16.ll +@@ -0,0 +1,105 @@ ++; Test vector zero-extending loads. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i1->v16i8 extension. ++define <16 x i8> @f1(<16 x i1> *%ptr) { ++; No expected output, but must compile. ++ %val = load <16 x i1> *%ptr ++ %ret = zext <16 x i1> %val to <16 x i8> ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i1->v8i16 extension. ++define <8 x i16> @f2(<8 x i1> *%ptr) { ++; No expected output, but must compile. ++ %val = load <8 x i1> *%ptr ++ %ret = zext <8 x i1> %val to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i8->v8i16 extension. ++define <8 x i16> @f3(<8 x i8> *%ptr) { ++; CHECK-LABEL: f3: ++; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuplhb %v24, [[REG1]] ++; CHECK: br %r14 ++ %val = load <8 x i8> *%ptr ++ %ret = zext <8 x i8> %val to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i1->v4i32 extension. ++define <4 x i32> @f4(<4 x i1> *%ptr) { ++; No expected output, but must compile. 
++ %val = load <4 x i1> *%ptr ++ %ret = zext <4 x i1> %val to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i8->v4i32 extension. ++define <4 x i32> @f5(<4 x i8> *%ptr) { ++; CHECK-LABEL: f5: ++; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]] ++; CHECK: vuplhh %v24, [[REG2]] ++; CHECK: br %r14 ++ %val = load <4 x i8> *%ptr ++ %ret = zext <4 x i8> %val to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i16->v4i32 extension. ++define <4 x i32> @f6(<4 x i16> *%ptr) { ++; CHECK-LABEL: f6: ++; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuplhh %v24, [[REG1]] ++; CHECK: br %r14 ++ %val = load <4 x i16> *%ptr ++ %ret = zext <4 x i16> %val to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i1->v2i64 extension. ++define <2 x i64> @f7(<2 x i1> *%ptr) { ++; No expected output, but must compile. ++ %val = load <2 x i1> *%ptr ++ %ret = zext <2 x i1> %val to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i8->v2i64 extension. ++define <2 x i64> @f8(<2 x i8> *%ptr) { ++; CHECK-LABEL: f8: ++; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]] ++; CHECK: vuplhh [[REG3:%v[0-9]+]], [[REG2]] ++; CHECK: vuplhf %v24, [[REG3]] ++; CHECK: br %r14 ++ %val = load <2 x i8> *%ptr ++ %ret = zext <2 x i8> %val to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i16->v2i64 extension. ++define <2 x i64> @f9(<2 x i16> *%ptr) { ++; CHECK-LABEL: f9: ++; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuplhh [[REG2:%v[0-9]+]], [[REG1]] ++; CHECK: vuplhf %v24, [[REG2]] ++; CHECK: br %r14 ++ %val = load <2 x i16> *%ptr ++ %ret = zext <2 x i16> %val to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i32->v2i64 extension. ++define <2 x i64> @f10(<2 x i32> *%ptr) { ++; CHECK-LABEL: f10: ++; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2) ++; CHECK: vuplhf %v24, [[REG1]] ++; CHECK: br %r14 ++ %val = load <2 x i32> *%ptr ++ %ret = zext <2 x i32> %val to <2 x i64> ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-move-17.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-move-17.ll +@@ -0,0 +1,104 @@ ++; Test vector truncating stores. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8->v16i1 truncation. ++define void @f1(<16 x i8> %val, <16 x i1> *%ptr) { ++; No expected output, but must compile. ++ %trunc = trunc <16 x i8> %val to <16 x i1> ++ store <16 x i1> %trunc, <16 x i1> *%ptr ++ ret void ++} ++ ++; Test a v8i16->v8i1 truncation. ++define void @f2(<8 x i16> %val, <8 x i1> *%ptr) { ++; No expected output, but must compile. ++ %trunc = trunc <8 x i16> %val to <8 x i1> ++ store <8 x i1> %trunc, <8 x i1> *%ptr ++ ret void ++} ++ ++; Test a v8i16->v8i8 truncation. ++define void @f3(<8 x i16> %val, <8 x i8> *%ptr) { ++; CHECK-LABEL: f3: ++; CHECK: vpkh [[REG1:%v[0-9]+]], %v24, %v24 ++; CHECK: vsteg [[REG1]], 0(%r2) ++; CHECK: br %r14 ++ %trunc = trunc <8 x i16> %val to <8 x i8> ++ store <8 x i8> %trunc, <8 x i8> *%ptr ++ ret void ++} ++ ++; Test a v4i32->v4i1 truncation. ++define void @f4(<4 x i32> %val, <4 x i1> *%ptr) { ++; No expected output, but must compile. ++ %trunc = trunc <4 x i32> %val to <4 x i1> ++ store <4 x i1> %trunc, <4 x i1> *%ptr ++ ret void ++} ++ ++; Test a v4i32->v4i8 truncation. At the moment we use a VPERM rather than ++; a chain of packs. 
++define void @f5(<4 x i32> %val, <4 x i8> *%ptr) { ++; CHECK-LABEL: f5: ++; CHECK: vperm [[REG:%v[0-9]+]], ++; CHECK: vstef [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %trunc = trunc <4 x i32> %val to <4 x i8> ++ store <4 x i8> %trunc, <4 x i8> *%ptr ++ ret void ++} ++ ++; Test a v4i32->v4i16 truncation. ++define void @f6(<4 x i32> %val, <4 x i16> *%ptr) { ++; CHECK-LABEL: f6: ++; CHECK: vpkf [[REG1:%v[0-9]+]], %v24, %v24 ++; CHECK: vsteg [[REG1]], 0(%r2) ++; CHECK: br %r14 ++ %trunc = trunc <4 x i32> %val to <4 x i16> ++ store <4 x i16> %trunc, <4 x i16> *%ptr ++ ret void ++} ++ ++; Test a v2i64->v2i1 truncation. ++define void @f7(<2 x i64> %val, <2 x i1> *%ptr) { ++; No expected output, but must compile. ++ %trunc = trunc <2 x i64> %val to <2 x i1> ++ store <2 x i1> %trunc, <2 x i1> *%ptr ++ ret void ++} ++ ++; Test a v2i64->v2i8 truncation. At the moment we use a VPERM rather than ++; a chain of packs. ++define void @f8(<2 x i64> %val, <2 x i8> *%ptr) { ++; CHECK-LABEL: f8: ++; CHECK: vperm [[REG:%v[0-9]+]], ++; CHECK: vsteh [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %trunc = trunc <2 x i64> %val to <2 x i8> ++ store <2 x i8> %trunc, <2 x i8> *%ptr ++ ret void ++} ++ ++; Test a v2i64->v2i16 truncation. At the moment we use a VPERM rather than ++; a chain of packs. ++define void @f9(<2 x i64> %val, <2 x i16> *%ptr) { ++; CHECK-LABEL: f9: ++; CHECK: vperm [[REG:%v[0-9]+]], ++; CHECK: vstef [[REG]], 0(%r2) ++; CHECK: br %r14 ++ %trunc = trunc <2 x i64> %val to <2 x i16> ++ store <2 x i16> %trunc, <2 x i16> *%ptr ++ ret void ++} ++ ++; Test a v2i64->v2i32 truncation. ++define void @f10(<2 x i64> %val, <2 x i32> *%ptr) { ++; CHECK-LABEL: f10: ++; CHECK: vpkg [[REG1:%v[0-9]+]], %v24, %v24 ++; CHECK: vsteg [[REG1]], 0(%r2) ++; CHECK: br %r14 ++ %trunc = trunc <2 x i64> %val to <2 x i32> ++ store <2 x i32> %trunc, <2 x i32> *%ptr ++ ret void ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-mul-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-mul-01.ll +@@ -0,0 +1,60 @@ ++; Test vector multiplication. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8 multiplication. ++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vmlb %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = mul <16 x i8> %val1, %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 multiplication. ++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vmlhw %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = mul <8 x i16> %val1, %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 multiplication. ++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vmlf %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = mul <4 x i32> %val1, %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 multiplication. There's no vector equivalent. ++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f4: ++; CHECK-NOT: vmlg ++; CHECK: br %r14 ++ %ret = mul <2 x i64> %val1, %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test a v2f64 multiplication. ++define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1, ++ <2 x double> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vfmdb %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = fmul <2 x double> %val1, %val2 ++ ret <2 x double> %ret ++} ++ ++; Test an f64 multiplication that uses vector registers. 
++define double @f6(<2 x double> %val1, <2 x double> %val2) {
++; CHECK-LABEL: f6:
++; CHECK: wfmdb %f0, %v24, %v26
++; CHECK: br %r14
++  %scalar1 = extractelement <2 x double> %val1, i32 0
++  %scalar2 = extractelement <2 x double> %val2, i32 0
++  %ret = fmul double %scalar1, %scalar2
++  ret double %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-mul-02.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-mul-02.ll
+@@ -0,0 +1,63 @@
++; Test vector multiply-and-add.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
++
++; Test a v16i8 multiply-and-add.
++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2,
++                     <16 x i8> %val3) {
++; CHECK-LABEL: f1:
++; CHECK: vmalb %v24, %v26, %v28, %v30
++; CHECK: br %r14
++  %mul = mul <16 x i8> %val1, %val2
++  %ret = add <16 x i8> %mul, %val3
++  ret <16 x i8> %ret
++}
++
++; Test a v8i16 multiply-and-add.
++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2,
++                     <8 x i16> %val3) {
++; CHECK-LABEL: f2:
++; CHECK: vmalhw %v24, %v26, %v28, %v30
++; CHECK: br %r14
++  %mul = mul <8 x i16> %val1, %val2
++  %ret = add <8 x i16> %mul, %val3
++  ret <8 x i16> %ret
++}
++
++; Test a v4i32 multiply-and-add.
++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2,
++                     <4 x i32> %val3) {
++; CHECK-LABEL: f3:
++; CHECK: vmalf %v24, %v26, %v28, %v30
++; CHECK: br %r14
++  %mul = mul <4 x i32> %val1, %val2
++  %ret = add <4 x i32> %mul, %val3
++  ret <4 x i32> %ret
++}
++
++; Test a v2f64 multiply-and-add.
++define <2 x double> @f4(<2 x double> %dummy, <2 x double> %val1,
++                        <2 x double> %val2, <2 x double> %val3) {
++; CHECK-LABEL: f4:
++; CHECK: vfmadb %v24, %v26, %v28, %v30
++; CHECK: br %r14
++  %ret = call <2 x double> @llvm.fma.v2f64 (<2 x double> %val1,
++                                            <2 x double> %val2,
++                                            <2 x double> %val3)
++  ret <2 x double> %ret
++}
++
++; Test a v2f64 multiply-and-subtract.
++define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val1,
++                        <2 x double> %val2, <2 x double> %val3) {
++; CHECK-LABEL: f5:
++; CHECK: vfmsdb %v24, %v26, %v28, %v30
++; CHECK: br %r14
++  %negval3 = fsub <2 x double> <double -0.0, double -0.0>, %val3
++  %ret = call <2 x double> @llvm.fma.v2f64 (<2 x double> %val1,
++                                            <2 x double> %val2,
++                                            <2 x double> %negval3)
++  ret <2 x double> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-neg-01.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-neg-01.ll
+@@ -0,0 +1,58 @@
++; Test vector negation.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test a v16i8 negation.
++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val) {
++; CHECK-LABEL: f1:
++; CHECK: vlcb %v24, %v26
++; CHECK: br %r14
++  %ret = sub <16 x i8> zeroinitializer, %val
++  ret <16 x i8> %ret
++}
++
++; Test a v8i16 negation.
++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val) {
++; CHECK-LABEL: f2:
++; CHECK: vlch %v24, %v26
++; CHECK: br %r14
++  %ret = sub <8 x i16> zeroinitializer, %val
++  ret <8 x i16> %ret
++}
++
++; Test a v4i32 negation.
++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val) {
++; CHECK-LABEL: f3:
++; CHECK: vlcf %v24, %v26
++; CHECK: br %r14
++  %ret = sub <4 x i32> zeroinitializer, %val
++  ret <4 x i32> %ret
++}
++
++; Test a v2i64 negation.
++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val) {
++; CHECK-LABEL: f4:
++; CHECK: vlcg %v24, %v26
++; CHECK: br %r14
++  %ret = sub <2 x i64> zeroinitializer, %val
++  ret <2 x i64> %ret
++}
++
++; Test a v2f64 negation.
++define <2 x double> @f5(<2 x double> %dummy, <2 x double> %val) {
++; CHECK-LABEL: f5:
++; CHECK: vflcdb %v24, %v26
++; CHECK: br %r14
++  %ret = fsub <2 x double> <double -0.0, double -0.0>, %val
++  ret <2 x double> %ret
++}
++
++; Test an f64 negation that uses vector registers.
++define double @f6(<2 x double> %val) {
++; CHECK-LABEL: f6:
++; CHECK: wflcdb %f0, %v24
++; CHECK: br %r14
++  %scalar = extractelement <2 x double> %val, i32 0
++  %ret = fsub double -0.0, %scalar
++  ret double %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-or-01.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-or-01.ll
+@@ -0,0 +1,39 @@
++; Test vector OR.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test a v16i8 OR.
++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f1:
++; CHECK: vo %v24, %v26, %v28
++; CHECK: br %r14
++  %ret = or <16 x i8> %val1, %val2
++  ret <16 x i8> %ret
++}
++
++; Test a v8i16 OR.
++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) {
++; CHECK-LABEL: f2:
++; CHECK: vo %v24, %v26, %v28
++; CHECK: br %r14
++  %ret = or <8 x i16> %val1, %val2
++  ret <8 x i16> %ret
++}
++
++; Test a v4i32 OR.
++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) {
++; CHECK-LABEL: f3:
++; CHECK: vo %v24, %v26, %v28
++; CHECK: br %r14
++  %ret = or <4 x i32> %val1, %val2
++  ret <4 x i32> %ret
++}
++
++; Test a v2i64 OR.
++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) {
++; CHECK-LABEL: f4:
++; CHECK: vo %v24, %v26, %v28
++; CHECK: br %r14
++  %ret = or <2 x i64> %val1, %val2
++  ret <2 x i64> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-or-02.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-or-02.ll
+@@ -0,0 +1,107 @@
++; Test vector (or (and X, Z), (and Y, (not Z))) patterns.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test v16i8.
++define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3) {
++; CHECK-LABEL: f1:
++; CHECK: vsel %v24, %v24, %v26, %v28
++; CHECK: br %r14
++  %not = xor <16 x i8> %val3, <i8 -1, i8 -1, i8 -1, i8 -1,
++                               i8 -1, i8 -1, i8 -1, i8 -1,
++                               i8 -1, i8 -1, i8 -1, i8 -1,
++                               i8 -1, i8 -1, i8 -1, i8 -1>
++  %and1 = and <16 x i8> %val1, %val3
++  %and2 = and <16 x i8> %val2, %not
++  %ret = or <16 x i8> %and1, %and2
++  ret <16 x i8> %ret
++}
++
++; ...and again with the XOR applied to the other operand of the AND.
++define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3) {
++; CHECK-LABEL: f2:
++; CHECK: vsel %v24, %v26, %v24, %v28
++; CHECK: br %r14
++  %not = xor <16 x i8> %val3, <i8 -1, i8 -1, i8 -1, i8 -1,
++                               i8 -1, i8 -1, i8 -1, i8 -1,
++                               i8 -1, i8 -1, i8 -1, i8 -1,
++                               i8 -1, i8 -1, i8 -1, i8 -1>
++  %and1 = and <16 x i8> %val1, %not
++  %and2 = and <16 x i8> %val2, %val3
++  %ret = or <16 x i8> %and1, %and2
++  ret <16 x i8> %ret
++}
++
++; Test v8i16.
++define <8 x i16> @f3(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3) {
++; CHECK-LABEL: f3:
++; CHECK: vsel %v24, %v24, %v26, %v28
++; CHECK: br %r14
++  %not = xor <8 x i16> %val3, <i16 -1, i16 -1, i16 -1, i16 -1,
++                               i16 -1, i16 -1, i16 -1, i16 -1>
++  %and1 = and <8 x i16> %val1, %val3
++  %and2 = and <8 x i16> %val2, %not
++  %ret = or <8 x i16> %and1, %and2
++  ret <8 x i16> %ret
++}
++
++; ...and again with the XOR applied to the other operand of the AND.
++define <8 x i16> @f4(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3) {
++; CHECK-LABEL: f4:
++; CHECK: vsel %v24, %v26, %v24, %v28
++; CHECK: br %r14
++  %not = xor <8 x i16> %val3, <i16 -1, i16 -1, i16 -1, i16 -1,
++                               i16 -1, i16 -1, i16 -1, i16 -1>
++  %and1 = and <8 x i16> %val1, %not
++  %and2 = and <8 x i16> %val2, %val3
++  %ret = or <8 x i16> %and1, %and2
++  ret <8 x i16> %ret
++}
++
++; Test v4i32.
++define <4 x i32> @f5(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3) {
++; CHECK-LABEL: f5:
++; CHECK: vsel %v24, %v24, %v26, %v28
++; CHECK: br %r14
++  %not = xor <4 x i32> %val3, <i32 -1, i32 -1, i32 -1, i32 -1>
++  %and1 = and <4 x i32> %val1, %val3
++  %and2 = and <4 x i32> %val2, %not
++  %ret = or <4 x i32> %and1, %and2
++  ret <4 x i32> %ret
++}
++
++; ...and again with the XOR applied to the other operand of the AND.
++define <4 x i32> @f6(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3) {
++; CHECK-LABEL: f6:
++; CHECK: vsel %v24, %v26, %v24, %v28
++; CHECK: br %r14
++  %not = xor <4 x i32> %val3, <i32 -1, i32 -1, i32 -1, i32 -1>
++  %and1 = and <4 x i32> %val1, %not
++  %and2 = and <4 x i32> %val2, %val3
++  %ret = or <4 x i32> %and1, %and2
++  ret <4 x i32> %ret
++}
++
++; Test v2i64.
++define <2 x i64> @f7(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3) {
++; CHECK-LABEL: f7:
++; CHECK: vsel %v24, %v24, %v26, %v28
++; CHECK: br %r14
++  %not = xor <2 x i64> %val3, <i64 -1, i64 -1>
++  %and1 = and <2 x i64> %val1, %val3
++  %and2 = and <2 x i64> %val2, %not
++  %ret = or <2 x i64> %and1, %and2
++  ret <2 x i64> %ret
++}
++
++; ...and again with the XOR applied to the other operand of the AND.
++define <2 x i64> @f8(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3) {
++; CHECK-LABEL: f8:
++; CHECK: vsel %v24, %v26, %v24, %v28
++; CHECK: br %r14
++  %not = xor <2 x i64> %val3, <i64 -1, i64 -1>
++  %and1 = and <2 x i64> %val1, %not
++  %and2 = and <2 x i64> %val2, %val3
++  %ret = or <2 x i64> %and1, %and2
++  ret <2 x i64> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-perm-01.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-perm-01.ll
+@@ -0,0 +1,175 @@
++; Test vector splat.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test v16i8 splat of the first element.
++define <16 x i8> @f1(<16 x i8> %val) {
++; CHECK-LABEL: f1:
++; CHECK: vrepb %v24, %v24, 0
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
++                       <16 x i32> zeroinitializer
++  ret <16 x i8> %ret
++}
++
++; Test v16i8 splat of the last element.
++define <16 x i8> @f2(<16 x i8> %val) {
++; CHECK-LABEL: f2:
++; CHECK: vrepb %v24, %v24, 15
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
++                       <16 x i32> <i32 15, i32 15, i32 15, i32 15,
++                                   i32 15, i32 15, i32 15, i32 15,
++                                   i32 15, i32 15, i32 15, i32 15,
++                                   i32 15, i32 15, i32 15, i32 15>
++  ret <16 x i8> %ret
++}
++
++; Test v16i8 splat of an arbitrary element, using the second operand of
++; the shufflevector.
++define <16 x i8> @f3(<16 x i8> %val) {
++; CHECK-LABEL: f3:
++; CHECK: vrepb %v24, %v24, 4
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> undef, <16 x i8> %val,
++                       <16 x i32> <i32 20, i32 20, i32 20, i32 20,
++                                   i32 20, i32 20, i32 20, i32 20,
++                                   i32 20, i32 20, i32 20, i32 20,
++                                   i32 20, i32 20, i32 20, i32 20>
++  ret <16 x i8> %ret
++}
++
++; Test v8i16 splat of the first element.
++define <8 x i16> @f4(<8 x i16> %val) {
++; CHECK-LABEL: f4:
++; CHECK: vreph %v24, %v24, 0
++; CHECK: br %r14
++  %ret = shufflevector <8 x i16> %val, <8 x i16> undef,
++                       <8 x i32> zeroinitializer
++  ret <8 x i16> %ret
++}
++
++; Test v8i16 splat of the last element.
++define <8 x i16> @f5(<8 x i16> %val) {
++; CHECK-LABEL: f5:
++; CHECK: vreph %v24, %v24, 7
++; CHECK: br %r14
++  %ret = shufflevector <8 x i16> %val, <8 x i16> undef,
++                       <8 x i32> <i32 7, i32 7, i32 7, i32 7,
++                                  i32 7, i32 7, i32 7, i32 7>
++  ret <8 x i16> %ret
++}
++
++; Test v8i16 splat of an arbitrary element, using the second operand of
++; the shufflevector.
++define <8 x i16> @f6(<8 x i16> %val) {
++; CHECK-LABEL: f6:
++; CHECK: vreph %v24, %v24, 2
++; CHECK: br %r14
++  %ret = shufflevector <8 x i16> undef, <8 x i16> %val,
++                       <8 x i32> <i32 10, i32 10, i32 10, i32 10,
++                                  i32 10, i32 10, i32 10, i32 10>
++  ret <8 x i16> %ret
++}
++
++; Test v4i32 splat of the first element.
++define <4 x i32> @f7(<4 x i32> %val) {
++; CHECK-LABEL: f7:
++; CHECK: vrepf %v24, %v24, 0
++; CHECK: br %r14
++  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
++                       <4 x i32> zeroinitializer
++  ret <4 x i32> %ret
++}
++
++; Test v4i32 splat of the last element.
++define <4 x i32> @f8(<4 x i32> %val) {
++; CHECK-LABEL: f8:
++; CHECK: vrepf %v24, %v24, 3
++; CHECK: br %r14
++  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
++                       <4 x i32> <i32 3, i32 3, i32 3, i32 3>
++  ret <4 x i32> %ret
++}
++
++; Test v4i32 splat of an arbitrary element, using the second operand of
++; the shufflevector.
++define <4 x i32> @f9(<4 x i32> %val) {
++; CHECK-LABEL: f9:
++; CHECK: vrepf %v24, %v24, 1
++; CHECK: br %r14
++  %ret = shufflevector <4 x i32> undef, <4 x i32> %val,
++                       <4 x i32> <i32 5, i32 5, i32 5, i32 5>
++  ret <4 x i32> %ret
++}
++
++; Test v2i64 splat of the first element.
++define <2 x i64> @f10(<2 x i64> %val) {
++; CHECK-LABEL: f10:
++; CHECK: vrepg %v24, %v24, 0
++; CHECK: br %r14
++  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
++                       <2 x i32> zeroinitializer
++  ret <2 x i64> %ret
++}
++
++; Test v2i64 splat of the last element.
++define <2 x i64> @f11(<2 x i64> %val) {
++; CHECK-LABEL: f11:
++; CHECK: vrepg %v24, %v24, 1
++; CHECK: br %r14
++  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
++                       <2 x i32> <i32 1, i32 1>
++  ret <2 x i64> %ret
++}
++
++; Test v4f32 splat of the first element.
++define <4 x float> @f12(<4 x float> %val) {
++; CHECK-LABEL: f12:
++; CHECK: vrepf %v24, %v24, 0
++; CHECK: br %r14
++  %ret = shufflevector <4 x float> %val, <4 x float> undef,
++                       <4 x i32> zeroinitializer
++  ret <4 x float> %ret
++}
++
++; Test v4f32 splat of the last element.
++define <4 x float> @f13(<4 x float> %val) {
++; CHECK-LABEL: f13:
++; CHECK: vrepf %v24, %v24, 3
++; CHECK: br %r14
++  %ret = shufflevector <4 x float> %val, <4 x float> undef,
++                       <4 x i32> <i32 3, i32 3, i32 3, i32 3>
++  ret <4 x float> %ret
++}
++
++; Test v4f32 splat of an arbitrary element, using the second operand of
++; the shufflevector.
++define <4 x float> @f14(<4 x float> %val) {
++; CHECK-LABEL: f14:
++; CHECK: vrepf %v24, %v24, 1
++; CHECK: br %r14
++  %ret = shufflevector <4 x float> undef, <4 x float> %val,
++                       <4 x i32> <i32 5, i32 5, i32 5, i32 5>
++  ret <4 x float> %ret
++}
++
++; Test v2f64 splat of the first element.
++define <2 x double> @f15(<2 x double> %val) {
++; CHECK-LABEL: f15:
++; CHECK: vrepg %v24, %v24, 0
++; CHECK: br %r14
++  %ret = shufflevector <2 x double> %val, <2 x double> undef,
++                       <2 x i32> zeroinitializer
++  ret <2 x double> %ret
++}
++
++; Test v2f64 splat of the last element.
++define <2 x double> @f16(<2 x double> %val) {
++; CHECK-LABEL: f16:
++; CHECK: vrepg %v24, %v24, 1
++; CHECK: br %r14
++  %ret = shufflevector <2 x double> %val, <2 x double> undef,
++                       <2 x i32> <i32 1, i32 1>
++  ret <2 x double> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-perm-02.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-perm-02.ll
+@@ -0,0 +1,200 @@
++; Test replications of a scalar register value, represented as splats.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test v16i8 splat of the first element.
++define <16 x i8> @f1(i8 %scalar) {
++; CHECK-LABEL: f1:
++; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
++; CHECK: vrepb %v24, [[REG]], 7
++; CHECK: br %r14
++  %val = insertelement <16 x i8> undef, i8 %scalar, i32 0
++  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
++                       <16 x i32> zeroinitializer
++  ret <16 x i8> %ret
++}
++
++; Test v16i8 splat of the last element.
++define <16 x i8> @f2(i8 %scalar) {
++; CHECK-LABEL: f2:
++; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
++; CHECK: vrepb %v24, [[REG]], 7
++; CHECK: br %r14
++  %val = insertelement <16 x i8> undef, i8 %scalar, i32 15
++  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
++                       <16 x i32> <i32 15, i32 15, i32 15, i32 15,
++                                   i32 15, i32 15, i32 15, i32 15,
++                                   i32 15, i32 15, i32 15, i32 15,
++                                   i32 15, i32 15, i32 15, i32 15>
++  ret <16 x i8> %ret
++}
++
++; Test v16i8 splat of an arbitrary element, using the second operand of
++; the shufflevector.
++define <16 x i8> @f3(i8 %scalar) {
++; CHECK-LABEL: f3:
++; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
++; CHECK: vrepb %v24, [[REG]], 7
++; CHECK: br %r14
++  %val = insertelement <16 x i8> undef, i8 %scalar, i32 4
++  %ret = shufflevector <16 x i8> undef, <16 x i8> %val,
++                       <16 x i32> <i32 20, i32 20, i32 20, i32 20,
++                                   i32 20, i32 20, i32 20, i32 20,
++                                   i32 20, i32 20, i32 20, i32 20,
++                                   i32 20, i32 20, i32 20, i32 20>
++  ret <16 x i8> %ret
++}
++
++; Test v8i16 splat of the first element.
++define <8 x i16> @f4(i16 %scalar) {
++; CHECK-LABEL: f4:
++; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
++; CHECK: vreph %v24, [[REG]], 3
++; CHECK: br %r14
++  %val = insertelement <8 x i16> undef, i16 %scalar, i32 0
++  %ret = shufflevector <8 x i16> %val, <8 x i16> undef,
++                       <8 x i32> zeroinitializer
++  ret <8 x i16> %ret
++}
++
++; Test v8i16 splat of the last element.
++define <8 x i16> @f5(i16 %scalar) {
++; CHECK-LABEL: f5:
++; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
++; CHECK: vreph %v24, [[REG]], 3
++; CHECK: br %r14
++  %val = insertelement <8 x i16> undef, i16 %scalar, i32 7
++  %ret = shufflevector <8 x i16> %val, <8 x i16> undef,
++                       <8 x i32> <i32 7, i32 7, i32 7, i32 7,
++                                  i32 7, i32 7, i32 7, i32 7>
++  ret <8 x i16> %ret
++}
++
++; Test v8i16 splat of an arbitrary element, using the second operand of
++; the shufflevector.
++define <8 x i16> @f6(i16 %scalar) {
++; CHECK-LABEL: f6:
++; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
++; CHECK: vreph %v24, [[REG]], 3
++; CHECK: br %r14
++  %val = insertelement <8 x i16> undef, i16 %scalar, i32 2
++  %ret = shufflevector <8 x i16> undef, <8 x i16> %val,
++                       <8 x i32> <i32 10, i32 10, i32 10, i32 10,
++                                  i32 10, i32 10, i32 10, i32 10>
++  ret <8 x i16> %ret
++}
++
++; Test v4i32 splat of the first element.
++define <4 x i32> @f7(i32 %scalar) {
++; CHECK-LABEL: f7:
++; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
++; CHECK: vrepf %v24, [[REG]], 1
++; CHECK: br %r14
++  %val = insertelement <4 x i32> undef, i32 %scalar, i32 0
++  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
++                       <4 x i32> zeroinitializer
++  ret <4 x i32> %ret
++}
++
++; Test v4i32 splat of the last element.
++define <4 x i32> @f8(i32 %scalar) {
++; CHECK-LABEL: f8:
++; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
++; CHECK: vrepf %v24, [[REG]], 1
++; CHECK: br %r14
++  %val = insertelement <4 x i32> undef, i32 %scalar, i32 3
++  %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
++                       <4 x i32> <i32 3, i32 3, i32 3, i32 3>
++  ret <4 x i32> %ret
++}
++
++; Test v4i32 splat of an arbitrary element, using the second operand of
++; the shufflevector.
++define <4 x i32> @f9(i32 %scalar) {
++; CHECK-LABEL: f9:
++; CHECK: vlvgp [[REG:%v[0-9]+]], %r2, %r2
++; CHECK: vrepf %v24, [[REG]], 1
++; CHECK: br %r14
++  %val = insertelement <4 x i32> undef, i32 %scalar, i32 1
++  %ret = shufflevector <4 x i32> undef, <4 x i32> %val,
++                       <4 x i32> <i32 5, i32 5, i32 5, i32 5>
++  ret <4 x i32> %ret
++}
++
++; Test v2i64 splat of the first element.
++define <2 x i64> @f10(i64 %scalar) {
++; CHECK-LABEL: f10:
++; CHECK: vlvgp %v24, %r2, %r2
++; CHECK: br %r14
++  %val = insertelement <2 x i64> undef, i64 %scalar, i32 0
++  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
++                       <2 x i32> zeroinitializer
++  ret <2 x i64> %ret
++}
++
++; Test v2i64 splat of the last element.
++define <2 x i64> @f11(i64 %scalar) {
++; CHECK-LABEL: f11:
++; CHECK: vlvgp %v24, %r2, %r2
++; CHECK: br %r14
++  %val = insertelement <2 x i64> undef, i64 %scalar, i32 1
++  %ret = shufflevector <2 x i64> %val, <2 x i64> undef,
++                       <2 x i32> <i32 1, i32 1>
++  ret <2 x i64> %ret
++}
++
++; Test v4f32 splat of the first element.
++define <4 x float> @f12(float %scalar) {
++; CHECK-LABEL: f12:
++; CHECK: vrepf %v24, %v0, 0
++; CHECK: br %r14
++  %val = insertelement <4 x float> undef, float %scalar, i32 0
++  %ret = shufflevector <4 x float> %val, <4 x float> undef,
++                       <4 x i32> zeroinitializer
++  ret <4 x float> %ret
++}
++
++; Test v4f32 splat of the last element.
++define <4 x float> @f13(float %scalar) {
++; CHECK-LABEL: f13:
++; CHECK: vrepf %v24, %v0, 0
++; CHECK: br %r14
++  %val = insertelement <4 x float> undef, float %scalar, i32 3
++  %ret = shufflevector <4 x float> %val, <4 x float> undef,
++                       <4 x i32> <i32 3, i32 3, i32 3, i32 3>
++  ret <4 x float> %ret
++}
++
++; Test v4f32 splat of an arbitrary element, using the second operand of
++; the shufflevector.
++define <4 x float> @f14(float %scalar) {
++; CHECK-LABEL: f14:
++; CHECK: vrepf %v24, %v0, 0
++; CHECK: br %r14
++  %val = insertelement <4 x float> undef, float %scalar, i32 1
++  %ret = shufflevector <4 x float> undef, <4 x float> %val,
++                       <4 x i32> <i32 5, i32 5, i32 5, i32 5>
++  ret <4 x float> %ret
++}
++
++; Test v2f64 splat of the first element.
++define <2 x double> @f15(double %scalar) {
++; CHECK-LABEL: f15:
++; CHECK: vrepg %v24, %v0, 0
++; CHECK: br %r14
++  %val = insertelement <2 x double> undef, double %scalar, i32 0
++  %ret = shufflevector <2 x double> %val, <2 x double> undef,
++                       <2 x i32> zeroinitializer
++  ret <2 x double> %ret
++}
++
++; Test v2f64 splat of the last element.
++define <2 x double> @f16(double %scalar) {
++; CHECK-LABEL: f16:
++; CHECK: vrepg %v24, %v0, 0
++; CHECK: br %r14
++  %val = insertelement <2 x double> undef, double %scalar, i32 1
++  %ret = shufflevector <2 x double> %val, <2 x double> undef,
++                       <2 x i32> <i32 1, i32 1>
++  ret <2 x double> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-perm-03.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-perm-03.ll
+@@ -0,0 +1,251 @@
++; Test replications of a scalar memory value, represented as splats.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test a v16i8 replicating load with no offset.
++define <16 x i8> @f1(i8 *%ptr) { ++; CHECK-LABEL: f1: ++; CHECK: vlrepb %v24, 0(%r2) ++; CHECK: br %r14 ++ %scalar = load i8 *%ptr ++ %val = insertelement <16 x i8> undef, i8 %scalar, i32 0 ++ %ret = shufflevector <16 x i8> %val, <16 x i8> undef, ++ <16 x i32> zeroinitializer ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 replicating load with the maximum in-range offset. ++define <16 x i8> @f2(i8 *%base) { ++; CHECK-LABEL: f2: ++; CHECK: vlrepb %v24, 4095(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr i8 *%base, i64 4095 ++ %scalar = load i8 *%ptr ++ %val = insertelement <16 x i8> undef, i8 %scalar, i32 0 ++ %ret = shufflevector <16 x i8> %val, <16 x i8> undef, ++ <16 x i32> zeroinitializer ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 replicating load with the first out-of-range offset. ++define <16 x i8> @f3(i8 *%base) { ++; CHECK-LABEL: f3: ++; CHECK: aghi %r2, 4096 ++; CHECK: vlrepb %v24, 0(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr i8 *%base, i64 4096 ++ %scalar = load i8 *%ptr ++ %val = insertelement <16 x i8> undef, i8 %scalar, i32 0 ++ %ret = shufflevector <16 x i8> %val, <16 x i8> undef, ++ <16 x i32> zeroinitializer ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 replicating load with no offset. ++define <8 x i16> @f4(i16 *%ptr) { ++; CHECK-LABEL: f4: ++; CHECK: vlreph %v24, 0(%r2) ++; CHECK: br %r14 ++ %scalar = load i16 *%ptr ++ %val = insertelement <8 x i16> undef, i16 %scalar, i32 0 ++ %ret = shufflevector <8 x i16> %val, <8 x i16> undef, ++ <8 x i32> zeroinitializer ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i16 replicating load with the maximum in-range offset. ++define <8 x i16> @f5(i16 *%base) { ++; CHECK-LABEL: f5: ++; CHECK: vlreph %v24, 4094(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr i16 *%base, i64 2047 ++ %scalar = load i16 *%ptr ++ %val = insertelement <8 x i16> undef, i16 %scalar, i32 0 ++ %ret = shufflevector <8 x i16> %val, <8 x i16> undef, ++ <8 x i32> zeroinitializer ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i16 replicating load with the first out-of-range offset. ++define <8 x i16> @f6(i16 *%base) { ++; CHECK-LABEL: f6: ++; CHECK: aghi %r2, 4096 ++; CHECK: vlreph %v24, 0(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr i16 *%base, i64 2048 ++ %scalar = load i16 *%ptr ++ %val = insertelement <8 x i16> undef, i16 %scalar, i32 0 ++ %ret = shufflevector <8 x i16> %val, <8 x i16> undef, ++ <8 x i32> zeroinitializer ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 replicating load with no offset. ++define <4 x i32> @f7(i32 *%ptr) { ++; CHECK-LABEL: f7: ++; CHECK: vlrepf %v24, 0(%r2) ++; CHECK: br %r14 ++ %scalar = load i32 *%ptr ++ %val = insertelement <4 x i32> undef, i32 %scalar, i32 0 ++ %ret = shufflevector <4 x i32> %val, <4 x i32> undef, ++ <4 x i32> zeroinitializer ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i32 replicating load with the maximum in-range offset. ++define <4 x i32> @f8(i32 *%base) { ++; CHECK-LABEL: f8: ++; CHECK: vlrepf %v24, 4092(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr i32 *%base, i64 1023 ++ %scalar = load i32 *%ptr ++ %val = insertelement <4 x i32> undef, i32 %scalar, i32 0 ++ %ret = shufflevector <4 x i32> %val, <4 x i32> undef, ++ <4 x i32> zeroinitializer ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i32 replicating load with the first out-of-range offset. 
++define <4 x i32> @f9(i32 *%base) { ++; CHECK-LABEL: f9: ++; CHECK: aghi %r2, 4096 ++; CHECK: vlrepf %v24, 0(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr i32 *%base, i64 1024 ++ %scalar = load i32 *%ptr ++ %val = insertelement <4 x i32> undef, i32 %scalar, i32 0 ++ %ret = shufflevector <4 x i32> %val, <4 x i32> undef, ++ <4 x i32> zeroinitializer ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 replicating load with no offset. ++define <2 x i64> @f10(i64 *%ptr) { ++; CHECK-LABEL: f10: ++; CHECK: vlrepg %v24, 0(%r2) ++; CHECK: br %r14 ++ %scalar = load i64 *%ptr ++ %val = insertelement <2 x i64> undef, i64 %scalar, i32 0 ++ %ret = shufflevector <2 x i64> %val, <2 x i64> undef, ++ <2 x i32> zeroinitializer ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i64 replicating load with the maximum in-range offset. ++define <2 x i64> @f11(i64 *%base) { ++; CHECK-LABEL: f11: ++; CHECK: vlrepg %v24, 4088(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr i64 *%base, i32 511 ++ %scalar = load i64 *%ptr ++ %val = insertelement <2 x i64> undef, i64 %scalar, i32 0 ++ %ret = shufflevector <2 x i64> %val, <2 x i64> undef, ++ <2 x i32> zeroinitializer ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i64 replicating load with the first out-of-range offset. ++define <2 x i64> @f12(i64 *%base) { ++; CHECK-LABEL: f12: ++; CHECK: aghi %r2, 4096 ++; CHECK: vlrepg %v24, 0(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr i64 *%base, i32 512 ++ %scalar = load i64 *%ptr ++ %val = insertelement <2 x i64> undef, i64 %scalar, i32 0 ++ %ret = shufflevector <2 x i64> %val, <2 x i64> undef, ++ <2 x i32> zeroinitializer ++ ret <2 x i64> %ret ++} ++ ++; Test a v4f32 replicating load with no offset. ++define <4 x float> @f13(float *%ptr) { ++; CHECK-LABEL: f13: ++; CHECK: vlrepf %v24, 0(%r2) ++; CHECK: br %r14 ++ %scalar = load float *%ptr ++ %val = insertelement <4 x float> undef, float %scalar, i32 0 ++ %ret = shufflevector <4 x float> %val, <4 x float> undef, ++ <4 x i32> zeroinitializer ++ ret <4 x float> %ret ++} ++ ++; Test a v4f32 replicating load with the maximum in-range offset. ++define <4 x float> @f14(float *%base) { ++; CHECK-LABEL: f14: ++; CHECK: vlrepf %v24, 4092(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr float *%base, i64 1023 ++ %scalar = load float *%ptr ++ %val = insertelement <4 x float> undef, float %scalar, i32 0 ++ %ret = shufflevector <4 x float> %val, <4 x float> undef, ++ <4 x i32> zeroinitializer ++ ret <4 x float> %ret ++} ++ ++; Test a v4f32 replicating load with the first out-of-range offset. ++define <4 x float> @f15(float *%base) { ++; CHECK-LABEL: f15: ++; CHECK: aghi %r2, 4096 ++; CHECK: vlrepf %v24, 0(%r2) ++; CHECK: br %r14 ++ %ptr = getelementptr float *%base, i64 1024 ++ %scalar = load float *%ptr ++ %val = insertelement <4 x float> undef, float %scalar, i32 0 ++ %ret = shufflevector <4 x float> %val, <4 x float> undef, ++ <4 x i32> zeroinitializer ++ ret <4 x float> %ret ++} ++ ++; Test a v2f64 replicating load with no offset. ++define <2 x double> @f16(double *%ptr) { ++; CHECK-LABEL: f16: ++; CHECK: vlrepg %v24, 0(%r2) ++; CHECK: br %r14 ++ %scalar = load double *%ptr ++ %val = insertelement <2 x double> undef, double %scalar, i32 0 ++ %ret = shufflevector <2 x double> %val, <2 x double> undef, ++ <2 x i32> zeroinitializer ++ ret <2 x double> %ret ++} ++ ++; Test a v2f64 replicating load with the maximum in-range offset. 
++define <2 x double> @f17(double *%base) {
++; CHECK-LABEL: f17:
++; CHECK: vlrepg %v24, 4088(%r2)
++; CHECK: br %r14
++  %ptr = getelementptr double *%base, i32 511
++  %scalar = load double *%ptr
++  %val = insertelement <2 x double> undef, double %scalar, i32 0
++  %ret = shufflevector <2 x double> %val, <2 x double> undef,
++                       <2 x i32> zeroinitializer
++  ret <2 x double> %ret
++}
++
++; Test a v2f64 replicating load with the first out-of-range offset.
++define <2 x double> @f18(double *%base) {
++; CHECK-LABEL: f18:
++; CHECK: aghi %r2, 4096
++; CHECK: vlrepg %v24, 0(%r2)
++; CHECK: br %r14
++  %ptr = getelementptr double *%base, i32 512
++  %scalar = load double *%ptr
++  %val = insertelement <2 x double> undef, double %scalar, i32 0
++  %ret = shufflevector <2 x double> %val, <2 x double> undef,
++                       <2 x i32> zeroinitializer
++  ret <2 x double> %ret
++}
++
++; Test a v16i8 replicating load with an index.
++define <16 x i8> @f19(i8 *%base, i64 %index) {
++; CHECK-LABEL: f19:
++; CHECK: vlrepb %v24, 1023(%r3,%r2)
++; CHECK: br %r14
++  %ptr1 = getelementptr i8 *%base, i64 %index
++  %ptr = getelementptr i8 *%ptr1, i64 1023
++  %scalar = load i8 *%ptr
++  %val = insertelement <16 x i8> undef, i8 %scalar, i32 0
++  %ret = shufflevector <16 x i8> %val, <16 x i8> undef,
++                       <16 x i32> zeroinitializer
++  ret <16 x i8> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-perm-04.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-perm-04.ll
+@@ -0,0 +1,200 @@
++; Test vector merge high.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test a canonical v16i8 merge high.
++define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f1:
++; CHECK: vmrhb %v24, %v24, %v26
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
++                       <16 x i32> <i32 0, i32 16, i32 1, i32 17,
++                                   i32 2, i32 18, i32 3, i32 19,
++                                   i32 4, i32 20, i32 5, i32 21,
++                                   i32 6, i32 22, i32 7, i32 23>
++  ret <16 x i8> %ret
++}
++
++; Test a reversed v16i8 merge high.
++define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f2:
++; CHECK: vmrhb %v24, %v26, %v24
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
++                       <16 x i32> <i32 16, i32 0, i32 17, i32 1,
++                                   i32 18, i32 2, i32 19, i32 3,
++                                   i32 20, i32 4, i32 21, i32 5,
++                                   i32 22, i32 6, i32 23, i32 7>
++  ret <16 x i8> %ret
++}
++
++; Test a v16i8 merge high with only the first operand being used.
++define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f3:
++; CHECK: vmrhb %v24, %v24, %v24
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
++                       <16 x i32> <i32 0, i32 0, i32 1, i32 1,
++                                   i32 2, i32 2, i32 3, i32 3,
++                                   i32 4, i32 4, i32 5, i32 5,
++                                   i32 6, i32 6, i32 7, i32 7>
++  ret <16 x i8> %ret
++}
++
++; Test a v16i8 merge high with only the second operand being used.
++; This is converted into @f3 by target-independent code.
++define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f4:
++; CHECK: vmrhb %v24, %v26, %v26
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
++                       <16 x i32> <i32 16, i32 16, i32 17, i32 17,
++                                   i32 18, i32 18, i32 19, i32 19,
++                                   i32 20, i32 20, i32 21, i32 21,
++                                   i32 22, i32 22, i32 23, i32 23>
++  ret <16 x i8> %ret
++}
++
++; Test a v16i8 merge with both operands being the same. This too is
++; converted into @f3 by target-independent code.
++define <16 x i8> @f5(<16 x i8> %val) {
++; CHECK-LABEL: f5:
++; CHECK: vmrhb %v24, %v24, %v24
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val, <16 x i8> %val,
++                       <16 x i32> <i32 0, i32 16, i32 1, i32 17,
++                                   i32 2, i32 18, i32 3, i32 19,
++                                   i32 4, i32 20, i32 5, i32 21,
++                                   i32 6, i32 22, i32 7, i32 23>
++  ret <16 x i8> %ret
++}
++
++; Test a v16i8 merge in which some of the indices are don't care.
++define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f6:
++; CHECK: vmrhb %v24, %v24, %v26
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
++                       <16 x i32> <i32 undef, i32 16, i32 1, i32 undef,
++                                   i32 2, i32 18, i32 undef, i32 19,
++                                   i32 4, i32 undef, i32 5, i32 21,
++                                   i32 undef, i32 22, i32 7, i32 undef>
++  ret <16 x i8> %ret
++}
++
++; Test a v16i8 merge in which one of the operands is undefined and where
++; indices for that operand are "don't care". Target-independent code
++; converts the indices themselves into "undef"s.
++define <16 x i8> @f7(<16 x i8> %val) {
++; CHECK-LABEL: f7:
++; CHECK: vmrhb %v24, %v24, %v24
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> undef, <16 x i8> %val,
++                       <16 x i32> <i32 0, i32 16, i32 1, i32 17,
++                                   i32 2, i32 18, i32 3, i32 19,
++                                   i32 4, i32 20, i32 5, i32 21,
++                                   i32 6, i32 22, i32 7, i32 23>
++  ret <16 x i8> %ret
++}
++
++; Test a canonical v8i16 merge high.
++define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) {
++; CHECK-LABEL: f8:
++; CHECK: vmrhh %v24, %v24, %v26
++; CHECK: br %r14
++  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
++                       <8 x i32> <i32 0, i32 8, i32 1, i32 9,
++                                  i32 2, i32 10, i32 3, i32 11>
++  ret <8 x i16> %ret
++}
++
++; Test a reversed v8i16 merge high.
++define <8 x i16> @f9(<8 x i16> %val1, <8 x i16> %val2) {
++; CHECK-LABEL: f9:
++; CHECK: vmrhh %v24, %v26, %v24
++; CHECK: br %r14
++  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
++                       <8 x i32> <i32 8, i32 0, i32 9, i32 1,
++                                  i32 10, i32 2, i32 11, i32 3>
++  ret <8 x i16> %ret
++}
++
++; Test a canonical v4i32 merge high.
++define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) {
++; CHECK-LABEL: f10:
++; CHECK: vmrhf %v24, %v24, %v26
++; CHECK: br %r14
++  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
++                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
++  ret <4 x i32> %ret
++}
++
++; Test a reversed v4i32 merge high.
++define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2) {
++; CHECK-LABEL: f11:
++; CHECK: vmrhf %v24, %v26, %v24
++; CHECK: br %r14
++  %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2,
++                       <4 x i32> <i32 4, i32 0, i32 5, i32 1>
++  ret <4 x i32> %ret
++}
++
++; Test a canonical v2i64 merge high.
++define <2 x i64> @f12(<2 x i64> %val1, <2 x i64> %val2) {
++; CHECK-LABEL: f12:
++; CHECK: vmrhg %v24, %v24, %v26
++; CHECK: br %r14
++  %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2,
++                       <2 x i32> <i32 0, i32 2>
++  ret <2 x i64> %ret
++}
++
++; Test a reversed v2i64 merge high.
++define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2) {
++; CHECK-LABEL: f13:
++; CHECK: vmrhg %v24, %v26, %v24
++; CHECK: br %r14
++  %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2,
++                       <2 x i32> <i32 2, i32 0>
++  ret <2 x i64> %ret
++}
++
++; Test a canonical v4f32 merge high.
++define <4 x float> @f14(<4 x float> %val1, <4 x float> %val2) {
++; CHECK-LABEL: f14:
++; CHECK: vmrhf %v24, %v24, %v26
++; CHECK: br %r14
++  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
++                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
++  ret <4 x float> %ret
++}
++
++; Test a reversed v4f32 merge high.
++define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2) {
++; CHECK-LABEL: f15:
++; CHECK: vmrhf %v24, %v26, %v24
++; CHECK: br %r14
++  %ret = shufflevector <4 x float> %val1, <4 x float> %val2,
++                       <4 x i32> <i32 4, i32 0, i32 5, i32 1>
++  ret <4 x float> %ret
++}
++
++; Test a canonical v2f64 merge high.
++define <2 x double> @f16(<2 x double> %val1, <2 x double> %val2) {
++; CHECK-LABEL: f16:
++; CHECK: vmrhg %v24, %v24, %v26
++; CHECK: br %r14
++  %ret = shufflevector <2 x double> %val1, <2 x double> %val2,
++                       <2 x i32> <i32 0, i32 2>
++  ret <2 x double> %ret
++}
++
++; Test a reversed v2f64 merge high.
++define <2 x double> @f17(<2 x double> %val1, <2 x double> %val2) {
++; CHECK-LABEL: f17:
++; CHECK: vmrhg %v24, %v26, %v24
++; CHECK: br %r14
++  %ret = shufflevector <2 x double> %val1, <2 x double> %val2,
++                       <2 x i32> <i32 2, i32 0>
++  ret <2 x double> %ret
++}
+Index: llvm-36/test/CodeGen/SystemZ/vec-perm-05.ll
+===================================================================
+--- /dev/null
++++ llvm-36/test/CodeGen/SystemZ/vec-perm-05.ll
+@@ -0,0 +1,200 @@
++; Test vector merge low.
++;
++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
++
++; Test a canonical v16i8 merge low.
++define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f1:
++; CHECK: vmrlb %v24, %v24, %v26
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
++                       <16 x i32> <i32 8, i32 24, i32 9, i32 25,
++                                   i32 10, i32 26, i32 11, i32 27,
++                                   i32 12, i32 28, i32 13, i32 29,
++                                   i32 14, i32 30, i32 15, i32 31>
++  ret <16 x i8> %ret
++}
++
++; Test a reversed v16i8 merge low.
++define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f2:
++; CHECK: vmrlb %v24, %v26, %v24
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
++                       <16 x i32> <i32 24, i32 8, i32 25, i32 9,
++                                   i32 26, i32 10, i32 27, i32 11,
++                                   i32 28, i32 12, i32 29, i32 13,
++                                   i32 30, i32 14, i32 31, i32 15>
++  ret <16 x i8> %ret
++}
++
++; Test a v16i8 merge low with only the first operand being used.
++define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f3:
++; CHECK: vmrlb %v24, %v24, %v24
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
++                       <16 x i32> <i32 8, i32 8, i32 9, i32 9,
++                                   i32 10, i32 10, i32 11, i32 11,
++                                   i32 12, i32 12, i32 13, i32 13,
++                                   i32 14, i32 14, i32 15, i32 15>
++  ret <16 x i8> %ret
++}
++
++; Test a v16i8 merge low with only the second operand being used.
++; This is converted into @f3 by target-independent code.
++define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f4:
++; CHECK: vmrlb %v24, %v26, %v26
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
++                       <16 x i32> <i32 24, i32 24, i32 25, i32 25,
++                                   i32 26, i32 26, i32 27, i32 27,
++                                   i32 28, i32 28, i32 29, i32 29,
++                                   i32 30, i32 30, i32 31, i32 31>
++  ret <16 x i8> %ret
++}
++
++; Test a v16i8 merge with both operands being the same. This too is
++; converted into @f3 by target-independent code.
++define <16 x i8> @f5(<16 x i8> %val) {
++; CHECK-LABEL: f5:
++; CHECK: vmrlb %v24, %v24, %v24
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val, <16 x i8> %val,
++                       <16 x i32> <i32 8, i32 24, i32 9, i32 25,
++                                   i32 10, i32 26, i32 11, i32 27,
++                                   i32 12, i32 28, i32 13, i32 29,
++                                   i32 14, i32 30, i32 15, i32 31>
++  ret <16 x i8> %ret
++}
++
++; Test a v16i8 merge in which some of the indices are don't care.
++define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) {
++; CHECK-LABEL: f6:
++; CHECK: vmrlb %v24, %v24, %v26
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
++                       <16 x i32> <i32 undef, i32 24, i32 9, i32 undef,
++                                   i32 10, i32 26, i32 undef, i32 27,
++                                   i32 12, i32 undef, i32 13, i32 29,
++                                   i32 undef, i32 30, i32 15, i32 undef>
++  ret <16 x i8> %ret
++}
++
++; Test a v16i8 merge in which one of the operands is undefined and where
++; indices for that operand are "don't care". Target-independent code
++; converts the indices themselves into "undef"s.
++define <16 x i8> @f7(<16 x i8> %val) {
++; CHECK-LABEL: f7:
++; CHECK: vmrlb %v24, %v24, %v24
++; CHECK: br %r14
++  %ret = shufflevector <16 x i8> undef, <16 x i8> %val,
++                       <16 x i32> <i32 8, i32 24, i32 9, i32 25,
++                                   i32 10, i32 26, i32 11, i32 27,
++                                   i32 12, i32 28, i32 13, i32 29,
++                                   i32 14, i32 30, i32 15, i32 31>
++  ret <16 x i8> %ret
++}
++
++; Test a canonical v8i16 merge low.
++define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) {
++; CHECK-LABEL: f8:
++; CHECK: vmrlh %v24, %v24, %v26
++; CHECK: br %r14
++  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
++                       <8 x i32> <i32 4, i32 12, i32 5, i32 13,
++                                  i32 6, i32 14, i32 7, i32 15>
++  ret <8 x i16> %ret
++}
++
++; Test a reversed v8i16 merge low.
++define <8 x i16> @f9(<8 x i16> %val1, <8 x i16> %val2) {
++; CHECK-LABEL: f9:
++; CHECK: vmrlh %v24, %v26, %v24
++; CHECK: br %r14
++  %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2,
++                       <8 x i32> <i32 12, i32 4, i32 13, i32 5,
++                                  i32 14, i32 6, i32 15, i32 7>
++  ret <8 x i16> %ret
++}
++
++; Test a canonical v4i32 merge low.
++define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f10: ++; CHECK: vmrlf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a reversed v4i32 merge low. ++define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f11: ++; CHECK: vmrlf %v24, %v26, %v24 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a canonical v2i64 merge low. ++define <2 x i64> @f12(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f12: ++; CHECK: vmrlg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, ++ <2 x i32> ++ ret <2 x i64> %ret ++} ++ ++; Test a reversed v2i64 merge low. ++define <2 x i64> @f13(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f13: ++; CHECK: vmrlg %v24, %v26, %v24 ++; CHECK: br %r14 ++ %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, ++ <2 x i32> ++ ret <2 x i64> %ret ++} ++ ++; Test a canonical v4f32 merge low. ++define <4 x float> @f14(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f14: ++; CHECK: vmrlf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x float> %val1, <4 x float> %val2, ++ <4 x i32> ++ ret <4 x float> %ret ++} ++ ++; Test a reversed v4f32 merge low. ++define <4 x float> @f15(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f15: ++; CHECK: vmrlf %v24, %v26, %v24 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x float> %val1, <4 x float> %val2, ++ <4 x i32> ++ ret <4 x float> %ret ++} ++ ++; Test a canonical v2f64 merge low. ++define <2 x double> @f16(<2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f16: ++; CHECK: vmrlg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %ret = shufflevector <2 x double> %val1, <2 x double> %val2, ++ <2 x i32> ++ ret <2 x double> %ret ++} ++ ++; Test a reversed v2f64 merge low. ++define <2 x double> @f17(<2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f17: ++; CHECK: vmrlg %v24, %v26, %v24 ++; CHECK: br %r14 ++ %ret = shufflevector <2 x double> %val1, <2 x double> %val2, ++ <2 x i32> ++ ret <2 x double> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-perm-06.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-perm-06.ll +@@ -0,0 +1,160 @@ ++; Test vector pack. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a canonical v16i8 pack. ++define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vpkh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a reversed v16i8 pack. ++define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vpkh %v24, %v26, %v24 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 pack with only the first operand being used. ++define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vpkh %v24, %v24, %v24 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 pack with only the second operand being used. ++; This is converted into @f3 by target-independent code. 
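For the doubleword variants the merge mask degenerates to two indices. A sketch of the canonical v2i64 merge low (illustrative name):

define <2 x i64> @merge_low_g_sketch(<2 x i64> %val1, <2 x i64> %val2) {
  ; <i32 1, i32 3> takes the low doubleword of each operand: vmrlg
  %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, <2 x i32> <i32 1, i32 3>
  ret <2 x i64> %ret
}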
++define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vpkh %v24, %v26, %v26 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 pack with both operands being the same. This too is ++; converted into @f3 by target-independent code. ++define <16 x i8> @f5(<16 x i8> %val) { ++; CHECK-LABEL: f5: ++; CHECK: vpkh %v24, %v24, %v24 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val, <16 x i8> %val, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 pack in which some of the indices are don't care. ++define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vpkh %v24, %v24, %v26 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 pack in which one of the operands is undefined and where ++; indices for that operand are "don't care". Target-independent code ++; converts the indices themselves into "undef"s. ++define <16 x i8> @f7(<16 x i8> %val) { ++; CHECK-LABEL: f7: ++; CHECK: vpkh %v24, %v24, %v24 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> undef, <16 x i8> %val, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a canonical v8i16 pack. ++define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vpkf %v24, %v24, %v26 ++; CHECK: br %r14 ++ %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, ++ <8 x i32> ++ ret <8 x i16> %ret ++} ++ ++; Test a reversed v8i16 pack. ++define <8 x i16> @f9(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f9: ++; CHECK: vpkf %v24, %v26, %v24 ++; CHECK: br %r14 ++ %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, ++ <8 x i32> ++ ret <8 x i16> %ret ++} ++ ++; Test a canonical v4i32 pack. ++define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f10: ++; CHECK: vpkg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a reversed v4i32 pack. ++define <4 x i32> @f11(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f11: ++; CHECK: vpkg %v24, %v26, %v24 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a canonical v4f32 pack. ++define <4 x float> @f12(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f12: ++; CHECK: vpkg %v24, %v24, %v26 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x float> %val1, <4 x float> %val2, ++ <4 x i32> ++ ret <4 x float> %ret ++} ++ ++; Test a reversed v4f32 pack. ++define <4 x float> @f13(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f13: ++; CHECK: vpkg %v24, %v26, %v24 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x float> %val1, <4 x float> %val2, ++ <4 x i32> ++ ret <4 x float> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-perm-07.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-perm-07.ll +@@ -0,0 +1,145 @@ ++; Test vector shift left double immediate. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8 shift with the lowest useful shift amount. 
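The pack tests keep the low-order byte of each halfword element, which on big-endian SystemZ means the odd-numbered bytes of both operands. A sketch of the canonical v16i8 pack (illustrative name):

define <16 x i8> @pack_sketch(<16 x i8> %val1, <16 x i8> %val2) {
  ; Odd bytes of %val1 followed by odd bytes of %val2: vpkh
  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
         <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
                     i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  ret <16 x i8> %ret
}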
++define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vsldb %v24, %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 shift with the highest shift amount. ++define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vsldb %v24, %v24, %v26, 15 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 shift in which the operands need to be reversed. ++define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vsldb %v24, %v26, %v24, 4 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 shift in which the operands need to be duplicated. ++define <16 x i8> @f4(<16 x i8> %val) { ++; CHECK-LABEL: f4: ++; CHECK: vsldb %v24, %v24, %v24, 7 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val, <16 x i8> undef, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 shift in which some of the indices are undefs. ++define <16 x i8> @f5(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f5: ++; CHECK: vsldb %v24, %v24, %v26, 11 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; ...and again with reversed operands. ++define <16 x i8> @f6(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vsldb %v24, %v26, %v24, 13 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 shift with the lowest useful shift amount. ++define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vsldb %v24, %v24, %v26, 2 ++; CHECK: br %r14 ++ %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, ++ <8 x i32> ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i16 shift with the highest useful shift amount. ++define <8 x i16> @f8(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vsldb %v24, %v24, %v26, 14 ++; CHECK: br %r14 ++ %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, ++ <8 x i32> ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 shift with the lowest useful shift amount. ++define <4 x i32> @f9(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f9: ++; CHECK: vsldb %v24, %v24, %v26, 4 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i32 shift with the highest useful shift amount. ++define <4 x i32> @f10(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f10: ++; CHECK: vsldb %v24, %v24, %v26, 12 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v4f32 shift with the lowest useful shift amount. ++define <4 x float> @f12(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f12: ++; CHECK: vsldb %v24, %v24, %v26, 4 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x float> %val1, <4 x float> %val2, ++ <4 x i32> ++ ret <4 x float> %ret ++} ++ ++; Test a v4f32 shift with the highest useful shift amount. ++define <4 x float> @f13(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f13: ++; CHECK: vsldb %v24, %v24, %v26, 12 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x float> %val1, <4 x float> %val2, ++ <4 x i32> ++ ret <4 x float> %ret ++} ++ ++; We use VPDI for v2i64 shuffles. 
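A shift left double by N bytes corresponds to a mask whose indices run N..N+15, that is, bytes N-15 of the first operand followed by bytes 0..N-1 of the second. A sketch for N = 1 (illustrative name):

define <16 x i8> @vsldb_sketch(<16 x i8> %val1, <16 x i8> %val2) {
  ; expected: vsldb %v24, %v24, %v26, 1
  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
         <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8,
                     i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  ret <16 x i8> %ret
}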
+Index: llvm-36/test/CodeGen/SystemZ/vec-perm-08.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-perm-08.ll +@@ -0,0 +1,170 @@ ++; Test vector permutes using VPDI. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a high1/low2 permute for v16i8. ++define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vpdi %v24, %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a low2/high1 permute for v16i8. ++define <16 x i8> @f2(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vpdi %v24, %v26, %v24, 4 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a low1/high2 permute for v16i8. ++define <16 x i8> @f3(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vpdi %v24, %v24, %v26, 4 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a high2/low1 permute for v16i8. ++define <16 x i8> @f4(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vpdi %v24, %v26, %v24, 1 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test reversing two doublewords in a v16i8. ++define <16 x i8> @f5(<16 x i8> %val) { ++; CHECK-LABEL: f5: ++; CHECK: vpdi %v24, %v24, %v24, 4 ++; CHECK: br %r14 ++ %ret = shufflevector <16 x i8> %val, <16 x i8> undef, ++ <16 x i32> ++ ret <16 x i8> %ret ++} ++ ++; Test a high1/low2 permute for v8i16. ++define <8 x i16> @f6(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vpdi %v24, %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, ++ <8 x i32> ++ ret <8 x i16> %ret ++} ++ ++; Test a low2/high1 permute for v8i16. ++define <8 x i16> @f7(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: vpdi %v24, %v26, %v24, 4 ++; CHECK: br %r14 ++ %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, ++ <8 x i32> ++ ret <8 x i16> %ret ++} ++ ++; Test a high1/low2 permute for v4i32. ++define <4 x i32> @f8(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vpdi %v24, %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a low2/high1 permute for v4i32. ++define <4 x i32> @f9(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f9: ++; CHECK: vpdi %v24, %v26, %v24, 4 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a high1/low2 permute for v2i64. ++define <2 x i64> @f10(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f10: ++; CHECK: vpdi %v24, %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, ++ <2 x i32> ++ ret <2 x i64> %ret ++} ++ ++; Test low2/high1 permute for v2i64. ++define <2 x i64> @f11(<2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f11: ++; CHECK: vpdi %v24, %v26, %v24, 4 ++; CHECK: br %r14 ++ %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, ++ <2 x i32> ++ ret <2 x i64> %ret ++} ++ ++; Test a high1/low2 permute for v4f32. 
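VPDI selects one doubleword from each operand; per the z/Architecture definition, bit 2 of the immediate picks the doubleword of the first operand and bit 0 that of the second. A sketch of the high1/low2 v2i64 case (illustrative name):

define <2 x i64> @vpdi_sketch(<2 x i64> %val1, <2 x i64> %val2) {
  ; high doubleword of %val1, low doubleword of %val2: vpdi ..., 1
  %ret = shufflevector <2 x i64> %val1, <2 x i64> %val2, <2 x i32> <i32 0, i32 3>
  ret <2 x i64> %ret
}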
++define <4 x float> @f12(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f12: ++; CHECK: vpdi %v24, %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x float> %val1, <4 x float> %val2, ++ <4 x i32> ++ ret <4 x float> %ret ++} ++ ++; Test a low2/high1 permute for v4f32. ++define <4 x float> @f13(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f13: ++; CHECK: vpdi %v24, %v26, %v24, 4 ++; CHECK: br %r14 ++ %ret = shufflevector <4 x float> %val1, <4 x float> %val2, ++ <4 x i32> ++ ret <4 x float> %ret ++} ++ ++; Test a high1/low2 permute for v2f64. ++define <2 x double> @f14(<2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f14: ++; CHECK: vpdi %v24, %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = shufflevector <2 x double> %val1, <2 x double> %val2, ++ <2 x i32> ++ ret <2 x double> %ret ++} ++ ++; Test a low2/high1 permute for v2f64. ++define <2 x double> @f15(<2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f15: ++; CHECK: vpdi %v24, %v26, %v24, 4 ++; CHECK: br %r14 ++ %ret = shufflevector <2 x double> %val1, <2 x double> %val2, ++ <2 x i32> ++ ret <2 x double> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-perm-09.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-perm-09.ll +@@ -0,0 +1,38 @@ ++; Test general vector permute of a v16i8. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ ++; RUN: FileCheck -check-prefix=CHECK-CODE %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ ++; RUN: FileCheck -check-prefix=CHECK-VECTOR %s ++ ++define <16 x i8> @f1(<16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-CODE-LABEL: f1: ++; CHECK-CODE: larl [[REG:%r[0-5]]], ++; CHECK-CODE: vl [[MASK:%v[0-9]+]], 0([[REG]]) ++; CHECK-CODE: vperm %v24, %v24, %v26, [[MASK]] ++; CHECK-CODE: br %r14 ++; ++; CHECK-VECTOR: .byte 1 ++; CHECK-VECTOR-NEXT: .byte 19 ++; CHECK-VECTOR-NEXT: .byte 6 ++; CHECK-VECTOR-NEXT: .byte 5 ++; CHECK-VECTOR-NEXT: .byte 20 ++; CHECK-VECTOR-NEXT: .byte 22 ++; CHECK-VECTOR-NEXT: .byte 1 ++; CHECK-VECTOR-NEXT: .byte 1 ++; CHECK-VECTOR-NEXT: .byte 25 ++; CHECK-VECTOR-NEXT: .byte 29 ++; CHECK-VECTOR-NEXT: .byte 11 ++; Any byte would be OK here ++; CHECK-VECTOR-NEXT: .space 1 ++; CHECK-VECTOR-NEXT: .byte 31 ++; CHECK-VECTOR-NEXT: .byte 4 ++; CHECK-VECTOR-NEXT: .byte 15 ++; CHECK-VECTOR-NEXT: .byte 19 ++ %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2, ++ <16 x i32> ++ ret <16 x i8> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-perm-10.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-perm-10.ll +@@ -0,0 +1,36 @@ ++; Test general vector permute of a v8i16. 
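When a v16i8 mask fits none of the special patterns, llc materializes the mask in the constant pool and emits VPERM. For byte elements the .byte values are the shuffle indices themselves; for wider elements each index n expands to the run of bytes starting at n times the element size (hence the byte pairs in the v8i16 test that follows). A sketch whose mask is consistent with the CHECK-VECTOR bytes of the v16i8 test above (the undef index is an assumption for the .space slot):

define <16 x i8> @vperm_sketch(<16 x i8> %val1, <16 x i8> %val2) {
  %ret = shufflevector <16 x i8> %val1, <16 x i8> %val2,
         <16 x i32> <i32 1, i32 19, i32 6, i32 5, i32 20, i32 22, i32 1, i32 1,
                     i32 25, i32 29, i32 11, i32 undef, i32 31, i32 4, i32 15, i32 19>
  ret <16 x i8> %ret
}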
++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ ++; RUN: FileCheck -check-prefix=CHECK-CODE %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ ++; RUN: FileCheck -check-prefix=CHECK-VECTOR %s ++ ++define <8 x i16> @f1(<8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-CODE-LABEL: f1: ++; CHECK-CODE: larl [[REG:%r[0-5]]], ++; CHECK-CODE: vl [[MASK:%v[0-9]+]], 0([[REG]]) ++; CHECK-CODE: vperm %v24, %v26, %v24, [[MASK]] ++; CHECK-CODE: br %r14 ++; ++; CHECK-VECTOR: .byte 0 ++; CHECK-VECTOR-NEXT: .byte 1 ++; CHECK-VECTOR-NEXT: .byte 26 ++; CHECK-VECTOR-NEXT: .byte 27 ++; Any 2 bytes would be OK here ++; CHECK-VECTOR-NEXT: .space 1 ++; CHECK-VECTOR-NEXT: .space 1 ++; CHECK-VECTOR-NEXT: .byte 28 ++; CHECK-VECTOR-NEXT: .byte 29 ++; CHECK-VECTOR-NEXT: .byte 6 ++; CHECK-VECTOR-NEXT: .byte 7 ++; CHECK-VECTOR-NEXT: .byte 14 ++; CHECK-VECTOR-NEXT: .byte 15 ++; CHECK-VECTOR-NEXT: .byte 8 ++; CHECK-VECTOR-NEXT: .byte 9 ++; CHECK-VECTOR-NEXT: .byte 16 ++; CHECK-VECTOR-NEXT: .byte 17 ++ %ret = shufflevector <8 x i16> %val1, <8 x i16> %val2, ++ <8 x i32> ++ ret <8 x i16> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-perm-11.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-perm-11.ll +@@ -0,0 +1,35 @@ ++; Test general vector permute of a v4i32. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ ++; RUN: FileCheck -check-prefix=CHECK-CODE %s ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | \ ++; RUN: FileCheck -check-prefix=CHECK-VECTOR %s ++ ++define <4 x i32> @f1(<4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-CODE-LABEL: f1: ++; CHECK-CODE: larl [[REG:%r[0-5]]], ++; CHECK-CODE: vl [[MASK:%v[0-9]+]], 0([[REG]]) ++; CHECK-CODE: vperm %v24, %v26, %v24, [[MASK]] ++; CHECK-CODE: br %r14 ++; ++; CHECK-VECTOR: .byte 4 ++; CHECK-VECTOR-NEXT: .byte 5 ++; CHECK-VECTOR-NEXT: .byte 6 ++; CHECK-VECTOR-NEXT: .byte 7 ++; CHECK-VECTOR-NEXT: .byte 20 ++; CHECK-VECTOR-NEXT: .byte 21 ++; CHECK-VECTOR-NEXT: .byte 22 ++; CHECK-VECTOR-NEXT: .byte 23 ++; Any 4 bytes would be OK here ++; CHECK-VECTOR-NEXT: .space 1 ++; CHECK-VECTOR-NEXT: .space 1 ++; CHECK-VECTOR-NEXT: .space 1 ++; CHECK-VECTOR-NEXT: .space 1 ++; CHECK-VECTOR-NEXT: .byte 12 ++; CHECK-VECTOR-NEXT: .byte 13 ++; CHECK-VECTOR-NEXT: .byte 14 ++; CHECK-VECTOR-NEXT: .byte 15 ++ %ret = shufflevector <4 x i32> %val1, <4 x i32> %val2, ++ <4 x i32> ++ ret <4 x i32> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-round-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-round-01.ll +@@ -0,0 +1,118 @@ ++; Test v2f64 rounding. 
++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++declare double @llvm.rint.f64(double) ++declare double @llvm.nearbyint.f64(double) ++declare double @llvm.floor.f64(double) ++declare double @llvm.ceil.f64(double) ++declare double @llvm.trunc.f64(double) ++declare double @llvm.round.f64(double) ++declare <2 x double> @llvm.rint.v2f64(<2 x double>) ++declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) ++declare <2 x double> @llvm.floor.v2f64(<2 x double>) ++declare <2 x double> @llvm.ceil.v2f64(<2 x double>) ++declare <2 x double> @llvm.trunc.v2f64(<2 x double>) ++declare <2 x double> @llvm.round.v2f64(<2 x double>) ++ ++define <2 x double> @f1(<2 x double> %val) { ++; CHECK-LABEL: f1: ++; CHECK: vfidb %v24, %v24, 0, 0 ++; CHECK: br %r14 ++ %res = call <2 x double> @llvm.rint.v2f64(<2 x double> %val) ++ ret <2 x double> %res ++} ++ ++define <2 x double> @f2(<2 x double> %val) { ++; CHECK-LABEL: f2: ++; CHECK: vfidb %v24, %v24, 4, 0 ++; CHECK: br %r14 ++ %res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %val) ++ ret <2 x double> %res ++} ++ ++define <2 x double> @f3(<2 x double> %val) { ++; CHECK-LABEL: f3: ++; CHECK: vfidb %v24, %v24, 4, 7 ++; CHECK: br %r14 ++ %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %val) ++ ret <2 x double> %res ++} ++ ++define <2 x double> @f4(<2 x double> %val) { ++; CHECK-LABEL: f4: ++; CHECK: vfidb %v24, %v24, 4, 6 ++; CHECK: br %r14 ++ %res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %val) ++ ret <2 x double> %res ++} ++ ++define <2 x double> @f5(<2 x double> %val) { ++; CHECK-LABEL: f5: ++; CHECK: vfidb %v24, %v24, 4, 5 ++; CHECK: br %r14 ++ %res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %val) ++ ret <2 x double> %res ++} ++ ++define <2 x double> @f6(<2 x double> %val) { ++; CHECK-LABEL: f6: ++; CHECK: vfidb %v24, %v24, 4, 1 ++; CHECK: br %r14 ++ %res = call <2 x double> @llvm.round.v2f64(<2 x double> %val) ++ ret <2 x double> %res ++} ++ ++define double @f7(<2 x double> %val) { ++; CHECK-LABEL: f7: ++; CHECK: wfidb %f0, %v24, 0, 0 ++; CHECK: br %r14 ++ %scalar = extractelement <2 x double> %val, i32 0 ++ %res = call double @llvm.rint.f64(double %scalar) ++ ret double %res ++} ++ ++define double @f8(<2 x double> %val) { ++; CHECK-LABEL: f8: ++; CHECK: wfidb %f0, %v24, 4, 0 ++; CHECK: br %r14 ++ %scalar = extractelement <2 x double> %val, i32 0 ++ %res = call double @llvm.nearbyint.f64(double %scalar) ++ ret double %res ++} ++ ++define double @f9(<2 x double> %val) { ++; CHECK-LABEL: f9: ++; CHECK: wfidb %f0, %v24, 4, 7 ++; CHECK: br %r14 ++ %scalar = extractelement <2 x double> %val, i32 0 ++ %res = call double @llvm.floor.f64(double %scalar) ++ ret double %res ++} ++ ++define double @f10(<2 x double> %val) { ++; CHECK-LABEL: f10: ++; CHECK: wfidb %f0, %v24, 4, 6 ++; CHECK: br %r14 ++ %scalar = extractelement <2 x double> %val, i32 0 ++ %res = call double @llvm.ceil.f64(double %scalar) ++ ret double %res ++} ++ ++define double @f11(<2 x double> %val) { ++; CHECK-LABEL: f11: ++; CHECK: wfidb %f0, %v24, 4, 5 ++; CHECK: br %r14 ++ %scalar = extractelement <2 x double> %val, i32 0 ++ %res = call double @llvm.trunc.f64(double %scalar) ++ ret double %res ++} ++ ++define double @f12(<2 x double> %val) { ++; CHECK-LABEL: f12: ++; CHECK: wfidb %f0, %v24, 4, 1 ++; CHECK: br %r14 ++ %scalar = extractelement <2 x double> %val, i32 0 ++ %res = call double @llvm.round.f64(double %scalar) ++ ret double %res ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-shift-01.ll 
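Taken together, the vec-round-01.ll checks map the two VFIDB/WFIDB immediates as follows: the first is 4 (suppress inexact exceptions) for everything except rint, and the second selects the rounding mode: 0 = current FPC mode, 1 = to nearest with ties away from zero, 5 = toward zero, 6 = toward +infinity, 7 = toward -infinity. A self-contained sketch of one case (illustrative name, same RUN line assumed):

declare <2 x double> @llvm.floor.v2f64(<2 x double>)

define <2 x double> @floor_sketch(<2 x double> %val) {
  ; expected: vfidb %v24, %v24, 4, 7 (suppress inexact, round toward -infinity)
  %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %val)
  ret <2 x double> %res
}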
+=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-shift-01.ll +@@ -0,0 +1,39 @@ ++; Test vector shift left with vector shift amount. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8 shift. ++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: veslvb %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = shl <16 x i8> %val1, %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 shift. ++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: veslvh %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = shl <8 x i16> %val1, %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 shift. ++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: veslvf %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = shl <4 x i32> %val1, %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 shift. ++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: veslvg %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = shl <2 x i64> %val1, %val2 ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-shift-02.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-shift-02.ll +@@ -0,0 +1,39 @@ ++; Test vector arithmetic shift right with vector shift amount. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8 shift. ++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vesravb %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = ashr <16 x i8> %val1, %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 shift. ++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vesravh %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = ashr <8 x i16> %val1, %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 shift. ++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vesravf %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = ashr <4 x i32> %val1, %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 shift. ++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vesravg %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = ashr <2 x i64> %val1, %val2 ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-shift-03.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-shift-03.ll +@@ -0,0 +1,39 @@ ++; Test vector logical shift right with vector shift amount. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8 shift. ++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vesrlvb %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = lshr <16 x i8> %val1, %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 shift. ++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vesrlvh %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = lshr <8 x i16> %val1, %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 shift. 
++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vesrlvf %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = lshr <4 x i32> %val1, %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 shift. ++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vesrlvg %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = lshr <2 x i64> %val1, %val2 ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-shift-04.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-shift-04.ll +@@ -0,0 +1,134 @@ ++; Test vector shift left with scalar shift amount. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8 shift by a variable. ++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, i32 %shift) { ++; CHECK-LABEL: f1: ++; CHECK: veslb %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %truncshift = trunc i32 %shift to i8 ++ %shiftvec = insertelement <16 x i8> undef, i8 %truncshift, i32 0 ++ %val2 = shufflevector <16 x i8> %shiftvec, <16 x i8> undef, ++ <16 x i32> zeroinitializer ++ %ret = shl <16 x i8> %val1, %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 shift by the lowest useful constant. ++define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val) { ++; CHECK-LABEL: f2: ++; CHECK: veslb %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = shl <16 x i8> %val, ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 shift by the highest useful constant. ++define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val) { ++; CHECK-LABEL: f3: ++; CHECK: veslb %v24, %v26, 7 ++; CHECK: br %r14 ++ %ret = shl <16 x i8> %val, ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 shift by a variable. ++define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, i32 %shift) { ++; CHECK-LABEL: f4: ++; CHECK: veslh %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %truncshift = trunc i32 %shift to i16 ++ %shiftvec = insertelement <8 x i16> undef, i16 %truncshift, i32 0 ++ %val2 = shufflevector <8 x i16> %shiftvec, <8 x i16> undef, ++ <8 x i32> zeroinitializer ++ %ret = shl <8 x i16> %val1, %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i16 shift by the lowest useful constant. ++define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val) { ++; CHECK-LABEL: f5: ++; CHECK: veslh %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = shl <8 x i16> %val, ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i16 shift by the highest useful constant. ++define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val) { ++; CHECK-LABEL: f6: ++; CHECK: veslh %v24, %v26, 15 ++; CHECK: br %r14 ++ %ret = shl <8 x i16> %val, ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 shift by a variable. ++define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, i32 %shift) { ++; CHECK-LABEL: f7: ++; CHECK: veslf %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %shiftvec = insertelement <4 x i32> undef, i32 %shift, i32 0 ++ %val2 = shufflevector <4 x i32> %shiftvec, <4 x i32> undef, ++ <4 x i32> zeroinitializer ++ %ret = shl <4 x i32> %val1, %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i32 shift by the lowest useful constant. ++define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val) { ++; CHECK-LABEL: f8: ++; CHECK: veslf %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = shl <4 x i32> %val, ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i32 shift by the highest useful constant. 
++define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val) { ++; CHECK-LABEL: f9: ++; CHECK: veslf %v24, %v26, 31 ++; CHECK: br %r14 ++ %ret = shl <4 x i32> %val, ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 shift by a variable. ++define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, i32 %shift) { ++; CHECK-LABEL: f10: ++; CHECK: veslg %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %extshift = sext i32 %shift to i64 ++ %shiftvec = insertelement <2 x i64> undef, i64 %extshift, i32 0 ++ %val2 = shufflevector <2 x i64> %shiftvec, <2 x i64> undef, ++ <2 x i32> zeroinitializer ++ %ret = shl <2 x i64> %val1, %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i64 shift by the lowest useful constant. ++define <2 x i64> @f11(<2 x i64> %dummy, <2 x i64> %val) { ++; CHECK-LABEL: f11: ++; CHECK: veslg %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = shl <2 x i64> %val, ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i64 shift by the highest useful constant. ++define <2 x i64> @f12(<2 x i64> %dummy, <2 x i64> %val) { ++; CHECK-LABEL: f12: ++; CHECK: veslg %v24, %v26, 63 ++; CHECK: br %r14 ++ %ret = shl <2 x i64> %val, ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-shift-05.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-shift-05.ll +@@ -0,0 +1,134 @@ ++; Test vector arithmetic shift right with scalar shift amount. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8 shift by a variable. ++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, i32 %shift) { ++; CHECK-LABEL: f1: ++; CHECK: vesrab %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %truncshift = trunc i32 %shift to i8 ++ %shiftvec = insertelement <16 x i8> undef, i8 %truncshift, i32 0 ++ %val2 = shufflevector <16 x i8> %shiftvec, <16 x i8> undef, ++ <16 x i32> zeroinitializer ++ %ret = ashr <16 x i8> %val1, %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 shift by the lowest useful constant. ++define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val) { ++; CHECK-LABEL: f2: ++; CHECK: vesrab %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = ashr <16 x i8> %val, ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 shift by the highest useful constant. ++define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val) { ++; CHECK-LABEL: f3: ++; CHECK: vesrab %v24, %v26, 7 ++; CHECK: br %r14 ++ %ret = ashr <16 x i8> %val, ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 shift by a variable. ++define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, i32 %shift) { ++; CHECK-LABEL: f4: ++; CHECK: vesrah %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %truncshift = trunc i32 %shift to i16 ++ %shiftvec = insertelement <8 x i16> undef, i16 %truncshift, i32 0 ++ %val2 = shufflevector <8 x i16> %shiftvec, <8 x i16> undef, ++ <8 x i32> zeroinitializer ++ %ret = ashr <8 x i16> %val1, %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i16 shift by the lowest useful constant. ++define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val) { ++; CHECK-LABEL: f5: ++; CHECK: vesrah %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = ashr <8 x i16> %val, ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i16 shift by the highest useful constant. ++define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val) { ++; CHECK-LABEL: f6: ++; CHECK: vesrah %v24, %v26, 15 ++; CHECK: br %r14 ++ %ret = ashr <8 x i16> %val, ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 shift by a variable. 
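The shift-by-constant tests in these files use a splat vector as the second operand of shl/ashr/lshr. Written out in full, the v16i8 shift by 7 has this shape (illustrative name; with no dummy argument the expected output is veslb %v24, %v24, 7):

define <16 x i8> @shl_splat_sketch(<16 x i8> %val) {
  %ret = shl <16 x i8> %val, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,
                              i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %ret
}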
++define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, i32 %shift) { ++; CHECK-LABEL: f7: ++; CHECK: vesraf %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %shiftvec = insertelement <4 x i32> undef, i32 %shift, i32 0 ++ %val2 = shufflevector <4 x i32> %shiftvec, <4 x i32> undef, ++ <4 x i32> zeroinitializer ++ %ret = ashr <4 x i32> %val1, %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i32 shift by the lowest useful constant. ++define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val) { ++; CHECK-LABEL: f8: ++; CHECK: vesraf %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = ashr <4 x i32> %val, ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i32 shift by the highest useful constant. ++define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val) { ++; CHECK-LABEL: f9: ++; CHECK: vesraf %v24, %v26, 31 ++; CHECK: br %r14 ++ %ret = ashr <4 x i32> %val, ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 shift by a variable. ++define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, i32 %shift) { ++; CHECK-LABEL: f10: ++; CHECK: vesrag %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %extshift = sext i32 %shift to i64 ++ %shiftvec = insertelement <2 x i64> undef, i64 %extshift, i32 0 ++ %val2 = shufflevector <2 x i64> %shiftvec, <2 x i64> undef, ++ <2 x i32> zeroinitializer ++ %ret = ashr <2 x i64> %val1, %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i64 shift by the lowest useful constant. ++define <2 x i64> @f11(<2 x i64> %dummy, <2 x i64> %val) { ++; CHECK-LABEL: f11: ++; CHECK: vesrag %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = ashr <2 x i64> %val, ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i64 shift by the highest useful constant. ++define <2 x i64> @f12(<2 x i64> %dummy, <2 x i64> %val) { ++; CHECK-LABEL: f12: ++; CHECK: vesrag %v24, %v26, 63 ++; CHECK: br %r14 ++ %ret = ashr <2 x i64> %val, ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-shift-06.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-shift-06.ll +@@ -0,0 +1,134 @@ ++; Test vector logical shift right with scalar shift amount. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8 shift by a variable. ++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, i32 %shift) { ++; CHECK-LABEL: f1: ++; CHECK: vesrlb %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %truncshift = trunc i32 %shift to i8 ++ %shiftvec = insertelement <16 x i8> undef, i8 %truncshift, i32 0 ++ %val2 = shufflevector <16 x i8> %shiftvec, <16 x i8> undef, ++ <16 x i32> zeroinitializer ++ %ret = lshr <16 x i8> %val1, %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 shift by the lowest useful constant. ++define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val) { ++; CHECK-LABEL: f2: ++; CHECK: vesrlb %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = lshr <16 x i8> %val, ++ ret <16 x i8> %ret ++} ++ ++; Test a v16i8 shift by the highest useful constant. ++define <16 x i8> @f3(<16 x i8> %dummy, <16 x i8> %val) { ++; CHECK-LABEL: f3: ++; CHECK: vesrlb %v24, %v26, 7 ++; CHECK: br %r14 ++ %ret = lshr <16 x i8> %val, ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 shift by a variable. 
++define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val1, i32 %shift) { ++; CHECK-LABEL: f4: ++; CHECK: vesrlh %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %truncshift = trunc i32 %shift to i16 ++ %shiftvec = insertelement <8 x i16> undef, i16 %truncshift, i32 0 ++ %val2 = shufflevector <8 x i16> %shiftvec, <8 x i16> undef, ++ <8 x i32> zeroinitializer ++ %ret = lshr <8 x i16> %val1, %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i16 shift by the lowest useful constant. ++define <8 x i16> @f5(<8 x i16> %dummy, <8 x i16> %val) { ++; CHECK-LABEL: f5: ++; CHECK: vesrlh %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = lshr <8 x i16> %val, ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i16 shift by the highest useful constant. ++define <8 x i16> @f6(<8 x i16> %dummy, <8 x i16> %val) { ++; CHECK-LABEL: f6: ++; CHECK: vesrlh %v24, %v26, 15 ++; CHECK: br %r14 ++ %ret = lshr <8 x i16> %val, ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 shift by a variable. ++define <4 x i32> @f7(<4 x i32> %dummy, <4 x i32> %val1, i32 %shift) { ++; CHECK-LABEL: f7: ++; CHECK: vesrlf %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %shiftvec = insertelement <4 x i32> undef, i32 %shift, i32 0 ++ %val2 = shufflevector <4 x i32> %shiftvec, <4 x i32> undef, ++ <4 x i32> zeroinitializer ++ %ret = lshr <4 x i32> %val1, %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i32 shift by the lowest useful constant. ++define <4 x i32> @f8(<4 x i32> %dummy, <4 x i32> %val) { ++; CHECK-LABEL: f8: ++; CHECK: vesrlf %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = lshr <4 x i32> %val, ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i32 shift by the highest useful constant. ++define <4 x i32> @f9(<4 x i32> %dummy, <4 x i32> %val) { ++; CHECK-LABEL: f9: ++; CHECK: vesrlf %v24, %v26, 31 ++; CHECK: br %r14 ++ %ret = lshr <4 x i32> %val, ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 shift by a variable. ++define <2 x i64> @f10(<2 x i64> %dummy, <2 x i64> %val1, i32 %shift) { ++; CHECK-LABEL: f10: ++; CHECK: vesrlg %v24, %v26, 0(%r2) ++; CHECK: br %r14 ++ %extshift = sext i32 %shift to i64 ++ %shiftvec = insertelement <2 x i64> undef, i64 %extshift, i32 0 ++ %val2 = shufflevector <2 x i64> %shiftvec, <2 x i64> undef, ++ <2 x i32> zeroinitializer ++ %ret = lshr <2 x i64> %val1, %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i64 shift by the lowest useful constant. ++define <2 x i64> @f11(<2 x i64> %dummy, <2 x i64> %val) { ++; CHECK-LABEL: f11: ++; CHECK: vesrlg %v24, %v26, 1 ++; CHECK: br %r14 ++ %ret = lshr <2 x i64> %val, ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i64 shift by the highest useful constant. ++define <2 x i64> @f12(<2 x i64> %dummy, <2 x i64> %val) { ++; CHECK-LABEL: f12: ++; CHECK: vesrlg %v24, %v26, 63 ++; CHECK: br %r14 ++ %ret = lshr <2 x i64> %val, ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-shift-07.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-shift-07.ll +@@ -0,0 +1,182 @@ ++; Test vector sign extensions. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i1->v16i8 extension. ++define <16 x i8> @f1(<16 x i8> %val) { ++; CHECK-LABEL: f1: ++; CHECK: veslb [[REG:%v[0-9]+]], %v24, 7 ++; CHECK: vesrab %v24, [[REG]], 7 ++; CHECK: br %r14 ++ %trunc = trunc <16 x i8> %val to <16 x i1> ++ %ret = sext <16 x i1> %trunc to <16 x i8> ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i1->v8i16 extension. 
++define <8 x i16> @f2(<8 x i16> %val) { ++; CHECK-LABEL: f2: ++; CHECK: veslh [[REG:%v[0-9]+]], %v24, 15 ++; CHECK: vesrah %v24, [[REG]], 15 ++; CHECK: br %r14 ++ %trunc = trunc <8 x i16> %val to <8 x i1> ++ %ret = sext <8 x i1> %trunc to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test a v8i8->v8i16 extension. ++define <8 x i16> @f3(<8 x i16> %val) { ++; CHECK-LABEL: f3: ++; CHECK: veslh [[REG:%v[0-9]+]], %v24, 8 ++; CHECK: vesrah %v24, [[REG]], 8 ++; CHECK: br %r14 ++ %trunc = trunc <8 x i16> %val to <8 x i8> ++ %ret = sext <8 x i8> %trunc to <8 x i16> ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i1->v4i32 extension. ++define <4 x i32> @f4(<4 x i32> %val) { ++; CHECK-LABEL: f4: ++; CHECK: veslf [[REG:%v[0-9]+]], %v24, 31 ++; CHECK: vesraf %v24, [[REG]], 31 ++; CHECK: br %r14 ++ %trunc = trunc <4 x i32> %val to <4 x i1> ++ %ret = sext <4 x i1> %trunc to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i8->v4i32 extension. ++define <4 x i32> @f5(<4 x i32> %val) { ++; CHECK-LABEL: f5: ++; CHECK: veslf [[REG:%v[0-9]+]], %v24, 24 ++; CHECK: vesraf %v24, [[REG]], 24 ++; CHECK: br %r14 ++ %trunc = trunc <4 x i32> %val to <4 x i8> ++ %ret = sext <4 x i8> %trunc to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v4i16->v4i32 extension. ++define <4 x i32> @f6(<4 x i32> %val) { ++; CHECK-LABEL: f6: ++; CHECK: veslf [[REG:%v[0-9]+]], %v24, 16 ++; CHECK: vesraf %v24, [[REG]], 16 ++; CHECK: br %r14 ++ %trunc = trunc <4 x i32> %val to <4 x i16> ++ %ret = sext <4 x i16> %trunc to <4 x i32> ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i1->v2i64 extension. ++define <2 x i64> @f7(<2 x i64> %val) { ++; CHECK-LABEL: f7: ++; CHECK: veslg [[REG:%v[0-9]+]], %v24, 63 ++; CHECK: vesrag %v24, [[REG]], 63 ++; CHECK: br %r14 ++ %trunc = trunc <2 x i64> %val to <2 x i1> ++ %ret = sext <2 x i1> %trunc to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i8->v2i64 extension. ++define <2 x i64> @f8(<2 x i64> %val) { ++; CHECK-LABEL: f8: ++; CHECK: vsegb %v24, %v24 ++; CHECK: br %r14 ++ %trunc = trunc <2 x i64> %val to <2 x i8> ++ %ret = sext <2 x i8> %trunc to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i16->v2i64 extension. ++define <2 x i64> @f9(<2 x i64> %val) { ++; CHECK-LABEL: f9: ++; CHECK: vsegh %v24, %v24 ++; CHECK: br %r14 ++ %trunc = trunc <2 x i64> %val to <2 x i16> ++ %ret = sext <2 x i16> %trunc to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test a v2i32->v2i64 extension. ++define <2 x i64> @f10(<2 x i64> %val) { ++; CHECK-LABEL: f10: ++; CHECK: vsegf %v24, %v24 ++; CHECK: br %r14 ++ %trunc = trunc <2 x i64> %val to <2 x i32> ++ %ret = sext <2 x i32> %trunc to <2 x i64> ++ ret <2 x i64> %ret ++} ++ ++; Test an alternative v2i8->v2i64 extension. ++define <2 x i64> @f11(<2 x i64> %val) { ++; CHECK-LABEL: f11: ++; CHECK: vsegb %v24, %v24 ++; CHECK: br %r14 ++ %shl = shl <2 x i64> %val, ++ %ret = ashr <2 x i64> %shl, ++ ret <2 x i64> %ret ++} ++ ++; Test an alternative v2i16->v2i64 extension. ++define <2 x i64> @f12(<2 x i64> %val) { ++; CHECK-LABEL: f12: ++; CHECK: vsegh %v24, %v24 ++; CHECK: br %r14 ++ %shl = shl <2 x i64> %val, ++ %ret = ashr <2 x i64> %shl, ++ ret <2 x i64> %ret ++} ++ ++; Test an alternative v2i32->v2i64 extension. ++define <2 x i64> @f13(<2 x i64> %val) { ++; CHECK-LABEL: f13: ++; CHECK: vsegf %v24, %v24 ++; CHECK: br %r14 ++ %shl = shl <2 x i64> %val, ++ %ret = ashr <2 x i64> %shl, ++ ret <2 x i64> %ret ++} ++ ++; Test an extraction-based v2i8->v2i64 extension. 
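The "alternative" extension tests express sign extension as a left shift followed by an arithmetic right shift by the same splat amount; 56 re-extends the low byte of each i64 element, so llc can select vsegb. A sketch with the splat constants written out (illustrative name):

define <2 x i64> @sext_shift_sketch(<2 x i64> %val) {
  %shl = shl <2 x i64> %val, <i64 56, i64 56>
  %ret = ashr <2 x i64> %shl, <i64 56, i64 56>
  ret <2 x i64> %ret
}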
++define <2 x i64> @f14(<16 x i8> %val) { ++; CHECK-LABEL: f14: ++; CHECK: vsegb %v24, %v24 ++; CHECK: br %r14 ++ %elt0 = extractelement <16 x i8> %val, i32 7 ++ %elt1 = extractelement <16 x i8> %val, i32 15 ++ %ext0 = sext i8 %elt0 to i64 ++ %ext1 = sext i8 %elt1 to i64 ++ %vec0 = insertelement <2 x i64> undef, i64 %ext0, i32 0 ++ %vec1 = insertelement <2 x i64> %vec0, i64 %ext1, i32 1 ++ ret <2 x i64> %vec1 ++} ++ ++; Test an extraction-based v2i16->v2i64 extension. ++define <2 x i64> @f15(<16 x i16> %val) { ++; CHECK-LABEL: f15: ++; CHECK: vsegh %v24, %v24 ++; CHECK: br %r14 ++ %elt0 = extractelement <16 x i16> %val, i32 3 ++ %elt1 = extractelement <16 x i16> %val, i32 7 ++ %ext0 = sext i16 %elt0 to i64 ++ %ext1 = sext i16 %elt1 to i64 ++ %vec0 = insertelement <2 x i64> undef, i64 %ext0, i32 0 ++ %vec1 = insertelement <2 x i64> %vec0, i64 %ext1, i32 1 ++ ret <2 x i64> %vec1 ++} ++ ++; Test an extraction-based v2i32->v2i64 extension. ++define <2 x i64> @f16(<16 x i32> %val) { ++; CHECK-LABEL: f16: ++; CHECK: vsegf %v24, %v24 ++; CHECK: br %r14 ++ %elt0 = extractelement <16 x i32> %val, i32 1 ++ %elt1 = extractelement <16 x i32> %val, i32 3 ++ %ext0 = sext i32 %elt0 to i64 ++ %ext1 = sext i32 %elt1 to i64 ++ %vec0 = insertelement <2 x i64> undef, i64 %ext0, i32 0 ++ %vec1 = insertelement <2 x i64> %vec0, i64 %ext1, i32 1 ++ ret <2 x i64> %vec1 ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-sqrt-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-sqrt-01.ll +@@ -0,0 +1,23 @@ ++; Test f64 and v2f64 square root. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++declare double @llvm.sqrt.f64(double) ++declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) ++ ++define <2 x double> @f1(<2 x double> %val) { ++; CHECK-LABEL: f1: ++; CHECK: vfsqdb %v24, %v24 ++; CHECK: br %r14 ++ %ret = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %val) ++ ret <2 x double> %ret ++} ++ ++define double @f2(<2 x double> %val) { ++; CHECK-LABEL: f2: ++; CHECK: wfsqdb %f0, %v24 ++; CHECK: br %r14 ++ %scalar = extractelement <2 x double> %val, i32 0 ++ %ret = call double @llvm.sqrt.f64(double %scalar) ++ ret double %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-sub-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-sub-01.ll +@@ -0,0 +1,148 @@ ++; Test vector subtraction. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8 subtraction. ++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vsb %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = sub <16 x i8> %val1, %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 subtraction. ++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vsh %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = sub <8 x i16> %val1, %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 subtraction. ++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vsf %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = sub <4 x i32> %val1, %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 subtraction. 
++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vsg %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = sub <2 x i64> %val1, %val2 ++ ret <2 x i64> %ret ++} ++ ++; Test a v4f32 subtraction, as an example of an operation that needs to be ++; scalarized and reassembled. At present there's an unnecessary move that ++; could be avoided with smarter ordering. It also isn't important whether ++; the VSLDBs use the result of the VLRs or use %v24 and %v26 directly. ++define <4 x float> @f5(<4 x float> %val1, <4 x float> %val2) { ++; CHECK-LABEL: f5: ++; CHECK-DAG: vlr %v[[A1:[0-5]]], %v24 ++; CHECK-DAG: vlr %v[[A2:[0-5]]], %v26 ++; CHECK-DAG: vrepf %v[[B1:[0-5]]], %v[[A1]], 1 ++; CHECK-DAG: vrepf %v[[B2:[0-5]]], %v[[A2]], 1 ++; CHECK-DAG: vrepf %v[[C1:[0-5]]], %v[[A1]], 2 ++; CHECK-DAG: vrepf %v[[C2:[0-5]]], %v[[A2]], 2 ++; CHECK-DAG: vrepf %v[[D1:[0-5]]], %v[[A1]], 3 ++; CHECK-DAG: vrepf %v[[D2:[0-5]]], %v[[A2]], 3 ++; CHECK-DAG: ler %f[[A1copy:[0-5]]], %f[[A1]] ++; CHECK-DAG: sebr %f[[A1copy]], %f[[A2]] ++; CHECK-DAG: sebr %f[[B1]], %f[[B2]] ++; CHECK-DAG: sebr %f[[C1]], %f[[C2]] ++; CHECK-DAG: sebr %f[[D1]], %f[[D2]] ++; CHECK-DAG: vmrhf [[HIGH:%v[0-9]+]], %v[[A1copy]], %v[[B1]] ++; CHECK-DAG: vmrhf [[LOW:%v[0-9]+]], %v[[C1]], %v[[D1]] ++; CHECK: vmrhg %v24, [[HIGH]], [[LOW]] ++; CHECK: br %r14 ++ %ret = fsub <4 x float> %val1, %val2 ++ ret <4 x float> %ret ++} ++ ++; Test a v2f64 subtraction. ++define <2 x double> @f6(<2 x double> %dummy, <2 x double> %val1, ++ <2 x double> %val2) { ++; CHECK-LABEL: f6: ++; CHECK: vfsdb %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = fsub <2 x double> %val1, %val2 ++ ret <2 x double> %ret ++} ++ ++; Test an f64 subtraction that uses vector registers. ++define double @f7(<2 x double> %val1, <2 x double> %val2) { ++; CHECK-LABEL: f7: ++; CHECK: wfsdb %f0, %v24, %v26 ++; CHECK: br %r14 ++ %scalar1 = extractelement <2 x double> %val1, i32 0 ++ %scalar2 = extractelement <2 x double> %val2, i32 0 ++ %ret = fsub double %scalar1, %scalar2 ++ ret double %ret ++} ++ ++; Test a v2i8 subtraction, which gets promoted to v16i8. ++define <2 x i8> @f8(<2 x i8> %dummy, <2 x i8> %val1, <2 x i8> %val2) { ++; CHECK-LABEL: f8: ++; CHECK: vsb %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = sub <2 x i8> %val1, %val2 ++ ret <2 x i8> %ret ++} ++ ++; Test a v4i8 subtraction, which gets promoted to v16i8. ++define <4 x i8> @f9(<4 x i8> %dummy, <4 x i8> %val1, <4 x i8> %val2) { ++; CHECK-LABEL: f9: ++; CHECK: vsb %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = sub <4 x i8> %val1, %val2 ++ ret <4 x i8> %ret ++} ++ ++; Test a v8i8 subtraction, which gets promoted to v16i8. ++define <8 x i8> @f10(<8 x i8> %dummy, <8 x i8> %val1, <8 x i8> %val2) { ++; CHECK-LABEL: f10: ++; CHECK: vsb %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = sub <8 x i8> %val1, %val2 ++ ret <8 x i8> %ret ++} ++ ++; Test a v2i16 subtraction, which gets promoted to v8i16. ++define <2 x i16> @f11(<2 x i16> %dummy, <2 x i16> %val1, <2 x i16> %val2) { ++; CHECK-LABEL: f11: ++; CHECK: vsh %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = sub <2 x i16> %val1, %val2 ++ ret <2 x i16> %ret ++} ++ ++; Test a v4i16 subtraction, which gets promoted to v8i16. ++define <4 x i16> @f12(<4 x i16> %dummy, <4 x i16> %val1, <4 x i16> %val2) { ++; CHECK-LABEL: f12: ++; CHECK: vsh %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = sub <4 x i16> %val1, %val2 ++ ret <4 x i16> %ret ++} ++ ++; Test a v2i32 subtraction, which gets promoted to v4i32. 
++define <2 x i32> @f13(<2 x i32> %dummy, <2 x i32> %val1, <2 x i32> %val2) { ++; CHECK-LABEL: f13: ++; CHECK: vsf %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = sub <2 x i32> %val1, %val2 ++ ret <2 x i32> %ret ++} ++ ++; Test a v2f32 subtraction, which gets promoted to v4f32. ++define <2 x float> @f14(<2 x float> %val1, <2 x float> %val2) { ++; No particular output expected, but must compile. ++ %ret = fsub <2 x float> %val1, %val2 ++ ret <2 x float> %ret ++} +Index: llvm-36/test/CodeGen/SystemZ/vec-xor-01.ll +=================================================================== +--- /dev/null ++++ llvm-36/test/CodeGen/SystemZ/vec-xor-01.ll +@@ -0,0 +1,39 @@ ++; Test vector XOR. ++; ++; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ++ ++; Test a v16i8 XOR. ++define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val1, <16 x i8> %val2) { ++; CHECK-LABEL: f1: ++; CHECK: vx %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = xor <16 x i8> %val1, %val2 ++ ret <16 x i8> %ret ++} ++ ++; Test a v8i16 XOR. ++define <8 x i16> @f2(<8 x i16> %dummy, <8 x i16> %val1, <8 x i16> %val2) { ++; CHECK-LABEL: f2: ++; CHECK: vx %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = xor <8 x i16> %val1, %val2 ++ ret <8 x i16> %ret ++} ++ ++; Test a v4i32 XOR. ++define <4 x i32> @f3(<4 x i32> %dummy, <4 x i32> %val1, <4 x i32> %val2) { ++; CHECK-LABEL: f3: ++; CHECK: vx %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = xor <4 x i32> %val1, %val2 ++ ret <4 x i32> %ret ++} ++ ++; Test a v2i64 XOR. ++define <2 x i64> @f4(<2 x i64> %dummy, <2 x i64> %val1, <2 x i64> %val2) { ++; CHECK-LABEL: f4: ++; CHECK: vx %v24, %v26, %v28 ++; CHECK: br %r14 ++ %ret = xor <2 x i64> %val1, %val2 ++ ret <2 x i64> %ret ++} +Index: llvm-36/test/MC/Disassembler/SystemZ/insns-z13-bad.txt +=================================================================== +--- /dev/null ++++ llvm-36/test/MC/Disassembler/SystemZ/insns-z13-bad.txt +@@ -0,0 +1,39 @@ ++# Test z13 instructions that don't have PC-relative operands. ++# RUN: llvm-mc --disassemble %s -triple=s390x-linux-gnu -mcpu=z13 2>&1 \ ++# RUN: | FileCheck %s ++ ++# This would be "vlef %v0, 0, 4", but element 4 is invalid. ++# ++#CHECK: warning: invalid instruction encoding ++#CHECK-NEXT: 0xe7 0x00 0x00 0x00 0x40 0x03 ++0xe7 0x00 0x00 0x00 0x40 0x03 ++ ++# ...and again with element 15 ++# ++#CHECK: warning: invalid instruction encoding ++#CHECK-NEXT: 0xe7 0x00 0x00 0x00 0xf0 0x03 ++0xe7 0x00 0x00 0x00 0xf0 0x03 ++ ++# This would be "vleg %v0, 0, 2", but element 2 is invalid. ++# ++#CHECK: warning: invalid instruction encoding ++#CHECK-NEXT: 0xe7 0x00 0x00 0x00 0x20 0x02 ++0xe7 0x00 0x00 0x00 0x20 0x02 ++ ++# ...and again with element 15 ++# ++#CHECK: warning: invalid instruction encoding ++#CHECK-NEXT: 0xe7 0x00 0x00 0x00 0xf0 0x02 ++0xe7 0x00 0x00 0x00 0xf0 0x02 ++ ++# This would be "vleh %v0, 0, 8", but element 8 is invalid. ++# ++#CHECK: warning: invalid instruction encoding ++#CHECK-NEXT: 0xe7 0x00 0x00 0x00 0x80 0x01 ++0xe7 0x00 0x00 0x00 0x80 0x01 ++ ++# ...and again with element 15 ++# ++#CHECK: warning: invalid instruction encoding ++#CHECK-NEXT: 0xe7 0x00 0x00 0x00 0xf0 0x01 ++0xe7 0x00 0x00 0x00 0xf0 0x01 +Index: llvm-36/test/MC/Disassembler/SystemZ/insns-z13.txt +=================================================================== +--- /dev/null ++++ llvm-36/test/MC/Disassembler/SystemZ/insns-z13.txt +@@ -0,0 +1,3315 @@ ++# Test z13 instructions that don't have PC-relative operands. 
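The encodings in these disassembler tests follow a fixed field layout for the 0xe7-prefixed vector instructions. A worked decode of one entry from the valid-instruction checks, given purely as annotation (it is not part of either test input):

# vaccf %v18, %v3, %v20  <=  0xe7 0x23 0x40 0x00 0x2a 0xf1
#   byte 0: 0xe7  primary opcode
#   byte 1: 0x23  V1 = 2, V2 = 3 (low four bits of each register number)
#   byte 2: 0x40  V3 = 4
#   byte 4: 0x2a  element-size field 2 (fullword) plus RXB 0b1010, whose
#                 bits supply the high register bits: V1 -> 18, V3 -> 20
#   byte 5: 0xf1  opcode extension
# The element-size field follows the b/h/f/g/q = 0/1/2/3/4 pattern visible
# throughout the checks below.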
++# RUN: llvm-mc --disassemble %s -triple=s390x-linux-gnu -mcpu=z13 \
++# RUN: | FileCheck %s
++
++#CHECK: lcbb %r0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x27
++
++#CHECK: lcbb %r1, 2475(%r7,%r8), 12
++0xe7 0x17 0x89 0xab 0xc0 0x27
++
++#CHECK: lcbb %r15, 4095(%r15,%r15), 15
++0xe7 0xff 0xff 0xff 0xf0 0x27
++
++#CHECK: vab %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xf3
++
++#CHECK: vab %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xf3
++
++#CHECK: vab %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xf3
++
++#CHECK: vaccb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xf1
++
++#CHECK: vaccb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xf1
++
++#CHECK: vaccb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xf1
++
++#CHECK: vacccq %v0, %v0, %v0, %v0
++0xe7 0x00 0x04 0x00 0x00 0xb9
++
++#CHECK: vacccq %v3, %v20, %v5, %v22
++0xe7 0x34 0x54 0x00 0x65 0xb9
++
++#CHECK: vacccq %v31, %v31, %v31, %v31
++0xe7 0xff 0xf4 0x00 0xff 0xb9
++
++#CHECK: vaccf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xf1
++
++#CHECK: vaccf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xf1
++
++#CHECK: vaccf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xf1
++
++#CHECK: vaccg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xf1
++
++#CHECK: vaccg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xf1
++
++#CHECK: vaccg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xf1
++
++#CHECK: vacch %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xf1
++
++#CHECK: vacch %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xf1
++
++#CHECK: vacch %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xf1
++
++#CHECK: vaccq %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x40 0xf1
++
++#CHECK: vaccq %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x4a 0xf1
++
++#CHECK: vaccq %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x4e 0xf1
++
++#CHECK: vacq %v0, %v0, %v0, %v0
++0xe7 0x00 0x04 0x00 0x00 0xbb
++
++#CHECK: vacq %v3, %v20, %v5, %v22
++0xe7 0x34 0x54 0x00 0x65 0xbb
++
++#CHECK: vacq %v31, %v31, %v31, %v31
++0xe7 0xff 0xf4 0x00 0xff 0xbb
++
++#CHECK: vaf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xf3
++
++#CHECK: vaf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xf3
++
++#CHECK: vaf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xf3
++
++#CHECK: vag %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xf3
++
++#CHECK: vag %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xf3
++
++#CHECK: vag %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xf3
++
++#CHECK: vah %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xf3
++
++#CHECK: vah %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xf3
++
++#CHECK: vah %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xf3
++
++#CHECK: vaq %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x40 0xf3
++
++#CHECK: vaq %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x4a 0xf3
++
++#CHECK: vaq %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x4e 0xf3
++
++#CHECK: vavgb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xf2
++
++#CHECK: vavgb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xf2
++
++#CHECK: vavgb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xf2
++
++#CHECK: vavgf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xf2
++
++#CHECK: vavgf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xf2
++
++#CHECK: vavgf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xf2
++
++#CHECK: vavgg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xf2
++
++#CHECK: vavgg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xf2
++
++#CHECK: vavgg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xf2
++
++#CHECK: vavgh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xf2
++
++#CHECK: vavgh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xf2
++
++#CHECK: vavgh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xf2
++
++#CHECK: vavglb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xf0
++
++#CHECK: vavglb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xf0
++
++#CHECK: vavglb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xf0
++
++#CHECK: vavglf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xf0
++
++#CHECK: vavglf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xf0
++
++#CHECK: vavglf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xf0
++
++#CHECK: vavglg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xf0
++
++#CHECK: vavglg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xf0
++
++#CHECK: vavglg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xf0
++
++#CHECK: vavglh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xf0
++
++#CHECK: vavglh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xf0
++
++#CHECK: vavglh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xf0
++
++#CHECK: vcdgb %v0, %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x30 0xc3
++
++#CHECK: vcdgb %v19, %v14, 4, 10
++0xe7 0x3e 0x00 0xa4 0x38 0xc3
++
++#CHECK: vcdgb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xf7 0x3c 0xc3
++
++#CHECK: vcdlgb %v0, %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x30 0xc1
++
++#CHECK: vcdlgb %v19, %v14, 4, 10
++0xe7 0x3e 0x00 0xa4 0x38 0xc1
++
++#CHECK: vcdlgb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xf7 0x3c 0xc1
++
++#CHECK: vceqb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xf8
++
++#CHECK: vceqb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xf8
++
++#CHECK: vceqbs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x04 0xf8
++
++#CHECK: vceqb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xf8
++
++#CHECK: vceqf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xf8
++
++#CHECK: vceqf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xf8
++
++#CHECK: vceqfs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x24 0xf8
++
++#CHECK: vceqf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xf8
++
++#CHECK: vceqg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xf8
++
++#CHECK: vceqg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xf8
++
++#CHECK: vceqgs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x34 0xf8
++
++#CHECK: vceqg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xf8
++
++#CHECK: vceqh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xf8
++
++#CHECK: vceqh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xf8
++
++#CHECK: vceqhs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x14 0xf8
++
++#CHECK: vceqh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xf8
++
++#CHECK: vcgdb %v0, %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x30 0xc2
++
++#CHECK: vcgdb %v19, %v14, 4, 10
++0xe7 0x3e 0x00 0xa4 0x38 0xc2
++
++#CHECK: vcgdb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xf7 0x3c 0xc2
++
++#CHECK: vchb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xfb
++
++#CHECK: vchb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xfb
++
++#CHECK: vchbs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x04 0xfb
++
++#CHECK: vchb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xfb
++
++#CHECK: vchf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xfb
++
++#CHECK: vchf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xfb
++
++#CHECK: vchfs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x24 0xfb
++
++#CHECK: vchf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xfb
++
++#CHECK: vchg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xfb
++
++#CHECK: vchg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xfb
++
++#CHECK: vchgs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x34 0xfb
++
++#CHECK: vchg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xfb
++
++#CHECK: vchh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xfb
++
++#CHECK: vchh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xfb
++
++#CHECK: vchhs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x14 0xfb
++
++#CHECK: vchh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xfb
++
++#CHECK: vchlb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xf9
++
++#CHECK: vchlb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xf9
++
++#CHECK: vchlbs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x04 0xf9
++
++#CHECK: vchlb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xf9
++
++#CHECK: vchlf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xf9
++
++#CHECK: vchlf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xf9
++
++#CHECK: vchlfs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x24 0xf9
++
++#CHECK: vchlf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xf9
++
++#CHECK: vchlg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xf9
++
++#CHECK: vchlg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xf9
++
++#CHECK: vchlgs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x34 0xf9
++
++#CHECK: vchlg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xf9
++
++#CHECK: vchlh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xf9
++
++#CHECK: vchlh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xf9
++
++#CHECK: vchlhs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x14 0xf9
++
++#CHECK: vchlh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xf9
++
++#CHECK: vcksm %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x66
++
++#CHECK: vcksm %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x66
++
++#CHECK: vcksm %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x66
++
++#CHECK: vclgdb %v0, %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x30 0xc0
++
++#CHECK: vclgdb %v19, %v14, 4, 10
++0xe7 0x3e 0x00 0xa4 0x38 0xc0
++
++#CHECK: vclgdb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xf7 0x3c 0xc0
++
++#CHECK: vclzb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x53
++
++#CHECK: vclzb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0x53
++
++#CHECK: vclzb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0x53
++
++#CHECK: vclzf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x53
++
++#CHECK: vclzf %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0x53
++
++#CHECK: vclzf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0x53
++
++#CHECK: vclzg %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x53
++
++#CHECK: vclzg %v19, %v14
++0xe7 0x3e 0x00 0x00 0x38 0x53
++
++#CHECK: vclzg %v31, %v31
++0xe7 0xff 0x00 0x00 0x3c 0x53
++
++#CHECK: vclzh %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x53
++
++#CHECK: vclzh %v19, %v14
++0xe7 0x3e 0x00 0x00 0x18 0x53
++
++#CHECK: vclzh %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0x53
++
++#CHECK: vctzb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x52
++
++#CHECK: vctzb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0x52
++
++#CHECK: vctzb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0x52
++
++#CHECK: vctzf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x52
++
++#CHECK: vctzf %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0x52
++
++#CHECK: vctzf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0x52
++
++#CHECK: vctzg %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x52
++
++#CHECK: vctzg %v19, %v14
++0xe7 0x3e 0x00 0x00 0x38 0x52
++
++#CHECK: vctzg %v31, %v31
++0xe7 0xff 0x00 0x00 0x3c 0x52
++
++#CHECK: vctzh %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x52
++
++#CHECK: vctzh %v19, %v14
++0xe7 0x3e 0x00 0x00 0x18 0x52
++
++#CHECK: vctzh %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0x52
++
++#CHECK: vecb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xdb
++
++#CHECK: vecb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0xdb
++
++#CHECK: vecb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0xdb
++
++#CHECK: vecf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xdb
++
++#CHECK: vecf %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0xdb
++
++#CHECK: vecf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0xdb
++
++#CHECK: vecg %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xdb
++
++#CHECK: vecg %v19, %v14
++0xe7 0x3e 0x00 0x00 0x38 0xdb
++
++#CHECK: vecg %v31, %v31
++0xe7 0xff 0x00 0x00 0x3c 0xdb
++
++#CHECK: vech %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xdb
++
++#CHECK: vech %v19, %v14
++0xe7 0x3e 0x00 0x00 0x18 0xdb
++
++#CHECK: vech %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0xdb
++
++#CHECK: veclb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xd9
++
++#CHECK: veclb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0xd9
++
++#CHECK: veclb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0xd9
++
++#CHECK: veclf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xd9
++
++#CHECK: veclf %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0xd9
++
++#CHECK: veclf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0xd9
++
++#CHECK: veclg %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xd9
++
++#CHECK: veclg %v19, %v14
++0xe7 0x3e 0x00 0x00 0x38 0xd9
++
++#CHECK: veclg %v31, %v31
++0xe7 0xff 0x00 0x00 0x3c 0xd9
++
++#CHECK: veclh %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xd9
++
++#CHECK: veclh %v19, %v14
++0xe7 0x3e 0x00 0x00 0x18 0xd9
++
++#CHECK: veclh %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0xd9
++
++#CHECK: verimb %v0, %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x72
++
++#CHECK: verimb %v3, %v20, %v5, 103
++0xe7 0x34 0x50 0x67 0x04 0x72
++
++#CHECK: verimb %v31, %v31, %v31, 255
++0xe7 0xff 0xf0 0xff 0x0e 0x72
++
++#CHECK: verimf %v0, %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x20 0x72
++
++#CHECK: verimf %v3, %v20, %v5, 103
++0xe7 0x34 0x50 0x67 0x24 0x72
++
++#CHECK: verimf %v31, %v31, %v31, 255
++0xe7 0xff 0xf0 0xff 0x2e 0x72
++
++#CHECK: verimg %v0, %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x30 0x72
++
++#CHECK: verimg %v3, %v20, %v5, 103
++0xe7 0x34 0x50 0x67 0x34 0x72
++
++#CHECK: verimg %v31, %v31, %v31, 255
++0xe7 0xff 0xf0 0xff 0x3e 0x72
++
++#CHECK: verimh %v0, %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x10 0x72
++
++#CHECK: verimh %v3, %v20, %v5, 103
++0xe7 0x34 0x50 0x67 0x14 0x72
++
++#CHECK: verimh %v31, %v31, %v31, 255
++0xe7 0xff 0xf0 0xff 0x1e 0x72
++
++#CHECK: verllvb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x73
++
++#CHECK: verllvb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x73
++
++#CHECK: verllvb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x73
++
++#CHECK: verllvf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x73
++
++#CHECK: verllvf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x73
++
++#CHECK: verllvf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x73
++
++#CHECK: verllvg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x73
++
++#CHECK: verllvg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0x73
++
++#CHECK: verllvg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0x73
++
++#CHECK: verllvh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x73
++
++#CHECK: verllvh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x73
++
++#CHECK: verllvh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x73
++
++#CHECK: verllb %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x33
++
++#CHECK: verllb %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x04 0x33
++
++#CHECK: verllb %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x0c 0x33
++
++#CHECK: verllf %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x20 0x33
++
++#CHECK: verllf %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x24 0x33
++
++#CHECK: verllf %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x2c 0x33
++
++#CHECK: verllg %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x30 0x33
++
++#CHECK: verllg %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x34 0x33
++
++#CHECK: verllg %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x3c 0x33
++
++#CHECK: verllh %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x10 0x33
++
++#CHECK: verllh %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x14 0x33
++
++#CHECK: verllh %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x1c 0x33
++
++#CHECK: veslvb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x70
++
++#CHECK: veslvb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x70
++
++#CHECK: veslvb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x70
++
++#CHECK: veslvf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x70
++
++#CHECK: veslvf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x70
++
++#CHECK: veslvf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x70
++
++#CHECK: veslvg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x70
++
++#CHECK: veslvg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0x70
++
++#CHECK: veslvg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0x70
++
++#CHECK: veslvh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x70
++
++#CHECK: veslvh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x70
++
++#CHECK: veslvh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x70
++
++#CHECK: veslb %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x30
++
++#CHECK: veslb %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x04 0x30
++
++#CHECK: veslb %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x0c 0x30
++
++#CHECK: veslf %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x20 0x30
++
++#CHECK: veslf %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x24 0x30
++
++#CHECK: veslf %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x2c 0x30
++
++#CHECK: veslg %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x30 0x30
++
++#CHECK: veslg %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x34 0x30
++
++#CHECK: veslg %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x3c 0x30
++
++#CHECK: veslh %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x10 0x30
++
++#CHECK: veslh %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x14 0x30
++
++#CHECK: veslh %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x1c 0x30
++
++#CHECK: vesravb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x7a
++
++#CHECK: vesravb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x7a
++
++#CHECK: vesravb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x7a
++
++#CHECK: vesravf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x7a
++
++#CHECK: vesravf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x7a
++
++#CHECK: vesravf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x7a
++
++#CHECK: vesravg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x7a
++
++#CHECK: vesravg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0x7a
++
++#CHECK: vesravg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0x7a
++
++#CHECK: vesravh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x7a
++
++#CHECK: vesravh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x7a
++
++#CHECK: vesravh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x7a
++
++#CHECK: vesrab %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x3a
++
++#CHECK: vesrab %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x04 0x3a
++
++#CHECK: vesrab %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x0c 0x3a
++
++#CHECK: vesraf %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x20 0x3a
++
++#CHECK: vesraf %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x24 0x3a
++
++#CHECK: vesraf %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x2c 0x3a
++
++#CHECK: vesrag %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x30 0x3a
++
++#CHECK: vesrag %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x34 0x3a
++
++#CHECK: vesrag %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x3c 0x3a
++
++#CHECK: vesrah %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x10 0x3a
++
++#CHECK: vesrah %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x14 0x3a
++
++#CHECK: vesrah %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x1c 0x3a
++
++#CHECK: vesrlvb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x78
++
++#CHECK: vesrlvb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x78
++
++#CHECK: vesrlvb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x78
++
++#CHECK: vesrlvf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x78
++
++#CHECK: vesrlvf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x78
++
++#CHECK: vesrlvf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x78
++
++#CHECK: vesrlvg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x78
++
++#CHECK: vesrlvg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0x78
++
++#CHECK: vesrlvg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0x78
++
++#CHECK: vesrlvh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x78
++
++#CHECK: vesrlvh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x78
++
++#CHECK: vesrlvh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x78
++
++#CHECK: vesrlb %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x38
++
++#CHECK: vesrlb %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x04 0x38
++
++#CHECK: vesrlb %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x0c 0x38
++
++#CHECK: vesrlf %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x20 0x38
++
++#CHECK: vesrlf %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x24 0x38
++
++#CHECK: vesrlf %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x2c 0x38
++
++#CHECK: vesrlg %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x30 0x38
++
++#CHECK: vesrlg %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x34 0x38
++
++#CHECK: vesrlg %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x3c 0x38
++
++#CHECK: vesrlh %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x10 0x38
++
++#CHECK: vesrlh %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x14 0x38
++
++#CHECK: vesrlh %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x1c 0x38
++
++#CHECK: vfadb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xe3
++
++#CHECK: vfadb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xe3
++
++#CHECK: vfadb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xe3
++
++#CHECK: vfaeb %v0, %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x82
++
++#CHECK: vfaeb %v0, %v0, %v0, 12
++0xe7 0x00 0x00 0xc0 0x00 0x82
++
++#CHECK: vfaeb %v18, %v3, %v20, 0
++0xe7 0x23 0x40 0x00 0x0a 0x82
++
++#CHECK: vfaeb %v31, %v31, %v31, 4
++0xe7 0xff 0xf0 0x40 0x0e 0x82
++
++#CHECK: vfaebs %v31, %v31, %v31, 8
++0xe7 0xff 0xf0 0x90 0x0e 0x82
++
++#CHECK: vfaezb %v31, %v31, %v31, 4
++0xe7 0xff 0xf0 0x60 0x0e 0x82
++
++#CHECK: vfaezbs %v31, %v31, %v31, 8
++0xe7 0xff 0xf0 0xb0 0x0e 0x82
++
++#CHECK: vfaef %v0, %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x20 0x82
++
++#CHECK: vfaef %v0, %v0, %v0, 12
++0xe7 0x00 0x00 0xc0 0x20 0x82
++
++#CHECK: vfaef %v18, %v3, %v20, 0
++0xe7 0x23 0x40 0x00 0x2a 0x82
++
++#CHECK: vfaef %v31, %v31, %v31, 4
++0xe7 0xff 0xf0 0x40 0x2e 0x82
++
++#CHECK: vfaefs %v31, %v31, %v31, 8
++0xe7 0xff 0xf0 0x90 0x2e 0x82
++
++#CHECK: vfaezf %v31, %v31, %v31, 4
++0xe7 0xff 0xf0 0x60 0x2e 0x82
++
++#CHECK: vfaezfs %v31, %v31, %v31, 8
++0xe7 0xff 0xf0 0xb0 0x2e 0x82
++
++#CHECK: vfaeh %v0, %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x10 0x82
++
++#CHECK: vfaeh %v0, %v0, %v0, 12
++0xe7 0x00 0x00 0xc0 0x10 0x82
++
++#CHECK: vfaeh %v18, %v3, %v20, 0
++0xe7 0x23 0x40 0x00 0x1a 0x82
++
++#CHECK: vfaeh %v31, %v31, %v31, 4
++0xe7 0xff 0xf0 0x40 0x1e 0x82
++
++#CHECK: vfaehs %v31, %v31, %v31, 8
++0xe7 0xff 0xf0 0x90 0x1e 0x82
++
++#CHECK: vfaezh %v31, %v31, %v31, 4
++0xe7 0xff 0xf0 0x60 0x1e 0x82
++
++#CHECK: vfaezhs %v31, %v31, %v31, 8
++0xe7 0xff 0xf0 0xb0 0x1e 0x82
++
++#CHECK: vfcedb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xe8
++
++#CHECK: vfcedb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xe8
++
++#CHECK: vfcedb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xe8
++
++#CHECK: vfcedbs %v0, %v0, %v0
++0xe7 0x00 0x00 0x10 0x30 0xe8
++
++#CHECK: vfcedbs %v18, %v3, %v20
++0xe7 0x23 0x40 0x10 0x3a 0xe8
++
++#CHECK: vfcedbs %v31, %v31, %v31
++0xe7 0xff 0xf0 0x10 0x3e 0xe8
++
++#CHECK: vfchdb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xeb
++
++#CHECK: vfchdb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xeb
++
++#CHECK: vfchdb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xeb
++
++#CHECK: vfchdbs %v0, %v0, %v0
++0xe7 0x00 0x00 0x10 0x30 0xeb
++
++#CHECK: vfchdbs %v18, %v3, %v20
++0xe7 0x23 0x40 0x10 0x3a 0xeb
++
++#CHECK: vfchdbs %v31, %v31, %v31
++0xe7 0xff 0xf0 0x10 0x3e 0xeb
++
++#CHECK: vfchedb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xea
++
++#CHECK: vfchedb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xea
++
++#CHECK: vfchedb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xea
++
++#CHECK: vfchedbs %v0, %v0, %v0
++0xe7 0x00 0x00 0x10 0x30 0xea
++
++#CHECK: vfchedbs %v18, %v3, %v20
++0xe7 0x23 0x40 0x10 0x3a 0xea
++
++#CHECK: vfchedbs %v31, %v31, %v31
++0xe7 0xff 0xf0 0x10 0x3e 0xea
++
++#CHECK: vfddb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xe5
++
++#CHECK: vfddb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xe5
++
++#CHECK: vfddb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xe5
++
++#CHECK: vfeeb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x80
++
++#CHECK: vfeeb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x80
++
++#CHECK: vfeebs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x04 0x80
++
++#CHECK: vfeezb %v18, %v3, %v20
++0xe7 0x23 0x40 0x20 0x0a 0x80
++
++#CHECK: vfeezbs %v7, %v24, %v9
++0xe7 0x78 0x90 0x30 0x04 0x80
++
++#CHECK: vfeeb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x80
++
++#CHECK: vfeef %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x80
++
++#CHECK: vfeef %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x80
++
++#CHECK: vfeefs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x24 0x80
++
++#CHECK: vfeezf %v18, %v3, %v20
++0xe7 0x23 0x40 0x20 0x2a 0x80
++
++#CHECK: vfeezfs %v7, %v24, %v9
++0xe7 0x78 0x90 0x30 0x24 0x80
++
++#CHECK: vfeef %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x80
++
++#CHECK: vfeeh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x80
++
++#CHECK: vfeeh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x80
++
++#CHECK: vfeehs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x14 0x80
++
++#CHECK: vfeezh %v18, %v3, %v20
++0xe7 0x23 0x40 0x20 0x1a 0x80
++
++#CHECK: vfeezhs %v7, %v24, %v9
++0xe7 0x78 0x90 0x30 0x14 0x80
++
++#CHECK: vfeeh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x80
++
++#CHECK: vfeneb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x81
++
++#CHECK: vfeneb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x81
++
++#CHECK: vfenebs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x04 0x81
++
++#CHECK: vfenezb %v18, %v3, %v20
++0xe7 0x23 0x40 0x20 0x0a 0x81
++
++#CHECK: vfenezbs %v7, %v24, %v9
++0xe7 0x78 0x90 0x30 0x04 0x81
++
++#CHECK: vfeneb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x81
++
++#CHECK: vfenef %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x81
++
++#CHECK: vfenef %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x81
++
++#CHECK: vfenefs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x24 0x81
++
++#CHECK: vfenezf %v18, %v3, %v20
++0xe7 0x23 0x40 0x20 0x2a 0x81
++
++#CHECK: vfenezfs %v7, %v24, %v9
++0xe7 0x78 0x90 0x30 0x24 0x81
++
++#CHECK: vfenef %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x81
++
++#CHECK: vfeneh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x81
++
++#CHECK: vfeneh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x81
++
++#CHECK: vfenehs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x14 0x81
++
++#CHECK: vfenezh %v18, %v3, %v20
++0xe7 0x23 0x40 0x20 0x1a 0x81
++
++#CHECK: vfenezhs %v7, %v24, %v9
++0xe7 0x78 0x90 0x30 0x14 0x81
++
++#CHECK: vfeneh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x81
++
++#CHECK: vfidb %v0, %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x30 0xc7
++
++#CHECK: vfidb %v19, %v14, 4, 10
++0xe7 0x3e 0x00 0xa4 0x38 0xc7
++
++#CHECK: vfidb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xf7 0x3c 0xc7
++
++#CHECK: vistrb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x5c
++
++#CHECK: vistrb %v18, %v3
++0xe7 0x23 0x00 0x00 0x08 0x5c
++
++#CHECK: vistrbs %v7, %v24
++0xe7 0x78 0x00 0x10 0x04 0x5c
++
++#CHECK: vistrb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0x5c
++
++#CHECK: vistrf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x5c
++
++#CHECK: vistrf %v18, %v3
++0xe7 0x23 0x00 0x00 0x28 0x5c
++
++#CHECK: vistrfs %v7, %v24
++0xe7 0x78 0x00 0x10 0x24 0x5c
++
++#CHECK: vistrf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0x5c
++
++#CHECK: vistrh %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x5c
++
++#CHECK: vistrh %v18, %v3
++0xe7 0x23 0x00 0x00 0x18 0x5c
++
++#CHECK: vistrhs %v7, %v24
++0xe7 0x78 0x00 0x10 0x14 0x5c
++
++#CHECK: vistrh %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0x5c
++
++#CHECK: vfmadb %v0, %v0, %v0, %v0
++0xe7 0x00 0x03 0x00 0x00 0x8f
++
++#CHECK: vfmadb %v3, %v20, %v5, %v22
++0xe7 0x34 0x53 0x00 0x65 0x8f
++
++#CHECK: vfmadb %v31, %v31, %v31, %v31
++0xe7 0xff 0xf3 0x00 0xff 0x8f
++
++#CHECK: vfmdb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xe7
++
++#CHECK: vfmdb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xe7
++
++#CHECK: vfmdb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xe7
++
++#CHECK: vfmsdb %v0, %v0, %v0, %v0
++0xe7 0x00 0x03 0x00 0x00 0x8e
++
++#CHECK: vfmsdb %v3, %v20, %v5, %v22
++0xe7 0x34 0x53 0x00 0x65 0x8e
++
++#CHECK: vfmsdb %v31, %v31, %v31, %v31
++0xe7 0xff 0xf3 0x00 0xff 0x8e
++
++#CHECK: vfsdb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xe2
++
++#CHECK: vfsdb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xe2
++
++#CHECK: vfsdb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xe2
++
++#CHECK: vzero %v0
++0xe7 0x00 0x00 0x00 0x00 0x44
++
++#CHECK: vgbm %v0, 1
++0xe7 0x00 0x00 0x01 0x00 0x44
++
++#CHECK: vgbm %v0, 65534
++0xe7 0x00 0xff 0xfe 0x00 0x44
++
++#CHECK: vone %v0
++0xe7 0x00 0xff 0xff 0x00 0x44
++
++#CHECK: vgbm %v17, 4660
++0xe7 0x10 0x12 0x34 0x08 0x44
++
++#CHECK: vone %v31
++0xe7 0xf0 0xff 0xff 0x08 0x44
++
++#CHECK: vgef %v0, 0(%v0), 0
++0xe7 0x00 0x00 0x00 0x00 0x13
++
++#CHECK: vgef %v10, 1000(%v19,%r7), 2
++0xe7 0xa3 0x73 0xe8 0x24 0x13
++
++#CHECK: vgef %v31, 4095(%v31,%r15), 3
++0xe7 0xff 0xff 0xff 0x3c 0x13
++
++#CHECK: vgeg %v0, 0(%v0), 0
++0xe7 0x00 0x00 0x00 0x00 0x12
++
++#CHECK: vgeg %v10, 1000(%v19,%r7), 1
++0xe7 0xa3 0x73 0xe8 0x14 0x12
++
++#CHECK: vgeg %v31, 4095(%v31,%r15), 1
++0xe7 0xff 0xff 0xff 0x1c 0x12
++
++#CHECK: vgfmab %v0, %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xbc
++
++#CHECK: vgfmab %v3, %v20, %v5, %v22
++0xe7 0x34 0x50 0x00 0x65 0xbc
++
++#CHECK: vgfmab %v31, %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0xff 0xbc
++
++#CHECK: vgfmaf %v0, %v0, %v0, %v0
++0xe7 0x00 0x02 0x00 0x00 0xbc
++
++#CHECK: vgfmaf %v3, %v20, %v5, %v22
++0xe7 0x34 0x52 0x00 0x65 0xbc
++
++#CHECK: vgfmaf %v31, %v31, %v31, %v31
++0xe7 0xff 0xf2 0x00 0xff 0xbc
++
++#CHECK: vgfmag %v0, %v0, %v0, %v0
++0xe7 0x00 0x03 0x00 0x00 0xbc
++
++#CHECK: vgfmag %v3, %v20, %v5, %v22
++0xe7 0x34 0x53 0x00 0x65 0xbc
++
++#CHECK: vgfmag %v31, %v31, %v31, %v31
++0xe7 0xff 0xf3 0x00 0xff 0xbc
++
++#CHECK: vgfmah %v0, %v0, %v0, %v0
++0xe7 0x00 0x01 0x00 0x00 0xbc
++
++#CHECK: vgfmah %v3, %v20, %v5, %v22
++0xe7 0x34 0x51 0x00 0x65 0xbc
++
++#CHECK: vgfmah %v31, %v31, %v31, %v31
++0xe7 0xff 0xf1 0x00 0xff 0xbc
++
++#CHECK: vgfmb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xb4
++
++#CHECK: vgfmb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xb4
++
++#CHECK: vgfmb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xb4
++
++#CHECK: vgfmf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xb4
++
++#CHECK: vgfmf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xb4
++
++#CHECK: vgfmf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xb4
++
++#CHECK: vgfmg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xb4
++
++#CHECK: vgfmg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xb4
++
++#CHECK: vgfmg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xb4
++
++#CHECK: vgfmh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xb4
++
++#CHECK: vgfmh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xb4
++
++#CHECK: vgfmh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xb4
++
++#CHECK: vgmb %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x46
++
++#CHECK: vgmb %v22, 55, 66
++0xe7 0x60 0x37 0x42 0x08 0x46
++
++#CHECK: vgmb %v31, 255, 255
++0xe7 0xf0 0xff 0xff 0x08 0x46
++
++#CHECK: vgmf %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x20 0x46
++
++#CHECK: vgmf %v22, 55, 66
++0xe7 0x60 0x37 0x42 0x28 0x46
++
++#CHECK: vgmf %v31, 255, 255
++0xe7 0xf0 0xff 0xff 0x28 0x46
++
++#CHECK: vgmg %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x30 0x46
++
++#CHECK: vgmg %v22, 55, 66
++0xe7 0x60 0x37 0x42 0x38 0x46
++
++#CHECK: vgmg %v31, 255, 255
++0xe7 0xf0 0xff 0xff 0x38 0x46
++
++#CHECK: vgmh %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x10 0x46
++
++#CHECK: vgmh %v22, 55, 66
++0xe7 0x60 0x37 0x42 0x18 0x46
++
++#CHECK: vgmh %v31, 255, 255
++0xe7 0xf0 0xff 0xff 0x18 0x46
++
++#CHECK: vl %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x06
++
++#CHECK: vl %v17, 2475(%r7,%r8)
++0xe7 0x17 0x89 0xab 0x08 0x06
++
++#CHECK: vl %v31, 4095(%r15,%r15)
++0xe7 0xff 0xff 0xff 0x08 0x06
++
++#CHECK: vlbb %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x07
++
++#CHECK: vlbb %v17, 2475(%r7,%r8), 12
++0xe7 0x17 0x89 0xab 0xc8 0x07
++
++#CHECK: vlbb %v31, 4095(%r15,%r15), 15
++0xe7 0xff 0xff 0xff 0xf8 0x07
++
++#CHECK: vlcb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xde
++
++#CHECK: vlcb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0xde
++
++#CHECK: vlcb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0xde
++
++#CHECK: vlcf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xde
++
++#CHECK: vlcf %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0xde
++
++#CHECK: vlcf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0xde
++
++#CHECK: vlcg %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xde
++
++#CHECK: vlcg %v19, %v14
++0xe7 0x3e 0x00 0x00 0x38 0xde
++
++#CHECK: vlcg %v31, %v31
++0xe7 0xff 0x00 0x00 0x3c 0xde
++
++#CHECK: vlch %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xde
++
++#CHECK: vlch %v19, %v14
++0xe7 0x3e 0x00 0x00 0x18 0xde
++
++#CHECK: vlch %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0xde
++
++#CHECK: vldeb %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xc4
++
++#CHECK: vldeb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0xc4
++
++#CHECK: vldeb %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0xc4
++
++#CHECK: vleb %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x00
++
++#CHECK: vleb %v17, 2475(%r7,%r8), 12
++0xe7 0x17 0x89 0xab 0xc8 0x00
++
++#CHECK: vleb %v31, 4095(%r15,%r15), 15
++0xe7 0xff 0xff 0xff 0xf8 0x00
++
++#CHECK: vledb %v0, %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x30 0xc5
++
++#CHECK: vledb %v19, %v14, 4, 10
++0xe7 0x3e 0x00 0xa4 0x38 0xc5
++
++#CHECK: vledb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xf7 0x3c 0xc5
++
++#CHECK: vlef %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x03
++
++#CHECK: vlef %v17, 2475(%r7,%r8), 2
++0xe7 0x17 0x89 0xab 0x28 0x03
++
++#CHECK: vlef %v31, 4095(%r15,%r15), 3
++0xe7 0xff 0xff 0xff 0x38 0x03
++
++#CHECK: vleg %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x02
++
++#CHECK: vleg %v17, 2475(%r7,%r8), 1
++0xe7 0x17 0x89 0xab 0x18 0x02
++
++#CHECK: vleg %v31, 4095(%r15,%r15), 1
++0xe7 0xff 0xff 0xff 0x18 0x02
++
++#CHECK: vleh %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x01
++
++#CHECK: vleh %v17, 2475(%r7,%r8), 5
++0xe7 0x17 0x89 0xab 0x58 0x01
++
++#CHECK: vleh %v31, 4095(%r15,%r15), 7
++0xe7 0xff 0xff 0xff 0x78 0x01
++
++#CHECK: vleib %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x40
++
++#CHECK: vleib %v23, -30293, 12
++0xe7 0x70 0x89 0xab 0xc8 0x40
++
++#CHECK: vleib %v31, -1, 15
++0xe7 0xf0 0xff 0xff 0xf8 0x40
++
++#CHECK: vleif %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x43
++
++#CHECK: vleif %v23, -30293, 2
++0xe7 0x70 0x89 0xab 0x28 0x43
++
++#CHECK: vleif %v31, -1, 3
++0xe7 0xf0 0xff 0xff 0x38 0x43
++
++#CHECK: vleig %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x42
++
++#CHECK: vleig %v23, -30293, 1
++0xe7 0x70 0x89 0xab 0x18 0x42
++
++#CHECK: vleig %v31, -1, 1
++0xe7 0xf0 0xff 0xff 0x18 0x42
++
++#CHECK: vleih %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x41
++
++#CHECK: vleih %v23, -30293, 5
++0xe7 0x70 0x89 0xab 0x58 0x41
++
++#CHECK: vleih %v31, -1, 7
++0xe7 0xf0 0xff 0xff 0x78 0x41
++
++#CHECK: vflcdb %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xcc
++
++#CHECK: vflcdb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x38 0xcc
++
++#CHECK: vflcdb %v31, %v31
++0xe7 0xff 0x00 0x00 0x3c 0xcc
++
++#CHECK: vflndb %v0, %v0
++0xe7 0x00 0x00 0x10 0x30 0xcc
++
++#CHECK: vflndb %v19, %v14
++0xe7 0x3e 0x00 0x10 0x38 0xcc
++
++#CHECK: vflndb %v31, %v31
++0xe7 0xff 0x00 0x10 0x3c 0xcc
++
++#CHECK: vflpdb %v0, %v0
++0xe7 0x00 0x00 0x20 0x30 0xcc
++
++#CHECK: vflpdb %v19, %v14
++0xe7 0x3e 0x00 0x20 0x38 0xcc
++
++#CHECK: vflpdb %v31, %v31
++0xe7 0xff 0x00 0x20 0x3c 0xcc
++
++#CHECK: vlgvb %r0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x21
++
++#CHECK: vlgvb %r2, %v19, 1383(%r4)
++0xe7 0x23 0x45 0x67 0x04 0x21
++
++#CHECK: vlgvb %r15, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x04 0x21
++
++#CHECK: vlgvf %r0, %v0, 0
++0xe7 0x00 0x00 0x00 0x20 0x21
++
++#CHECK: vlgvf %r2, %v19, 1383(%r4)
++0xe7 0x23 0x45 0x67 0x24 0x21
++
++#CHECK: vlgvf %r15, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x24 0x21
++
++#CHECK: vlgvg %r0, %v0, 0
++0xe7 0x00 0x00 0x00 0x30 0x21
++
++#CHECK: vlgvg %r2, %v19, 1383(%r4)
++0xe7 0x23 0x45 0x67 0x34 0x21
++
++#CHECK: vlgvg %r15, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x34 0x21
++
++#CHECK: vlgvh %r0, %v0, 0
++0xe7 0x00 0x00 0x00 0x10 0x21
++
++#CHECK: vlgvh %r2, %v19, 1383(%r4)
++0xe7 0x23 0x45 0x67 0x14 0x21
++
++#CHECK: vlgvh %r15, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x14 0x21
++
++#CHECK: vfsqdb %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xce
++
++#CHECK: vfsqdb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x38 0xce
++
++#CHECK: vfsqdb %v31, %v31
++0xe7 0xff 0x00 0x00 0x3c 0xce
++
++#CHECK: vftcidb %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x30 0x4a
++
++#CHECK: vftcidb %v19, %v4, 1383
++0xe7 0x34 0x56 0x70 0x38 0x4a
++
++#CHECK: vftcidb %v31, %v31, 4095
++0xe7 0xff 0xff 0xf0 0x3c 0x4a
++
++#CHECK: vll %v0, %r0, 0
++0xe7 0x00 0x00 0x00 0x00 0x37
++
++#CHECK: vll %v18, %r3, 1383(%r4)
++0xe7 0x23 0x45 0x67 0x08 0x37
++
++#CHECK: vll %v31, %r15, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x08 0x37
++
++#CHECK: vllezb %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x04
++
++#CHECK: vllezb %v17, 2475(%r7,%r8)
++0xe7 0x17 0x89 0xab 0x08 0x04
++
++#CHECK: vllezb %v31, 4095(%r15,%r15)
++0xe7 0xff 0xff 0xff 0x08 0x04
++
++#CHECK: vllezf %v0, 0
++0xe7 0x00 0x00 0x00 0x20 0x04
++
++#CHECK: vllezf %v17, 2475(%r7,%r8)
++0xe7 0x17 0x89 0xab 0x28 0x04
++
++#CHECK: vllezf %v31, 4095(%r15,%r15)
++0xe7 0xff 0xff 0xff 0x28 0x04
++
++#CHECK: vllezg %v0, 0
++0xe7 0x00 0x00 0x00 0x30 0x04
++
++#CHECK: vllezg %v17, 2475(%r7,%r8)
++0xe7 0x17 0x89 0xab 0x38 0x04
++
++#CHECK: vllezg %v31, 4095(%r15,%r15)
++0xe7 0xff 0xff 0xff 0x38 0x04
++
++#CHECK: vllezh %v0, 0
++0xe7 0x00 0x00 0x00 0x10 0x04
++
++#CHECK: vllezh %v17, 2475(%r7,%r8)
++0xe7 0x17 0x89 0xab 0x18 0x04
++
++#CHECK: vllezh %v31, 4095(%r15,%r15)
++0xe7 0xff 0xff 0xff 0x18 0x04
++
++#CHECK: vlm %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x36
++
++#CHECK: vlm %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x04 0x36
++
++#CHECK: vlm %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x0c 0x36
++
++#CHECK: vlpb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xdf
++
++#CHECK: vlpb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0xdf
++
++#CHECK: vlpb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0xdf
++
++#CHECK: vlpf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xdf
++
++#CHECK: vlpf %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0xdf
++
++#CHECK: vlpf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0xdf
++
++#CHECK: vlpg %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xdf
++
++#CHECK: vlpg %v19, %v14
++0xe7 0x3e 0x00 0x00 0x38 0xdf
++
++#CHECK: vlpg %v31, %v31
++0xe7 0xff 0x00 0x00 0x3c 0xdf
++
++#CHECK: vlph %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xdf
++
++#CHECK: vlph %v19, %v14
++0xe7 0x3e 0x00 0x00 0x18 0xdf
++
++#CHECK: vlph %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0xdf
++
++#CHECK: vlr %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x56
++
++#CHECK: vlr %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0x56
++
++#CHECK: vlr %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0x56
++
++#CHECK: vlrepb %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x05
++
++#CHECK: vlrepb %v17, 2475(%r7,%r8)
++0xe7 0x17 0x89 0xab 0x08 0x05
++
++#CHECK: vlrepb %v31, 4095(%r15,%r15)
++0xe7 0xff 0xff 0xff 0x08 0x05
++
++#CHECK: vlrepf %v0, 0
++0xe7 0x00 0x00 0x00 0x20 0x05
++
++#CHECK: vlrepf %v17, 2475(%r7,%r8)
++0xe7 0x17 0x89 0xab 0x28 0x05
++
++#CHECK: vlrepf %v31, 4095(%r15,%r15)
++0xe7 0xff 0xff 0xff 0x28 0x05
++
++#CHECK: vlrepg %v0, 0
++0xe7 0x00 0x00 0x00 0x30 0x05
++
++#CHECK: vlrepg %v17, 2475(%r7,%r8)
++0xe7 0x17 0x89 0xab 0x38 0x05
++
++#CHECK: vlrepg %v31, 4095(%r15,%r15)
++0xe7 0xff 0xff 0xff 0x38 0x05
++
++#CHECK: vlreph %v0, 0
++0xe7 0x00 0x00 0x00 0x10 0x05
++
++#CHECK: vlreph %v17, 2475(%r7,%r8)
++0xe7 0x17 0x89 0xab 0x18 0x05
++
++#CHECK: vlreph %v31, 4095(%r15,%r15)
++0xe7 0xff 0xff 0xff 0x18 0x05
++
++#CHECK: vlvgb %v0, %r0, 0
++0xe7 0x00 0x00 0x00 0x00 0x22
++
++#CHECK: vlvgb %v18, %r3, 1383(%r4)
++0xe7 0x23 0x45 0x67 0x08 0x22
++
++#CHECK: vlvgb %v31, %r15, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x08 0x22
++
++#CHECK: vlvgf %v0, %r0, 0
++0xe7 0x00 0x00 0x00 0x20 0x22
++
++#CHECK: vlvgf %v18, %r3, 1383(%r4)
++0xe7 0x23 0x45 0x67 0x28 0x22
++
++#CHECK: vlvgf %v31, %r15, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x28 0x22
++
++#CHECK: vlvgg %v0, %r0, 0
++0xe7 0x00 0x00 0x00 0x30 0x22
++
++#CHECK: vlvgg %v18, %r3, 1383(%r4)
++0xe7 0x23 0x45 0x67 0x38 0x22
++
++#CHECK: vlvgg %v31, %r15, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x38 0x22
++
++#CHECK: vlvgh %v0, %r0, 0
++0xe7 0x00 0x00 0x00 0x10 0x22
++
++#CHECK: vlvgh %v18, %r3, 1383(%r4)
++0xe7 0x23 0x45 0x67 0x18 0x22
++
++#CHECK: vlvgh %v31, %r15, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x18 0x22
++
++#CHECK: vlvgp %v0, %r0, %r0
++0xe7 0x00 0x00 0x00 0x00 0x62
++
++#CHECK: vlvgp %v18, %r3, %r4
++0xe7 0x23 0x40 0x00 0x08 0x62
++
++#CHECK: vlvgp %v31, %r15, %r15
++0xe7 0xff 0xf0 0x00 0x08 0x62
++
++#CHECK: vmaeb %v0, %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xae
++
++#CHECK: vmaeb %v3, %v20, %v5, %v22
++0xe7 0x34 0x50 0x00 0x65 0xae
++
++#CHECK: vmaeb %v31, %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0xff 0xae
++
++#CHECK: vmaef %v0, %v0, %v0, %v0
++0xe7 0x00 0x02 0x00 0x00 0xae
++
++#CHECK: vmaef %v3, %v20, %v5, %v22
++0xe7 0x34 0x52 0x00 0x65 0xae
++
++#CHECK: vmaef %v31, %v31, %v31, %v31
++0xe7 0xff 0xf2 0x00 0xff 0xae
++
++#CHECK: vmaeh %v0, %v0, %v0, %v0
++0xe7 0x00 0x01 0x00 0x00 0xae
++
++#CHECK: vmaeh %v3, %v20, %v5, %v22
++0xe7 0x34 0x51 0x00 0x65 0xae
++
++#CHECK: vmaeh %v31, %v31, %v31, %v31
++0xe7 0xff 0xf1 0x00 0xff 0xae
++
++#CHECK: vmahb %v0, %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xab
++
++#CHECK: vmahb %v3, %v20, %v5, %v22
++0xe7 0x34 0x50 0x00 0x65 0xab
++
++#CHECK: vmahb %v31, %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0xff 0xab
++
++#CHECK: vmahf %v0, %v0, %v0, %v0
++0xe7 0x00 0x02 0x00 0x00 0xab
++
++#CHECK: vmahf %v3, %v20, %v5, %v22
++0xe7 0x34 0x52 0x00 0x65 0xab
++
++#CHECK: vmahf %v31, %v31, %v31, %v31
++0xe7 0xff 0xf2 0x00 0xff 0xab
++
++#CHECK: vmahh %v0, %v0, %v0, %v0
++0xe7 0x00 0x01 0x00 0x00 0xab
++
++#CHECK: vmahh %v3, %v20, %v5, %v22
++0xe7 0x34 0x51 0x00 0x65 0xab
++
++#CHECK: vmahh %v31, %v31, %v31, %v31
++0xe7 0xff 0xf1 0x00 0xff 0xab
++
++#CHECK: vmalb %v0, %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xaa
++
++#CHECK: vmalb %v3, %v20, %v5, %v22
++0xe7 0x34 0x50 0x00 0x65 0xaa
++
++#CHECK: vmalb %v31, %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0xff 0xaa
++
++#CHECK: vmaleb %v0, %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xac
++
++#CHECK: vmaleb %v3, %v20, %v5, %v22
++0xe7 0x34 0x50 0x00 0x65 0xac
++
++#CHECK: vmaleb %v31, %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0xff 0xac
++
++#CHECK: vmalef %v0, %v0, %v0, %v0
++0xe7 0x00 0x02 0x00 0x00 0xac
++
++#CHECK: vmalef %v3, %v20, %v5, %v22
++0xe7 0x34 0x52 0x00 0x65 0xac
++
++#CHECK: vmalef %v31, %v31, %v31, %v31
++0xe7 0xff 0xf2 0x00 0xff 0xac
++
++#CHECK: vmaleh %v0, %v0, %v0, %v0
++0xe7 0x00 0x01 0x00 0x00 0xac
++
++#CHECK: vmaleh %v3, %v20, %v5, %v22
++0xe7 0x34 0x51 0x00 0x65 0xac
++
++#CHECK: vmaleh %v31, %v31, %v31, %v31
++0xe7 0xff 0xf1 0x00 0xff 0xac
++
++#CHECK: vmalf %v0, %v0, %v0, %v0
++0xe7 0x00 0x02 0x00 0x00 0xaa
++
++#CHECK: vmalf %v3, %v20, %v5, %v22
++0xe7 0x34 0x52 0x00 0x65 0xaa
++
++#CHECK: vmalf %v31, %v31, %v31, %v31
++0xe7 0xff 0xf2 0x00 0xff 0xaa
++
++#CHECK: vmalhb %v0, %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xa9
++
++#CHECK: vmalhb %v3, %v20, %v5, %v22
++0xe7 0x34 0x50 0x00 0x65 0xa9
++
++#CHECK: vmalhb %v31, %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0xff 0xa9
++
++#CHECK: vmalhf %v0, %v0, %v0, %v0
++0xe7 0x00 0x02 0x00 0x00 0xa9
++
++#CHECK: vmalhf %v3, %v20, %v5, %v22
++0xe7 0x34 0x52 0x00 0x65 0xa9
++
++#CHECK: vmalhf %v31, %v31, %v31, %v31
++0xe7 0xff 0xf2 0x00 0xff 0xa9
++
++#CHECK: vmalhh %v0, %v0, %v0, %v0
++0xe7 0x00 0x01 0x00 0x00 0xa9
++
++#CHECK: vmalhh %v3, %v20, %v5, %v22
++0xe7 0x34 0x51 0x00 0x65 0xa9
++
++#CHECK: vmalhh %v31, %v31, %v31, %v31
++0xe7 0xff 0xf1 0x00 0xff 0xa9
++
++#CHECK: vmalhw %v0, %v0, %v0, %v0
++0xe7 0x00 0x01 0x00 0x00 0xaa
++
++#CHECK: vmalhw %v3, %v20, %v5, %v22
++0xe7 0x34 0x51 0x00 0x65 0xaa
++
++#CHECK: vmalhw %v31, %v31, %v31, %v31
++0xe7 0xff 0xf1 0x00 0xff 0xaa
++
++#CHECK: vmalob %v0, %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xad
++
++#CHECK: vmalob %v3, %v20, %v5, %v22
++0xe7 0x34 0x50 0x00 0x65 0xad
++
++#CHECK: vmalob %v31, %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0xff 0xad
++
++#CHECK: vmalof %v0, %v0, %v0, %v0
++0xe7 0x00 0x02 0x00 0x00 0xad
++
++#CHECK: vmalof %v3, %v20, %v5, %v22
++0xe7 0x34 0x52 0x00 0x65 0xad
++
++#CHECK: vmalof %v31, %v31, %v31, %v31
++0xe7 0xff 0xf2 0x00 0xff 0xad
++
++#CHECK: vmaloh %v0, %v0, %v0, %v0
++0xe7 0x00 0x01 0x00 0x00 0xad
++
++#CHECK: vmaloh %v3, %v20, %v5, %v22
++0xe7 0x34 0x51 0x00 0x65 0xad
++
++#CHECK: vmaloh %v31, %v31, %v31, %v31
++0xe7 0xff 0xf1 0x00 0xff 0xad
++
++#CHECK: vmaob %v0, %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xaf
++
++#CHECK: vmaob %v3, %v20, %v5, %v22
++0xe7 0x34 0x50 0x00 0x65 0xaf
++
++#CHECK: vmaob %v31, %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0xff 0xaf
++
++#CHECK: vmaof %v0, %v0, %v0, %v0
++0xe7 0x00 0x02 0x00 0x00 0xaf
++
++#CHECK: vmaof %v3, %v20, %v5, %v22
++0xe7 0x34 0x52 0x00 0x65 0xaf
++
++#CHECK: vmaof %v31, %v31, %v31, %v31
++0xe7 0xff 0xf2 0x00 0xff 0xaf
++
++#CHECK: vmaoh %v0, %v0, %v0, %v0
++0xe7 0x00 0x01 0x00 0x00 0xaf
++
++#CHECK: vmaoh %v3, %v20, %v5, %v22
++0xe7 0x34 0x51 0x00 0x65 0xaf
++
++#CHECK: vmaoh %v31, %v31, %v31, %v31
++0xe7 0xff 0xf1 0x00 0xff 0xaf
++
++#CHECK: vmeb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xa6
++
++#CHECK: vmeb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xa6
++
++#CHECK: vmeb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xa6
++
++#CHECK: vmef %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xa6
++
++#CHECK: vmef %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xa6
++
++#CHECK: vmef %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xa6
++
++#CHECK: vmeh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xa6
++
++#CHECK: vmeh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xa6
++
++#CHECK: vmeh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xa6
++
++#CHECK: vmhb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xa3
++
++#CHECK: vmhb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xa3
++
++#CHECK: vmhb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xa3
++
++#CHECK: vmhf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xa3
++
++#CHECK: vmhf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xa3
++
++#CHECK: vmhf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xa3
++
++#CHECK: vmhh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xa3
++
++#CHECK: vmhh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xa3
++
++#CHECK: vmhh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xa3
++
++#CHECK: vmlb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xa2
++
++#CHECK: vmlb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xa2
++
++#CHECK: vmlb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xa2
++
++#CHECK: vmlf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xa2
++
++#CHECK: vmlf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xa2
++
++#CHECK: vmlf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xa2
++
++#CHECK: vmleb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xa4
++
++#CHECK: vmleb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xa4
++
++#CHECK: vmleb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xa4
++
++#CHECK: vmlef %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xa4
++
++#CHECK: vmlef %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xa4
++
++#CHECK: vmlef %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xa4
++
++#CHECK: vmleh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xa4
++
++#CHECK: vmleh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xa4
++
++#CHECK: vmleh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xa4
++
++#CHECK: vmlhb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xa1
++
++#CHECK: vmlhb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xa1
++
++#CHECK: vmlhb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xa1
++
++#CHECK: vmlhf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xa1
++
++#CHECK: vmlhf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xa1
++
++#CHECK: vmlhf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xa1
++
++#CHECK: vmlhh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xa1
++
++#CHECK: vmlhh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xa1
++
++#CHECK: vmlhh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xa1
++
++#CHECK: vmlhw %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xa2
++
++#CHECK: vmlhw %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xa2
++
++#CHECK: vmlhw %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xa2
++
++#CHECK: vmlob %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xa5
++
++#CHECK: vmlob %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xa5
++
++#CHECK: vmlob %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xa5
++
++#CHECK: vmlof %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xa5
++
++#CHECK: vmlof %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xa5
++
++#CHECK: vmlof %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xa5
++
++#CHECK: vmloh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xa5
++
++#CHECK: vmloh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xa5
++
++#CHECK: vmloh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xa5
++
++#CHECK: vmnb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xfe
++
++#CHECK: vmnb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xfe
++
++#CHECK: vmnb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xfe
++
++#CHECK: vmnf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xfe
++
++#CHECK: vmnf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xfe
++
++#CHECK: vmnf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xfe
++
++#CHECK: vmng %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xfe
++
++#CHECK: vmng %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xfe
++
++#CHECK: vmng %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xfe
++
++#CHECK: vmnh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xfe
++
++#CHECK: vmnh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xfe
++
++#CHECK: vmnh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xfe
++
++#CHECK: vmnlb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xfc
++
++#CHECK: vmnlb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xfc
++
++#CHECK: vmnlb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xfc
++
++#CHECK: vmnlf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xfc
++
++#CHECK: vmnlf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xfc
++
++#CHECK: vmnlf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xfc
++
++#CHECK: vmnlg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xfc
++
++#CHECK: vmnlg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xfc
++
++#CHECK: vmnlg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xfc
++
++#CHECK: vmnlh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xfc
++
++#CHECK: vmnlh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xfc
++
++#CHECK: vmnlh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xfc
++
++#CHECK: vmob %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xa7
++
++#CHECK: vmob %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xa7
++
++#CHECK: vmob %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xa7
++
++#CHECK: vmof %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xa7
++
++#CHECK: vmof %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xa7
++
++#CHECK: vmof %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xa7
++
++#CHECK: vmoh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xa7
++
++#CHECK: vmoh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xa7
++
++#CHECK: vmoh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xa7
++
++#CHECK: vmrhb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x61
++
++#CHECK: vmrhb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x61
++
++#CHECK: vmrhb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x61
++
++#CHECK: vmrhf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x61
++
++#CHECK: vmrhf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x61
++
++#CHECK: vmrhf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x61
++
++#CHECK: vmrhg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x61
++
++#CHECK: vmrhg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0x61
++
++#CHECK: vmrhg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0x61
++
++#CHECK: vmrhh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x61
++
++#CHECK: vmrhh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x61
++
++#CHECK: vmrhh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x61
++
++#CHECK: vmrlb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x60
++
++#CHECK: vmrlb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x60
++
++#CHECK: vmrlb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x60
++
++#CHECK: vmrlf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x60
++
++#CHECK: vmrlf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x60
++
++#CHECK: vmrlf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x60
++
++#CHECK: vmrlg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x60
++
++#CHECK: vmrlg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0x60
++
++#CHECK: vmrlg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0x60
++
++#CHECK: vmrlh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x60
++
++#CHECK: vmrlh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x60
++
++#CHECK: vmrlh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x60
++
++#CHECK: vmxb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xff
++
++#CHECK: vmxb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xff
++
++#CHECK: vmxb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xff
++
++#CHECK: vmxf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xff
++
++#CHECK: vmxf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xff
++
++#CHECK: vmxf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xff
++
++#CHECK: vmxg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xff
++
++#CHECK: vmxg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xff
++
++#CHECK: vmxg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xff
++
++#CHECK: vmxh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xff
++
++#CHECK: vmxh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xff
++
++#CHECK: vmxh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xff
++
++#CHECK: vmxlb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xfd
++
++#CHECK: vmxlb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xfd
++
++#CHECK: vmxlb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xfd
++
++#CHECK: vmxlf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xfd
++
++#CHECK: vmxlf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xfd
++
++#CHECK: vmxlf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xfd
++
++#CHECK: vmxlg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xfd
++
++#CHECK: vmxlg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xfd
++
++#CHECK: vmxlg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xfd
++
++#CHECK: vmxlh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xfd
++
++#CHECK: vmxlh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xfd
++
++#CHECK: vmxlh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xfd
++
++#CHECK: vn %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x68
++
++#CHECK: vn %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x68
++
++#CHECK: vn %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x68
++
++#CHECK: vnc %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x69
++
++#CHECK: vnc %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x69
++
++#CHECK: vnc %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x69
++
++#CHECK: vno %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x6b
++
++#CHECK: vno %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x6b
++
++#CHECK: vno %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x6b
++
++#CHECK: vo %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x6a
++
++#CHECK: vo %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x6a
++
++#CHECK: vo %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x6a
++
++#CHECK: vpdi %v0, %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x84
++
++#CHECK: vpdi %v3, %v20, %v5, 4
++0xe7 0x34 0x50 0x00 0x44 0x84
++
++#CHECK: vpdi %v31, %v31, %v31, 15
++0xe7 0xff 0xf0 0x00 0xfe 0x84
++
++#CHECK: vperm %v0, %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x8c
++
++#CHECK: vperm %v3, %v20, %v5, %v22
++0xe7 0x34 0x50 0x00 0x65 0x8c
++
++#CHECK: vperm %v31, %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0xff 0x8c
++
++#CHECK: vpkf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x94
++
++#CHECK: vpkf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x94
++
++#CHECK: vpkf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x94
++
++#CHECK: vpkg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x94
++
++#CHECK: vpkg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0x94
++
++#CHECK: vpkg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0x94
++
++#CHECK: vpkh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x94
++
++#CHECK: vpkh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x94
++
++#CHECK: vpkh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x94
++
++#CHECK: vpklsf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x95
++
++#CHECK: vpklsf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x95
++
++#CHECK: vpklsfs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x24 0x95
++
++#CHECK: vpklsf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x95
++
++#CHECK: vpklsg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x95
++
++#CHECK: vpklsg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0x95
++
++#CHECK: vpklsgs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x34 0x95
++
++#CHECK: vpklsg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0x95
++
++#CHECK: vpklsh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x95
++
++#CHECK: vpklsh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x95
++
++#CHECK: vpklshs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x14 0x95
++
++#CHECK: vpklsh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x95
++
++#CHECK: vpksf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x97
++
++#CHECK: vpksf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x97
++
++#CHECK: vpksfs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x24 0x97
++
++#CHECK: vpksf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x97
++
++#CHECK: vpksg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x97
++
++#CHECK: vpksg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0x97
++
++#CHECK: vpksgs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x34 0x97
++
++#CHECK: vpksg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0x97
++
++#CHECK: vpksh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x97
++
++#CHECK: vpksh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x97
++
++#CHECK: vpkshs %v7, %v24, %v9
++0xe7 0x78 0x90 0x10 0x14 0x97
++
++#CHECK: vpksh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x97
++
++#CHECK: vpopct %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x50
++
++#CHECK: vpopct %v19, %v14, 0
++0xe7 0x3e 0x00 0x00 0x08 0x50
++
++#CHECK: vpopct %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0x50
++
++#CHECK: vrepb %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x4d
++
++#CHECK: vrepb %v19, %v4, 22136
++0xe7 0x34 0x56 0x78 0x08 0x4d
++
++#CHECK: vrepb %v31, %v31, 65535
++0xe7 0xff 0xff 0xff 0x0c 0x4d
++
++#CHECK: vrepf %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x20 0x4d
++
++#CHECK: vrepf %v19, %v4, 22136
++0xe7 0x34 0x56 0x78 0x28 0x4d
++
++#CHECK: vrepf %v31, %v31, 65535
++0xe7 0xff 0xff 0xff 0x2c 0x4d
++
++#CHECK: vrepg %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x30 0x4d
++
++#CHECK: vrepg %v19, %v4, 22136
++0xe7 0x34 0x56 0x78 0x38 0x4d
++
++#CHECK: vrepg %v31, %v31, 65535
++0xe7 0xff 0xff 0xff 0x3c 0x4d
++
++#CHECK: vreph %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x10 0x4d
++
++#CHECK: vreph %v19, %v4, 22136
++0xe7 0x34 0x56 0x78 0x18 0x4d
++
++#CHECK: vreph %v31, %v31, 65535
++0xe7 0xff 0xff 0xff 0x1c 0x4d
++
++#CHECK: vrepib %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x45
++
++#CHECK: vrepib %v23, -30293
++0xe7 0x70 0x89 0xab 0x08 0x45
++
++#CHECK: vrepib %v31, -1
++0xe7 0xf0 0xff 0xff 0x08 0x45
++
++#CHECK: vrepif %v0, 0
++0xe7 0x00 0x00 0x00 0x20 0x45
++
++#CHECK: vrepif %v23, -30293
++0xe7 0x70 0x89 0xab 0x28 0x45
++
++#CHECK: vrepif %v31, -1
++0xe7 0xf0 0xff 0xff 0x28 0x45
++
++#CHECK: vrepig %v0, 0
++0xe7 0x00 0x00 0x00 0x30 0x45
++
++#CHECK: vrepig %v23, -30293
++0xe7 0x70 0x89 0xab 0x38 0x45
++
++#CHECK: vrepig %v31, -1
++0xe7 0xf0 0xff 0xff 0x38 0x45
++
++#CHECK: vrepih %v0, 0
++0xe7 0x00 0x00 0x00 0x10 0x45
++
++#CHECK: vrepih %v23, -30293
++0xe7 0x70 0x89 0xab 0x18 0x45
++
++#CHECK: vrepih %v31, -1
++0xe7 0xf0 0xff 0xff 0x18 0x45
++
++#CHECK: vsb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xf7
++
++#CHECK: vsb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xf7
++
++#CHECK: vsb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xf7
++
++#CHECK: vsbiq %v0, %v0, %v0, %v0
++0xe7 0x00 0x04 0x00 0x00 0xbf
++
++#CHECK: vsbiq %v3, %v20, %v5, %v22
++0xe7 0x34 0x54 0x00 0x65 0xbf
++
++#CHECK: vsbiq %v31, %v31, %v31, %v31
++0xe7 0xff 0xf4 0x00 0xff 0xbf
++
++#CHECK: vsbcbiq %v0, %v0, %v0, %v0
++0xe7 0x00 0x04 0x00 0x00 0xbd
++
++#CHECK: vsbcbiq %v3, %v20, %v5, %v22
++0xe7 0x34 0x54 0x00 0x65 0xbd
++
++#CHECK: vsbcbiq %v31, %v31, %v31, %v31
++0xe7 0xff 0xf4 0x00 0xff 0xbd
++
++#CHECK: vscbib %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xf5
++
++#CHECK: vscbib %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0xf5
++
++#CHECK: vscbib %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0xf5
++
++#CHECK: vscbif %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xf5
++
++#CHECK: vscbif %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xf5
++
++#CHECK: vscbif %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xf5
++
++#CHECK: vscbig %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xf5
++
++#CHECK: vscbig %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xf5
++
++#CHECK: vscbig %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xf5
++
++#CHECK: vscbih %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xf5
++
++#CHECK: vscbih %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xf5
++
++#CHECK: vscbih %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xf5
++
++#CHECK: vscbiq %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x40 0xf5
++
++#CHECK: vscbiq %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x4a 0xf5
++
++#CHECK: vscbiq %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x4e 0xf5
++
++#CHECK: vscef %v0, 0(%v0), 0
++0xe7 0x00 0x00 0x00 0x00 0x1b
++
++#CHECK: vscef %v10, 1000(%v19,%r7), 2
++0xe7 0xa3 0x73 0xe8 0x24 0x1b
++
++#CHECK: vscef %v31, 4095(%v31,%r15), 3
++0xe7 0xff 0xff 0xff 0x3c 0x1b
++
++#CHECK: vsceg %v0, 0(%v0), 0
++0xe7 0x00 0x00 0x00 0x00 0x1a
++
++#CHECK: vsceg %v10, 1000(%v19,%r7), 1
++0xe7 0xa3 0x73 0xe8 0x14 0x1a
++
++#CHECK: vsceg %v31, 4095(%v31,%r15), 1
++0xe7 0xff 0xff 0xff 0x1c 0x1a
++
++#CHECK: vsegb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x5f
++
++#CHECK: vsegb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0x5f
++
++#CHECK: vsegb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0x5f
++
++#CHECK: vsegf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x5f
++
++#CHECK: vsegf %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0x5f
++
++#CHECK: vsegf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0x5f
++
++#CHECK: vsegh %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x5f
++
++#CHECK: vsegh %v19, %v14
++0xe7 0x3e 0x00 0x00 0x18 0x5f
++
++#CHECK: vsegh %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0x5f
++
++#CHECK: vsel %v0, %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x8d
++
++#CHECK: vsel %v3, %v20, %v5, %v22
++0xe7 0x34 0x50 0x00 0x65 0x8d
++
++#CHECK: vsel %v31, %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0xff 0x8d
++
++#CHECK: vsf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xf7
++
++#CHECK: vsf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0xf7
++
++#CHECK: vsf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0xf7
++
++#CHECK: vsg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0xf7
++
++#CHECK: vsg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0xf7
++
++#CHECK: vsg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0xf7
++
++#CHECK: vsh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xf7
++
++#CHECK: vsh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0xf7
++
++#CHECK: vsh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0xf7
++
++#CHECK: vsl %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x74
++
++#CHECK: vsl %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x74
++
++#CHECK: vsl %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x74
++
++#CHECK: vslb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x75
++
++#CHECK: vslb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x75
++
++#CHECK: vslb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x75
++
++#CHECK: vsldb %v0, %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x77
++
++#CHECK: vsldb %v3, %v20, %v5, 103
++0xe7 0x34 0x50 0x67 0x04 0x77
++
++#CHECK: vsldb %v31, %v31, %v31, 255
++0xe7 0xff 0xf0 0xff 0x0e 0x77
++
++#CHECK: vsq %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x40 0xf7
++
++#CHECK: vsq %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x4a 0xf7
++
++#CHECK: vsq %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x4e 0xf7
++
++#CHECK: vsra %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x7e
++
++#CHECK: vsra %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x7e
++
++#CHECK: vsra %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x7e
++
++#CHECK: vsrab %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x7f
++
++#CHECK: vsrab %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x7f
++
++#CHECK: vsrab %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x7f
++
++#CHECK: vsrl %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x7c
++
++#CHECK: vsrl %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x7c
++
++#CHECK: vsrl %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x7c
++
++#CHECK: vsrlb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x7d
++
++#CHECK: vsrlb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x7d
++
++#CHECK: vsrlb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x7d
++
++#CHECK: vst %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x0E
++
++#CHECK: vst %v17, 2475(%r7,%r8)
++0xe7 0x17 0x89 0xab 0x08 0x0E
++
++#CHECK: vst %v31, 4095(%r15,%r15)
++0xe7 0xff 0xff 0xff 0x08 0x0E
++
++#CHECK: vsteb %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x08
++
++#CHECK: vsteb %v17, 2475(%r7,%r8), 12
++0xe7 0x17 0x89 0xab 0xc8 0x08
++
++#CHECK: vsteb %v31, 4095(%r15,%r15), 15
++0xe7 0xff 0xff 0xff 0xf8 0x08
++
++#CHECK: vstef %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x0b
++
++#CHECK: vstef %v17, 2475(%r7,%r8), 2
++0xe7 0x17 0x89 0xab 0x28 0x0b
++
++#CHECK: vstef %v31, 4095(%r15,%r15), 3
++0xe7 0xff 0xff 0xff 0x38 0x0b
++
++#CHECK: vsteg %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x0a
++
++#CHECK: vsteg %v17, 2475(%r7,%r8), 1
++0xe7 0x17 0x89 0xab 0x18 0x0a
++
++#CHECK: vsteg %v31, 4095(%r15,%r15), 1
++0xe7 0xff 0xff 0xff 0x18 0x0a
++
++#CHECK: vsteh %v0, 0, 0
++0xe7 0x00 0x00 0x00 0x00 0x09
++
++#CHECK: vsteh %v17, 2475(%r7,%r8), 5
++0xe7 0x17 0x89 0xab 0x58 0x09
++
++#CHECK: vsteh %v31, 4095(%r15,%r15), 7
++0xe7 0xff 0xff 0xff 0x78 0x09
++
++#CHECK: vstl %v0, %r0, 0
++0xe7 0x00 0x00 0x00 0x00 0x3f
++
++#CHECK: vstl %v18, %r3, 1383(%r4)
++0xe7 0x23 0x45 0x67 0x08 0x3f
++
++#CHECK: vstl %v31, %r15, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x08 0x3f
++
++#CHECK: vstm %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x3e
++
++#CHECK: vstm %v12, %v18, 1110(%r3)
++0xe7 0xc2 0x34 0x56 0x04 0x3e
++
++#CHECK: vstm %v31, %v31, 4095(%r15)
++0xe7 0xff 0xff 0xff 0x0c 0x3e
++
++#CHECK: vstrcb %v0, %v0, %v0, %v0, 0
++0xe7 0x00 0x00 0x00 0x00 0x8a
++
++#CHECK: vstrcb %v0, %v0, %v0, %v0, 12
++0xe7 0x00 0x00 0xc0 0x00 0x8a
++
++#CHECK: vstrcb %v18, %v3, %v20, %v5, 0
++0xe7 0x23 0x40 0x00 0x5a 0x8a
++
++#CHECK: vstrcb %v31, %v31, %v31, %v31, 4
++0xe7 0xff 0xf0 0x40 0xff 0x8a
++
++#CHECK: vstrcbs %v31, %v31, %v31, %v31, 8
++0xe7 0xff 0xf0 0x90 0xff 0x8a
++
++#CHECK: vstrczb %v31, %v31, %v31, %v31, 4
++0xe7 0xff 0xf0 0x60 0xff 0x8a
++
++#CHECK: vstrczbs %v31, %v31, %v31, %v31, 8
++0xe7 0xff 0xf0 0xb0 0xff 0x8a
++
++#CHECK: vstrcf %v0, %v0, %v0, %v0, 0
++0xe7 0x00 0x02 0x00 0x00 0x8a
++
++#CHECK: vstrcf %v0, %v0, %v0, %v0, 12
++0xe7 0x00 0x02 0xc0 0x00 0x8a
++
++#CHECK: vstrcf %v18, %v3, %v20, %v5, 0
++0xe7 0x23 0x42 0x00 0x5a 0x8a
++
++#CHECK: vstrcf %v31, %v31, %v31, %v31, 4
++0xe7 0xff 0xf2 0x40 0xff 0x8a
++
++#CHECK: vstrcfs %v31, %v31, %v31, %v31, 8
++0xe7 0xff 0xf2 0x90 0xff 0x8a
++
++#CHECK: vstrczf %v31, %v31, %v31, %v31, 4
++0xe7 0xff 0xf2 0x60 0xff 0x8a
++
++#CHECK: vstrczfs %v31, %v31, %v31, %v31, 8
++0xe7 0xff 0xf2 0xb0 0xff 0x8a
++
++#CHECK: vstrch %v0, %v0, %v0, %v0, 0
++0xe7 0x00 0x01 0x00 0x00 0x8a
++
++#CHECK: vstrch %v0, %v0, %v0, %v0, 12
++0xe7 0x00 0x01 0xc0 0x00 0x8a
++
++#CHECK: vstrch %v18, %v3, %v20, %v5, 0
++0xe7 0x23 0x41 0x00 0x5a 0x8a
++
++#CHECK: vstrch %v31, %v31, %v31, %v31, 4
++0xe7 0xff 0xf1 0x40 0xff 0x8a
++
++#CHECK: vstrchs %v31, %v31, %v31, %v31, 8
++0xe7 0xff 0xf1 0x90 0xff 0x8a
++
++#CHECK: vstrczh %v31, %v31, %v31, %v31, 4
++0xe7 0xff 0xf1 0x60 0xff 0x8a
++
++#CHECK: vstrczhs %v31, %v31, %v31, %v31, 8
++0xe7 0xff 0xf1 0xb0 0xff 0x8a
++
++#CHECK: vsumgh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x65
++
++#CHECK: vsumgh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x65
++
++#CHECK: vsumgh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x65
++
++#CHECK: vsumgf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x65
++
++#CHECK: vsumgf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x65
++
++#CHECK: vsumgf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x65
++
++#CHECK: vsumqf %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0x67
++
++#CHECK: vsumqf %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x2a 0x67
++
++#CHECK: vsumqf %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x2e 0x67
++
++#CHECK: vsumqg %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x30 0x67
++
++#CHECK: vsumqg %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x3a 0x67
++
++#CHECK: vsumqg %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x3e 0x67
++
++#CHECK: vsumb %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x64
++
++#CHECK: vsumb %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x64
++
++#CHECK: vsumb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x64
++
++#CHECK: vsumh %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0x64
++
++#CHECK: vsumh %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x1a 0x64
++
++#CHECK: vsumh %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x1e 0x64
++
++#CHECK: vtm %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xd8
++
++#CHECK: vtm %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0xd8
++
++#CHECK: vtm %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0xd8
++
++#CHECK: vuphb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xd7
++
++#CHECK: vuphb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0xd7
++
++#CHECK: vuphb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0xd7
++
++#CHECK: vuphf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xd7
++
++#CHECK: vuphf %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0xd7
++
++#CHECK: vuphf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0xd7
++
++#CHECK: vuphh %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xd7
++
++#CHECK: vuphh %v19, %v14
++0xe7 0x3e 0x00 0x00 0x18 0xd7
++
++#CHECK: vuphh %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0xd7
++
++#CHECK: vuplhb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xd5
++
++#CHECK: vuplhb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0xd5
++
++#CHECK: vuplhb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0xd5
++
++#CHECK: vuplhf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xd5
++
++#CHECK: vuplhf %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0xd5
++
++#CHECK: vuplhf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0xd5
++
++#CHECK: vuplhh %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xd5
++
++#CHECK: vuplhh %v19, %v14
++0xe7 0x3e 0x00 0x00 0x18 0xd5
++
++#CHECK: vuplhh %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0xd5
++
++#CHECK: vuplb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xd6
++
++#CHECK: vuplb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0xd6
++
++#CHECK: vuplb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0xd6
++
++#CHECK: vuplf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xd6
++
++#CHECK: vuplf %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0xd6
++
++#CHECK: vuplf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0xd6
++
++#CHECK: vuplhw %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xd6
++
++#CHECK: vuplhw %v19, %v14
++0xe7 0x3e 0x00 0x00 0x18 0xd6
++
++#CHECK: vuplhw %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0xd6
++
++#CHECK: vupllb %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0xd4
++
++#CHECK: vupllb %v19, %v14
++0xe7 0x3e 0x00 0x00 0x08 0xd4
++
++#CHECK: vupllb %v31, %v31
++0xe7 0xff 0x00 0x00 0x0c 0xd4
++
++#CHECK: vupllf %v0, %v0
++0xe7 0x00 0x00 0x00 0x20 0xd4
++
++#CHECK: vupllf %v19, %v14
++0xe7 0x3e 0x00 0x00 0x28 0xd4
++
++#CHECK: vupllf %v31, %v31
++0xe7 0xff 0x00 0x00 0x2c 0xd4
++
++#CHECK: vupllh %v0, %v0
++0xe7 0x00 0x00 0x00 0x10 0xd4
++
++#CHECK: vupllh %v19, %v14
++0xe7 0x3e 0x00 0x00 0x18 0xd4
++
++#CHECK: vupllh %v31, %v31
++0xe7 0xff 0x00 0x00 0x1c 0xd4
++
++#CHECK: vx %v0, %v0, %v0
++0xe7 0x00 0x00 0x00 0x00 0x6d
++
++#CHECK: vx %v18, %v3, %v20
++0xe7 0x23 0x40 0x00 0x0a 0x6d
++
++#CHECK: vx %v31, %v31, %v31
++0xe7 0xff 0xf0 0x00 0x0e 0x6d
++
++#CHECK: wcdgb %f0, %f0, 0, 0
++0xe7 0x00 0x00 0x08 0x30 0xc3
++
++#CHECK: wcdgb %v19, %f14, 4, 10
++0xe7 0x3e 0x00 0xac 0x38 0xc3
++
++#CHECK: wcdgb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xff 0x3c 0xc3
++
++#CHECK: wcdlgb %f0, %f0, 0, 0
++0xe7 0x00 0x00 0x08 0x30 0xc1
++
++#CHECK: wcdlgb %v19, %f14, 4, 10
++0xe7 0x3e 0x00 0xac 0x38 0xc1
++
++#CHECK: wcdlgb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xff 0x3c 0xc1
++
++#CHECK: wcgdb %f0, %f0, 0, 0
++0xe7 0x00 0x00 0x08 0x30 0xc2
++
++#CHECK: wcgdb %v19, %f14, 4, 10
++0xe7 0x3e 0x00 0xac 0x38 0xc2
++
++#CHECK: wcgdb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xff 0x3c 0xc2
++
++#CHECK: wclgdb %f0, %f0, 0, 0
++0xe7 0x00 0x00 0x08 0x30 0xc0
++
++#CHECK: wclgdb %v19, %f14, 4, 10
++0xe7 0x3e 0x00 0xac 0x38 0xc0
++
++#CHECK: wclgdb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xff 0x3c 0xc0
++
++#CHECK: wfadb %f0, %f0, %f0
++0xe7 0x00 0x00 0x08 0x30 0xe3
++
++#CHECK: wfadb %v18, %f3, %v20
++0xe7 0x23 0x40 0x08 0x3a 0xe3
++
++#CHECK: wfadb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x08 0x3e 0xe3
++
++#CHECK: wfcdb %f0, %f0
++0xe7 0x00 0x00 0x00 0x30 0xcb
++
++#CHECK: wfcdb %v19, %f14
++0xe7 0x3e 0x00 0x00 0x38 0xcb
++
++#CHECK: wfcdb %v31, %v31
++0xe7 0xff 0x00 0x00 0x3c 0xcb
++
++#CHECK: wfcedb %f0, %f0, %f0
++0xe7 0x00 0x00 0x08 0x30 0xe8
++
++#CHECK: wfcedb %v18, %f3, %v20
++0xe7 0x23 0x40 0x08 0x3a 0xe8
++
++#CHECK: wfcedb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x08 0x3e 0xe8
++
++#CHECK: wfcedbs %f0, %f0, %f0
++0xe7 0x00 0x00 0x18 0x30 0xe8
++
++#CHECK: wfcedbs %v18, %f3, %v20
++0xe7 0x23 0x40 0x18 0x3a 0xe8
++
++#CHECK: wfcedbs %v31, %v31, %v31
++0xe7 0xff 0xf0 0x18 0x3e 0xe8
++
++#CHECK: wfchdb %f0, %f0, %f0
++0xe7 0x00 0x00 0x08 0x30 0xeb
++
++#CHECK: wfchdb %v18, %f3, %v20
++0xe7 0x23 0x40 0x08 0x3a 0xeb
++
++#CHECK: wfchdb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x08 0x3e 0xeb
++
++#CHECK: wfchdbs %f0, %f0, %f0
++0xe7 0x00 0x00 0x18 0x30 0xeb
++
++#CHECK: wfchdbs %v18, %f3, %v20
++0xe7 0x23 0x40 0x18 0x3a 0xeb
++
++#CHECK: wfchdbs %v31, %v31, %v31
++0xe7 0xff 0xf0 0x18 0x3e 0xeb
++
++#CHECK: wfchedb %f0, %f0, %f0
++0xe7 0x00 0x00 0x08 0x30 0xea
++
++#CHECK: wfchedb %v18, %f3, %v20
++0xe7 0x23 0x40 0x08 0x3a 0xea
++
++#CHECK: wfchedb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x08 0x3e 0xea
++
++#CHECK: wfchedbs %f0, %f0, %f0
++0xe7 0x00 0x00 0x18 0x30 0xea
++
++#CHECK: wfchedbs %v18, %f3, %v20
++0xe7 0x23 0x40 0x18 0x3a 0xea
++
++#CHECK: wfchedbs %v31, %v31, %v31
++0xe7 0xff 0xf0 0x18 0x3e 0xea
++
++#CHECK: wfddb %f0, %f0, %f0
++0xe7 0x00 0x00 0x08 0x30 0xe5
++
++#CHECK: wfddb %v18, %f3, %v20
++0xe7 0x23 0x40 0x08 0x3a 0xe5
++
++#CHECK: wfddb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x08 0x3e 0xe5
++
++#CHECK: wfidb %f0, %f0, 0, 0
++0xe7 0x00 0x00 0x08 0x30 0xc7
++
++#CHECK: wfidb %v19, %f14, 4, 10
++0xe7 0x3e 0x00 0xac 0x38 0xc7
++
++#CHECK: wfidb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xff 0x3c 0xc7
++
++#CHECK: wfkdb %f0, %f0
++0xe7 0x00 0x00 0x00 0x30 0xca
++
++#CHECK: wfkdb %v19, %f14
++0xe7 0x3e 0x00 0x00 0x38 0xca
++
++#CHECK: wfkdb %v31, %v31
++0xe7 0xff 0x00 0x00 0x3c 0xca
++
++#CHECK: wflcdb %f0, %f0
++0xe7 0x00 0x00 0x08 0x30 0xcc
++
++#CHECK: wflcdb %v19, %f14
++0xe7 0x3e 0x00 0x08 0x38 0xcc
++
++#CHECK: wflcdb %v31, %v31
++0xe7 0xff 0x00 0x08 0x3c 0xcc
++
++#CHECK: wflndb %f0, %f0
++0xe7 0x00 0x00 0x18 0x30 0xcc
++
++#CHECK: wflndb %v19, %f14
++0xe7 0x3e 0x00 0x18 0x38 0xcc
++
++#CHECK: wflndb %v31, %v31
++0xe7 0xff 0x00 0x18 0x3c 0xcc
++
++#CHECK: wflpdb %f0, %f0
++0xe7 0x00 0x00 0x28 0x30 0xcc
++
++#CHECK: wflpdb %v19, %f14
++0xe7 0x3e 0x00 0x28 0x38 0xcc
++
++#CHECK: wflpdb %v31, %v31
++0xe7 0xff 0x00 0x28 0x3c 0xcc
++
++#CHECK: wfmadb %f0, %f0, %f0, %f0
++0xe7 0x00 0x03 0x08 0x00 0x8f
++
++#CHECK: wfmadb %f3, %v20, %f5, %v22
++0xe7 0x34 0x53 0x08 0x65 0x8f
++
++#CHECK: wfmadb %v31, %v31, %v31, %v31
++0xe7 0xff 0xf3 0x08 0xff 0x8f
++
++#CHECK: wfmdb %f0, %f0, %f0
++0xe7 0x00 0x00 0x08 0x30 0xe7
++
++#CHECK: wfmdb %v18, %f3, %v20
++0xe7 0x23 0x40 0x08 0x3a 0xe7
++
++#CHECK: wfmdb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x08 0x3e 0xe7
++
++#CHECK: wfmsdb %f0, %f0, %f0, %f0
++0xe7 0x00 0x03 0x08 0x00 0x8e
++
++#CHECK: wfmsdb %f3, %v20, %f5, %v22
++0xe7 0x34 0x53 0x08 0x65 0x8e
++
++#CHECK: wfmsdb %v31, %v31, %v31, %v31
++0xe7 0xff 0xf3 0x08 0xff 0x8e
++
++#CHECK: wfsdb %f0, %f0, %f0
++0xe7 0x00 0x00 0x08 0x30 0xe2
++
++#CHECK: wfsdb %v18, %f3, %v20
++0xe7 0x23 0x40 0x08 0x3a 0xe2
++
++#CHECK: wfsdb %v31, %v31, %v31
++0xe7 0xff 0xf0 0x08 0x3e 0xe2
++
++#CHECK: wfsqdb %f0, %f0
++0xe7 0x00 0x00 0x08 0x30 0xce
++
++#CHECK: wfsqdb %v19, %f14
++0xe7 0x3e 0x00 0x08 0x38 0xce
++
++#CHECK: wfsqdb %v31, %v31
++0xe7 0xff 0x00 0x08 0x3c 0xce
++
++#CHECK: wftcidb %f0, %f0, 0
++0xe7 0x00 0x00 0x08 0x30 0x4a
++
++#CHECK: wftcidb %v19, %f4, 1383
++0xe7 0x34 0x56 0x78 0x38 0x4a
++
++#CHECK: wftcidb %v31, %v31, 4095
++0xe7 0xff 0xff 0xf8 0x3c 0x4a
++
++#CHECK: wldeb %f0, %f0
++0xe7 0x00 0x00 0x08 0x20 0xc4
++
++#CHECK: wldeb %v19, %f14
++0xe7 0x3e 0x00 0x08 0x28 0xc4
++
++#CHECK: wldeb %v31, %v31
++0xe7 0xff 0x00 0x08 0x2c 0xc4
++
++#CHECK: wledb %f0, %f0, 0, 0
++0xe7 0x00 0x00 0x08 0x30 0xc5
++
++#CHECK: wledb %v19, %f14, 4, 10
++0xe7 0x3e 0x00 0xac 0x38 0xc5
++
++#CHECK: wledb %v31, %v31, 7, 15
++0xe7 0xff 0x00 0xff 0x3c 0xc5
+Index: llvm-36/test/MC/Disassembler/SystemZ/insns.txt
+===================================================================
+--- llvm-36.orig/test/MC/Disassembler/SystemZ/insns.txt
++++ llvm-36/test/MC/Disassembler/SystemZ/insns.txt
+@@ -2503,6 +2503,15 @@
+ # CHECK: ear %r15, %a15
+ 0xb2 0x4f 0x00 0xff
+ 
++# CHECK: etnd %r0
++0xb2 0xec 0x00 0x00
++
++# CHECK: etnd %r15
++0xb2 0xec 0x00 0xf0
++
++# CHECK: etnd %r7
++0xb2 0xec 0x00 0x70 ++ + # CHECK: fidbr %f0, 0, %f0 + 0xb3 0x5f 0x00 0x00 + +@@ -6034,6 +6043,36 @@ + # CHECK: ny %r15, 0 + 0xe3 0xf0 0x00 0x00 0x00 0x54 + ++# CHECK: ntstg %r0, -524288 ++0xe3 0x00 0x00 0x00 0x80 0x25 ++ ++# CHECK: ntstg %r0, -1 ++0xe3 0x00 0x0f 0xff 0xff 0x25 ++ ++# CHECK: ntstg %r0, 0 ++0xe3 0x00 0x00 0x00 0x00 0x25 ++ ++# CHECK: ntstg %r0, 1 ++0xe3 0x00 0x00 0x01 0x00 0x25 ++ ++# CHECK: ntstg %r0, 524287 ++0xe3 0x00 0x0f 0xff 0x7f 0x25 ++ ++# CHECK: ntstg %r0, 0(%r1) ++0xe3 0x00 0x10 0x00 0x00 0x25 ++ ++# CHECK: ntstg %r0, 0(%r15) ++0xe3 0x00 0xf0 0x00 0x00 0x25 ++ ++# CHECK: ntstg %r0, 524287(%r1,%r15) ++0xe3 0x01 0xff 0xff 0x7f 0x25 ++ ++# CHECK: ntstg %r0, 524287(%r15,%r1) ++0xe3 0x0f 0x1f 0xff 0x7f 0x25 ++ ++# CHECK: ntstg %r15, 0 ++0xe3 0xf0 0x00 0x00 0x00 0x25 ++ + # CHECK: oc 0(1), 0 + 0xd6 0x00 0x00 0x00 0x00 0x00 + +@@ -6334,6 +6373,33 @@ + # CHECK: pfd 15, 0 + 0xe3 0xf0 0x00 0x00 0x00 0x36 + ++# CHECK: popcnt %r0, %r0 ++0xb9 0xe1 0x00 0x00 ++ ++# CHECK: popcnt %r0, %r15 ++0xb9 0xe1 0x00 0x0f ++ ++# CHECK: popcnt %r15, %r0 ++0xb9 0xe1 0x00 0xf0 ++ ++# CHECK: popcnt %r7, %r8 ++0xb9 0xe1 0x00 0x78 ++ ++# CHECK: ppa %r0, %r0, 0 ++0xb2 0xe8 0x00 0x00 ++ ++# CHECK: ppa %r0, %r0, 15 ++0xb2 0xe8 0xf0 0x00 ++ ++# CHECK: ppa %r0, %r15, 0 ++0xb2 0xe8 0x00 0x0f ++ ++# CHECK: ppa %r4, %r6, 7 ++0xb2 0xe8 0x70 0x46 ++ ++# CHECK: ppa %r15, %r0, 0 ++0xb2 0xe8 0x00 0xf0 ++ + # CHECK: risbg %r0, %r0, 0, 0, 0 + 0xec 0x00 0x00 0x00 0x00 0x55 + +@@ -6355,6 +6421,27 @@ + # CHECK: risbg %r4, %r5, 6, 7, 8 + 0xec 0x45 0x06 0x07 0x08 0x55 + ++# CHECK: risbgn %r0, %r0, 0, 0, 0 ++0xec 0x00 0x00 0x00 0x00 0x59 ++ ++# CHECK: risbgn %r0, %r0, 0, 0, 63 ++0xec 0x00 0x00 0x00 0x3f 0x59 ++ ++# CHECK: risbgn %r0, %r0, 0, 255, 0 ++0xec 0x00 0x00 0xff 0x00 0x59 ++ ++# CHECK: risbgn %r0, %r0, 255, 0, 0 ++0xec 0x00 0xff 0x00 0x00 0x59 ++ ++# CHECK: risbgn %r0, %r15, 0, 0, 0 ++0xec 0x0f 0x00 0x00 0x00 0x59 ++ ++# CHECK: risbgn %r15, %r0, 0, 0, 0 ++0xec 0xf0 0x00 0x00 0x00 0x59 ++ ++# CHECK: risbgn %r4, %r5, 6, 7, 8 ++0xec 0x45 0x06 0x07 0x08 0x59 ++ + # CHECK: risbhg %r0, %r0, 0, 0, 0 + 0xec 0x00 0x00 0x00 0x00 0x5d + +@@ -8029,6 +8116,93 @@ + # CHECK: sy %r15, 0 + 0xe3 0xf0 0x00 0x00 0x00 0x5b + ++# CHECK: tabort 0 ++0xb2 0xfc 0x00 0x00 ++ ++# CHECK: tabort 0(%r1) ++0xb2 0xfc 0x10 0x00 ++ ++# CHECK: tabort 0(%r15) ++0xb2 0xfc 0xf0 0x00 ++ ++# CHECK: tabort 4095 ++0xb2 0xfc 0x0f 0xff ++ ++# CHECK: tabort 4095(%r1) ++0xb2 0xfc 0x1f 0xff ++ ++# CHECK: tabort 4095(%r15) ++0xb2 0xfc 0xff 0xff ++ ++# CHECK: tbegin 0, 0 ++0xe5 0x60 0x00 0x00 0x00 0x00 ++ ++# CHECK: tbegin 4095, 0 ++0xe5 0x60 0x0f 0xff 0x00 0x00 ++ ++# CHECK: tbegin 0, 0 ++0xe5 0x60 0x00 0x00 0x00 0x00 ++ ++# CHECK: tbegin 0, 1 ++0xe5 0x60 0x00 0x00 0x00 0x01 ++ ++# CHECK: tbegin 0, 32767 ++0xe5 0x60 0x00 0x00 0x7f 0xff ++ ++# CHECK: tbegin 0, 32768 ++0xe5 0x60 0x00 0x00 0x80 0x00 ++ ++# CHECK: tbegin 0, 65535 ++0xe5 0x60 0x00 0x00 0xff 0xff ++ ++# CHECK: tbegin 0(%r1), 42 ++0xe5 0x60 0x10 0x00 0x00 0x2a ++ ++# CHECK: tbegin 0(%r15), 42 ++0xe5 0x60 0xf0 0x00 0x00 0x2a ++ ++# CHECK: tbegin 4095(%r1), 42 ++0xe5 0x60 0x1f 0xff 0x00 0x2a ++ ++# CHECK: tbegin 4095(%r15), 42 ++0xe5 0x60 0xff 0xff 0x00 0x2a ++ ++# CHECK: tbeginc 0, 0 ++0xe5 0x61 0x00 0x00 0x00 0x00 ++ ++# CHECK: tbeginc 4095, 0 ++0xe5 0x61 0x0f 0xff 0x00 0x00 ++ ++# CHECK: tbeginc 0, 0 ++0xe5 0x61 0x00 0x00 0x00 0x00 ++ ++# CHECK: tbeginc 0, 1 ++0xe5 0x61 0x00 0x00 0x00 0x01 ++ ++# CHECK: tbeginc 0, 32767 ++0xe5 0x61 0x00 0x00 0x7f 0xff ++ ++# CHECK: tbeginc 0, 32768 ++0xe5 0x61 
0x00 0x00 0x80 0x00 ++ ++# CHECK: tbeginc 0, 65535 ++0xe5 0x61 0x00 0x00 0xff 0xff ++ ++# CHECK: tbeginc 0(%r1), 42 ++0xe5 0x61 0x10 0x00 0x00 0x2a ++ ++# CHECK: tbeginc 0(%r15), 42 ++0xe5 0x61 0xf0 0x00 0x00 0x2a ++ ++# CHECK: tbeginc 4095(%r1), 42 ++0xe5 0x61 0x1f 0xff 0x00 0x2a ++ ++# CHECK: tbeginc 4095(%r15), 42 ++0xe5 0x61 0xff 0xff 0x00 0x2a ++ ++# CHECK: tend ++0xb2 0xf8 0x00 0x00 ++ + # CHECK: tm 0, 0 + 0x91 0x00 0x00 0x00 + +Index: llvm-36/test/MC/SystemZ/fixups.s +=================================================================== +--- /dev/null ++++ llvm-36/test/MC/SystemZ/fixups.s +@@ -0,0 +1,119 @@ ++ ++# RUN: llvm-mc -triple s390x-unknown-unknown --show-encoding %s | FileCheck %s ++ ++# RUN: llvm-mc -triple s390x-unknown-unknown -filetype=obj %s | \ ++# RUN: llvm-readobj -r | FileCheck %s -check-prefix=CHECK-REL ++ ++# CHECK: larl %r14, target # encoding: [0xc0,0xe0,A,A,A,A] ++# CHECK-NEXT: # fixup A - offset: 2, value: target+2, kind: FK_390_PC32DBL ++# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_PC32DBL target 0x2 ++ .align 16 ++ larl %r14, target ++ ++# CHECK: larl %r14, target@GOT # encoding: [0xc0,0xe0,A,A,A,A] ++# CHECK-NEXT: # fixup A - offset: 2, value: target@GOT+2, kind: FK_390_PC32DBL ++# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_GOTENT target 0x2 ++ .align 16 ++ larl %r14, target@got ++ ++# CHECK: larl %r14, target@INDNTPOFF # encoding: [0xc0,0xe0,A,A,A,A] ++# CHECK-NEXT: # fixup A - offset: 2, value: target@INDNTPOFF+2, kind: FK_390_PC32DBL ++# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_TLS_IEENT target 0x2 ++ .align 16 ++ larl %r14, target@indntpoff ++ ++# CHECK: brasl %r14, target # encoding: [0xc0,0xe5,A,A,A,A] ++# CHECK-NEXT: # fixup A - offset: 2, value: target+2, kind: FK_390_PC32DBL ++# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_PC32DBL target 0x2 ++ .align 16 ++ brasl %r14, target ++ ++# CHECK: brasl %r14, target@PLT # encoding: [0xc0,0xe5,A,A,A,A] ++# CHECK-NEXT: # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC32DBL ++# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_PLT32DBL target 0x2 ++ .align 16 ++ brasl %r14, target@plt ++ ++# CHECK: brasl %r14, target@PLT:tls_gdcall:sym # encoding: [0xc0,0xe5,A,A,A,A] ++# CHECK-NEXT: # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC32DBL ++# CHECK-NEXT: # fixup B - offset: 0, value: sym@TLSGD, kind: FK_390_TLS_CALL ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_GDCALL sym 0x0 ++# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_PLT32DBL target 0x2 ++ .align 16 ++ brasl %r14, target@plt:tls_gdcall:sym ++ ++# CHECK: brasl %r14, target@PLT:tls_ldcall:sym # encoding: [0xc0,0xe5,A,A,A,A] ++# CHECK-NEXT: # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC32DBL ++# CHECK-NEXT: # fixup B - offset: 0, value: sym@TLSLDM, kind: FK_390_TLS_CALL ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LDCALL sym 0x0 ++# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_PLT32DBL target 0x2 ++ .align 16 ++ brasl %r14, target@plt:tls_ldcall:sym ++ ++# CHECK: bras %r14, target # encoding: [0xa7,0xe5,A,A] ++# CHECK-NEXT: # fixup A - offset: 2, value: target+2, kind: FK_390_PC16DBL ++# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_PC16DBL target 0x2 ++ .align 16 ++ bras %r14, target ++ ++# CHECK: bras %r14, target@PLT # encoding: [0xa7,0xe5,A,A] ++# CHECK-NEXT: # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC16DBL ++# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_PLT16DBL target 0x2 ++ .align 16 ++ bras %r14, target@plt ++ ++# CHECK: bras %r14, target@PLT:tls_gdcall:sym # encoding: [0xa7,0xe5,A,A] ++# CHECK-NEXT: # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC16DBL ++# CHECK-NEXT: # fixup B - offset: 0, 
value: sym@TLSGD, kind: FK_390_TLS_CALL ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_GDCALL sym 0x0 ++# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_PLT16DBL target 0x2 ++ .align 16 ++ bras %r14, target@plt:tls_gdcall:sym ++ ++# CHECK: bras %r14, target@PLT:tls_ldcall:sym # encoding: [0xa7,0xe5,A,A] ++# CHECK-NEXT: # fixup A - offset: 2, value: target@PLT+2, kind: FK_390_PC16DBL ++# CHECK-NEXT: # fixup B - offset: 0, value: sym@TLSLDM, kind: FK_390_TLS_CALL ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LDCALL sym 0x0 ++# CHECK-REL: 0x{{[0-9A-F]*2}} R_390_PLT16DBL target 0x2 ++ .align 16 ++ bras %r14, target@plt:tls_ldcall:sym ++ ++ ++# Data relocs ++# llvm-mc does not show any "encoding" string for data, so we just check the relocs ++ ++# CHECK-REL: .rela.data ++ .data ++ ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LE64 target 0x0 ++ .align 16 ++ .quad target@ntpoff ++ ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LDO64 target 0x0 ++ .align 16 ++ .quad target@dtpoff ++ ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LDM64 target 0x0 ++ .align 16 ++ .quad target@tlsldm ++ ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_GD64 target 0x0 ++ .align 16 ++ .quad target@tlsgd ++ ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LE32 target 0x0 ++ .align 16 ++ .long target@ntpoff ++ ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LDO32 target 0x0 ++ .align 16 ++ .long target@dtpoff ++ ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_LDM32 target 0x0 ++ .align 16 ++ .long target@tlsldm ++ ++# CHECK-REL: 0x{{[0-9A-F]*0}} R_390_TLS_GD32 target 0x0 ++ .align 16 ++ .long target@tlsgd ++ +Index: llvm-36/test/MC/SystemZ/insn-bad-z13.s +=================================================================== +--- /dev/null ++++ llvm-36/test/MC/SystemZ/insn-bad-z13.s +@@ -0,0 +1,1201 @@ ++# For z13 only. ++# RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=z13 < %s 2> %t ++# RUN: FileCheck < %t %s ++ ++#CHECK: error: invalid operand ++#CHECK: lcbb %r0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: lcbb %r0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: lcbb %r0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: lcbb %r0, 4096, 0 ++#CHECK: error: invalid use of vector addressing ++#CHECK: lcbb %r0, 0(%v1,%r2), 0 ++ ++ lcbb %r0, 0, -1 ++ lcbb %r0, 0, 16 ++ lcbb %r0, -1, 0 ++ lcbb %r0, 4096, 0 ++ lcbb %r0, 0(%v1,%r2), 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vcdgb %v0, %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vcdgb %v0, %v0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: vcdgb %v0, %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vcdgb %v0, %v0, 16, 0 ++ ++ vcdgb %v0, %v0, 0, -1 ++ vcdgb %v0, %v0, 0, 16 ++ vcdgb %v0, %v0, -1, 0 ++ vcdgb %v0, %v0, 16, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vcdlgb %v0, %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vcdlgb %v0, %v0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: vcdlgb %v0, %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vcdlgb %v0, %v0, 16, 0 ++ ++ vcdlgb %v0, %v0, 0, -1 ++ vcdlgb %v0, %v0, 0, 16 ++ vcdlgb %v0, %v0, -1, 0 ++ vcdlgb %v0, %v0, 16, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vcgdb %v0, %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vcgdb %v0, %v0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: vcgdb %v0, %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vcgdb %v0, %v0, 16, 0 ++ ++ vcgdb %v0, %v0, 0, -1 ++ vcgdb %v0, %v0, 0, 16 ++ vcgdb %v0, %v0, -1, 0 ++ vcgdb %v0, %v0, 16, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vclgdb %v0, %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vclgdb %v0, %v0, 0, 16 ++#CHECK: error: invalid 
operand ++#CHECK: vclgdb %v0, %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vclgdb %v0, %v0, 16, 0 ++ ++ vclgdb %v0, %v0, 0, -1 ++ vclgdb %v0, %v0, 0, 16 ++ vclgdb %v0, %v0, -1, 0 ++ vclgdb %v0, %v0, 16, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: verimb %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: verimb %v0, %v0, %v0, 256 ++ ++ verimb %v0, %v0, %v0, -1 ++ verimb %v0, %v0, %v0, 256 ++ ++#CHECK: error: invalid operand ++#CHECK: verimf %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: verimf %v0, %v0, %v0, 256 ++ ++ verimf %v0, %v0, %v0, -1 ++ verimf %v0, %v0, %v0, 256 ++ ++#CHECK: error: invalid operand ++#CHECK: verimg %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: verimg %v0, %v0, %v0, 256 ++ ++ verimg %v0, %v0, %v0, -1 ++ verimg %v0, %v0, %v0, 256 ++ ++#CHECK: error: invalid operand ++#CHECK: verimh %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: verimh %v0, %v0, %v0, 256 ++ ++ verimh %v0, %v0, %v0, -1 ++ verimh %v0, %v0, %v0, 256 ++ ++#CHECK: error: invalid operand ++#CHECK: verllb %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: verllb %v0, %v0, 4096 ++ ++ verllb %v0, %v0, -1 ++ verllb %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: verllf %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: verllf %v0, %v0, 4096 ++ ++ verllf %v0, %v0, -1 ++ verllf %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: verllg %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: verllg %v0, %v0, 4096 ++ ++ verllg %v0, %v0, -1 ++ verllg %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: verllh %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: verllh %v0, %v0, 4096 ++ ++ verllh %v0, %v0, -1 ++ verllh %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: veslb %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: veslb %v0, %v0, 4096 ++ ++ veslb %v0, %v0, -1 ++ veslb %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: veslf %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: veslf %v0, %v0, 4096 ++ ++ veslf %v0, %v0, -1 ++ veslf %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: veslg %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: veslg %v0, %v0, 4096 ++ ++ veslg %v0, %v0, -1 ++ veslg %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: veslh %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: veslh %v0, %v0, 4096 ++ ++ veslh %v0, %v0, -1 ++ veslh %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: vesrab %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vesrab %v0, %v0, 4096 ++ ++ vesrab %v0, %v0, -1 ++ vesrab %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: vesraf %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vesraf %v0, %v0, 4096 ++ ++ vesraf %v0, %v0, -1 ++ vesraf %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: vesrag %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vesrag %v0, %v0, 4096 ++ ++ vesrag %v0, %v0, -1 ++ vesrag %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: vesrah %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vesrah %v0, %v0, 4096 ++ ++ vesrah %v0, %v0, -1 ++ vesrah %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: vesrlb %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vesrlb %v0, %v0, 4096 ++ ++ vesrlb %v0, %v0, -1 ++ vesrlb %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: vesrlf %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vesrlf %v0, %v0, 4096 ++ ++ vesrlf %v0, %v0, -1 ++ vesrlf %v0, %v0, 4096 ++ 
++#CHECK: error: invalid operand ++#CHECK: vesrlg %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vesrlg %v0, %v0, 4096 ++ ++ vesrlg %v0, %v0, -1 ++ vesrlg %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: vesrlh %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vesrlh %v0, %v0, 4096 ++ ++ vesrlh %v0, %v0, -1 ++ vesrlh %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: vfaeb %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vfaeb %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vfaeb %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vfaeb %v0, %v0, %v0, 0, 0 ++ ++ vfaeb %v0, %v0, %v0, -1 ++ vfaeb %v0, %v0, %v0, 16 ++ vfaeb %v0, %v0 ++ vfaeb %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vfaebs %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vfaebs %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vfaebs %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vfaebs %v0, %v0, %v0, 0, 0 ++ ++ vfaebs %v0, %v0, %v0, -1 ++ vfaebs %v0, %v0, %v0, 16 ++ vfaebs %v0, %v0 ++ vfaebs %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vfaef %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vfaef %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vfaef %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vfaef %v0, %v0, %v0, 0, 0 ++ ++ vfaef %v0, %v0, %v0, -1 ++ vfaef %v0, %v0, %v0, 16 ++ vfaef %v0, %v0 ++ vfaef %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vfaeh %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vfaeh %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vfaeh %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vfaeh %v0, %v0, %v0, 0, 0 ++ ++ vfaeh %v0, %v0, %v0, -1 ++ vfaeh %v0, %v0, %v0, 16 ++ vfaeh %v0, %v0 ++ vfaeh %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vfaezh %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vfaezh %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vfaezh %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vfaezh %v0, %v0, %v0, 0, 0 ++ ++ vfaezh %v0, %v0, %v0, -1 ++ vfaezh %v0, %v0, %v0, 16 ++ vfaezh %v0, %v0 ++ vfaezh %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vfaezfs %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vfaezfs %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vfaezfs %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vfaezfs %v0, %v0, %v0, 0, 0 ++ ++ vfaezfs %v0, %v0, %v0, -1 ++ vfaezfs %v0, %v0, %v0, 16 ++ vfaezfs %v0, %v0 ++ vfaezfs %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vfidb %v0, %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vfidb %v0, %v0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: vfidb %v0, %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vfidb %v0, %v0, 16, 0 ++ ++ vfidb %v0, %v0, 0, -1 ++ vfidb %v0, %v0, 0, 16 ++ vfidb %v0, %v0, -1, 0 ++ vfidb %v0, %v0, 16, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vftcidb %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vftcidb %v0, %v0, 4096 ++ ++ vftcidb %v0, %v0, -1 ++ vftcidb %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: vgbm %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vgbm %v0, 0x10000 ++ ++ vgbm %v0, -1 ++ vgbm %v0, 0x10000 ++ ++#CHECK: error: vector index required ++#CHECK: vgef %v0, 0(%r1), 0 ++#CHECK: error: vector index required ++#CHECK: vgef %v0, 0(%r2,%r1), 0 ++#CHECK: error: invalid operand ++#CHECK: vgef %v0, 0(%v0,%r1), -1 ++#CHECK: error: 
invalid operand
++#CHECK: vgef %v0, 0(%v0,%r1), 4
++#CHECK: error: invalid operand
++#CHECK: vgef %v0, -1(%v0,%r1), 0
++#CHECK: error: invalid operand
++#CHECK: vgef %v0, 4096(%v0,%r1), 0
++
++ vgef %v0, 0(%r1), 0
++ vgef %v0, 0(%r2,%r1), 0
++ vgef %v0, 0(%v0,%r1), -1
++ vgef %v0, 0(%v0,%r1), 4
++ vgef %v0, -1(%v0,%r1), 0
++ vgef %v0, 4096(%v0,%r1), 0
++
++#CHECK: error: vector index required
++#CHECK: vgeg %v0, 0(%r1), 0
++#CHECK: error: vector index required
++#CHECK: vgeg %v0, 0(%r2,%r1), 0
++#CHECK: error: invalid operand
++#CHECK: vgeg %v0, 0(%v0,%r1), -1
++#CHECK: error: invalid operand
++#CHECK: vgeg %v0, 0(%v0,%r1), 2
++#CHECK: error: invalid operand
++#CHECK: vgeg %v0, -1(%v0,%r1), 0
++#CHECK: error: invalid operand
++#CHECK: vgeg %v0, 4096(%v0,%r1), 0
++
++ vgeg %v0, 0(%r1), 0
++ vgeg %v0, 0(%r2,%r1), 0
++ vgeg %v0, 0(%v0,%r1), -1
++ vgeg %v0, 0(%v0,%r1), 2
++ vgeg %v0, -1(%v0,%r1), 0
++ vgeg %v0, 4096(%v0,%r1), 0
++
++#CHECK: error: invalid operand
++#CHECK: vgmb %v0, 0, -1
++#CHECK: error: invalid operand
++#CHECK: vgmb %v0, 0, 256
++#CHECK: error: invalid operand
++#CHECK: vgmb %v0, -1, 0
++#CHECK: error: invalid operand
++#CHECK: vgmb %v0, 256, 0
++
++ vgmb %v0, 0, -1
++ vgmb %v0, 0, 256
++ vgmb %v0, -1, 0
++ vgmb %v0, 256, 0
++
++#CHECK: error: invalid operand
++#CHECK: vgmf %v0, 0, -1
++#CHECK: error: invalid operand
++#CHECK: vgmf %v0, 0, 256
++#CHECK: error: invalid operand
++#CHECK: vgmf %v0, -1, 0
++#CHECK: error: invalid operand
++#CHECK: vgmf %v0, 256, 0
++
++ vgmf %v0, 0, -1
++ vgmf %v0, 0, 256
++ vgmf %v0, -1, 0
++ vgmf %v0, 256, 0
++
++#CHECK: error: invalid operand
++#CHECK: vgmg %v0, 0, -1
++#CHECK: error: invalid operand
++#CHECK: vgmg %v0, 0, 256
++#CHECK: error: invalid operand
++#CHECK: vgmg %v0, -1, 0
++#CHECK: error: invalid operand
++#CHECK: vgmg %v0, 256, 0
++
++ vgmg %v0, 0, -1
++ vgmg %v0, 0, 256
++ vgmg %v0, -1, 0
++ vgmg %v0, 256, 0
++
++#CHECK: error: invalid operand
++#CHECK: vgmh %v0, 0, -1
++#CHECK: error: invalid operand
++#CHECK: vgmh %v0, 0, 256
++#CHECK: error: invalid operand
++#CHECK: vgmh %v0, -1, 0
++#CHECK: error: invalid operand
++#CHECK: vgmh %v0, 256, 0
++
++ vgmh %v0, 0, -1
++ vgmh %v0, 0, 256
++ vgmh %v0, -1, 0
++ vgmh %v0, 256, 0
++
++#CHECK: error: invalid operand
++#CHECK: vl %v0, -1
++#CHECK: error: invalid operand
++#CHECK: vl %v0, 4096
++#CHECK: error: invalid use of vector addressing
++#CHECK: vl %v0, 0(%v1,%r2)
++
++ vl %v0, -1
++ vl %v0, 4096
++ vl %v0, 0(%v1,%r2)
++
++#CHECK: error: invalid operand
++#CHECK: vlbb %v0, 0, -1
++#CHECK: error: invalid operand
++#CHECK: vlbb %v0, 0, 16
++#CHECK: error: invalid operand
++#CHECK: vlbb %v0, -1, 0
++#CHECK: error: invalid operand
++#CHECK: vlbb %v0, 4096, 0
++#CHECK: error: invalid use of vector addressing
++#CHECK: vlbb %v0, 0(%v1,%r2), 0
++
++ vlbb %v0, 0, -1
++ vlbb %v0, 0, 16
++ vlbb %v0, -1, 0
++ vlbb %v0, 4096, 0
++ vlbb %v0, 0(%v1,%r2), 0
++
++#CHECK: error: invalid operand
++#CHECK: vleb %v0, 0, -1
++#CHECK: error: invalid operand
++#CHECK: vleb %v0, 0, 16
++#CHECK: error: invalid operand
++#CHECK: vleb %v0, -1, 0
++#CHECK: error: invalid operand
++#CHECK: vleb %v0, 4096, 0
++#CHECK: error: invalid use of vector addressing
++#CHECK: vleb %v0, 0(%v1,%r2), 0
++
++ vleb %v0, 0, -1
++ vleb %v0, 0, 16
++ vleb %v0, -1, 0
++ vleb %v0, 4096, 0
++ vleb %v0, 0(%v1,%r2), 0
++
++#CHECK: error: invalid operand
++#CHECK: vledb %v0, %v0, 0, -1
++#CHECK: error: invalid operand
++#CHECK: vledb %v0, %v0, 0, 16
++#CHECK: error: invalid operand
++#CHECK: vledb %v0, %v0, -1, 0
++#CHECK: error: invalid
operand ++#CHECK: vledb %v0, %v0, 16, 0 ++ ++ vledb %v0, %v0, 0, -1 ++ vledb %v0, %v0, 0, 16 ++ vledb %v0, %v0, -1, 0 ++ vledb %v0, %v0, 16, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vlef %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlef %v0, 0, 4 ++#CHECK: error: invalid operand ++#CHECK: vlef %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vlef %v0, 4096, 0 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vlef %v0, 0(%v1,%r2), 0 ++ ++ vlef %v0, 0, -1 ++ vlef %v0, 0, 4 ++ vlef %v0, -1, 0 ++ vlef %v0, 4096, 0 ++ vlef %v0, 0(%v1,%r2), 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vleg %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vleg %v0, 0, 2 ++#CHECK: error: invalid operand ++#CHECK: vleg %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vleg %v0, 4096, 0 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vleg %v0, 0(%v1,%r2), 0 ++ ++ vleg %v0, 0, -1 ++ vleg %v0, 0, 2 ++ vleg %v0, -1, 0 ++ vleg %v0, 4096, 0 ++ vleg %v0, 0(%v1,%r2), 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vleh %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vleh %v0, 0, 8 ++#CHECK: error: invalid operand ++#CHECK: vleh %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vleh %v0, 4096, 0 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vleh %v0, 0(%v1,%r2), 0 ++ ++ vleh %v0, 0, -1 ++ vleh %v0, 0, 8 ++ vleh %v0, -1, 0 ++ vleh %v0, 4096, 0 ++ vleh %v0, 0(%v1,%r2), 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vleib %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vleib %v0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: vleib %v0, -32769, 0 ++#CHECK: error: invalid operand ++#CHECK: vleib %v0, 32768, 0 ++ ++ vleib %v0, 0, -1 ++ vleib %v0, 0, 16 ++ vleib %v0, -32769, 0 ++ vleib %v0, 32768, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vleif %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vleif %v0, 0, 4 ++#CHECK: error: invalid operand ++#CHECK: vleif %v0, -32769, 0 ++#CHECK: error: invalid operand ++#CHECK: vleif %v0, 32768, 0 ++ ++ vleif %v0, 0, -1 ++ vleif %v0, 0, 4 ++ vleif %v0, -32769, 0 ++ vleif %v0, 32768, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vleig %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vleig %v0, 0, 2 ++#CHECK: error: invalid operand ++#CHECK: vleig %v0, -32769, 0 ++#CHECK: error: invalid operand ++#CHECK: vleig %v0, 32768, 0 ++ ++ vleig %v0, 0, -1 ++ vleig %v0, 0, 2 ++ vleig %v0, -32769, 0 ++ vleig %v0, 32768, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vleih %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vleih %v0, 0, 8 ++#CHECK: error: invalid operand ++#CHECK: vleih %v0, -32769, 0 ++#CHECK: error: invalid operand ++#CHECK: vleih %v0, 32768, 0 ++ ++ vleih %v0, 0, -1 ++ vleih %v0, 0, 8 ++ vleih %v0, -32769, 0 ++ vleih %v0, 32768, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vlgvb %r0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlgvb %r0, %v0, 4096 ++#CHECK: error: %r0 used in an address ++#CHECK: vlgvb %r0, %v0, 0(%r0) ++ ++ vlgvb %r0, %v0, -1 ++ vlgvb %r0, %v0, 4096 ++ vlgvb %r0, %v0, 0(%r0) ++ ++#CHECK: error: invalid operand ++#CHECK: vlgvf %r0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlgvf %r0, %v0, 4096 ++#CHECK: error: %r0 used in an address ++#CHECK: vlgvf %r0, %v0, 0(%r0) ++ ++ vlgvf %r0, %v0, -1 ++ vlgvf %r0, %v0, 4096 ++ vlgvf %r0, %v0, 0(%r0) ++ ++#CHECK: error: invalid operand ++#CHECK: vlgvg %r0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlgvg %r0, %v0, 4096 ++#CHECK: error: %r0 used in an address ++#CHECK: vlgvg %r0, %v0, 0(%r0) ++ ++ 
vlgvg %r0, %v0, -1 ++ vlgvg %r0, %v0, 4096 ++ vlgvg %r0, %v0, 0(%r0) ++ ++#CHECK: error: invalid operand ++#CHECK: vlgvh %r0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlgvh %r0, %v0, 4096 ++#CHECK: error: %r0 used in an address ++#CHECK: vlgvh %r0, %v0, 0(%r0) ++ ++ vlgvh %r0, %v0, -1 ++ vlgvh %r0, %v0, 4096 ++ vlgvh %r0, %v0, 0(%r0) ++ ++#CHECK: error: invalid operand ++#CHECK: vll %v0, %r0, -1 ++#CHECK: error: invalid operand ++#CHECK: vll %v0, %r0, 4096 ++#CHECK: error: %r0 used in an address ++#CHECK: vll %v0, %r0, 0(%r0) ++ ++ vll %v0, %r0, -1 ++ vll %v0, %r0, 4096 ++ vll %v0, %r0, 0(%r0) ++ ++#CHECK: error: invalid operand ++#CHECK: vllezb %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vllezb %v0, 4096 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vllezb %v0, 0(%v1,%r2) ++ ++ vllezb %v0, -1 ++ vllezb %v0, 4096 ++ vllezb %v0, 0(%v1,%r2) ++ ++#CHECK: error: invalid operand ++#CHECK: vllezf %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vllezf %v0, 4096 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vllezf %v0, 0(%v1,%r2) ++ ++ vllezf %v0, -1 ++ vllezf %v0, 4096 ++ vllezf %v0, 0(%v1,%r2) ++ ++#CHECK: error: invalid operand ++#CHECK: vllezg %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vllezg %v0, 4096 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vllezg %v0, 0(%v1,%r2) ++ ++ vllezg %v0, -1 ++ vllezg %v0, 4096 ++ vllezg %v0, 0(%v1,%r2) ++ ++#CHECK: error: invalid operand ++#CHECK: vllezh %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vllezh %v0, 4096 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vllezh %v0, 0(%v1,%r2) ++ ++ vllezh %v0, -1 ++ vllezh %v0, 4096 ++ vllezh %v0, 0(%v1,%r2) ++ ++#CHECK: error: invalid operand ++#CHECK: vlm %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlm %v0, %v0, 4096 ++ ++ vlm %v0, %v0, -1 ++ vlm %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: vlrepb %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlrepb %v0, 4096 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vlrepb %v0, 0(%v1,%r2) ++ ++ vlrepb %v0, -1 ++ vlrepb %v0, 4096 ++ vlrepb %v0, 0(%v1,%r2) ++ ++#CHECK: error: invalid operand ++#CHECK: vlrepf %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlrepf %v0, 4096 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vlrepf %v0, 0(%v1,%r2) ++ ++ vlrepf %v0, -1 ++ vlrepf %v0, 4096 ++ vlrepf %v0, 0(%v1,%r2) ++ ++#CHECK: error: invalid operand ++#CHECK: vlrepg %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlrepg %v0, 4096 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vlrepg %v0, 0(%v1,%r2) ++ ++ vlrepg %v0, -1 ++ vlrepg %v0, 4096 ++ vlrepg %v0, 0(%v1,%r2) ++ ++#CHECK: error: invalid operand ++#CHECK: vlreph %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlreph %v0, 4096 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vlreph %v0, 0(%v1,%r2) ++ ++ vlreph %v0, -1 ++ vlreph %v0, 4096 ++ vlreph %v0, 0(%v1,%r2) ++ ++#CHECK: error: invalid operand ++#CHECK: vlvgb %v0, %r0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlvgb %v0, %r0, 4096 ++#CHECK: error: %r0 used in an address ++#CHECK: vlvgb %v0, %r0, 0(%r0) ++ ++ vlvgb %v0, %r0, -1 ++ vlvgb %v0, %r0, 4096 ++ vlvgb %v0, %r0, 0(%r0) ++ ++#CHECK: error: invalid operand ++#CHECK: vlvgf %v0, %r0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlvgf %v0, %r0, 4096 ++#CHECK: error: %r0 used in an address ++#CHECK: vlvgf %v0, %r0, 0(%r0) ++ ++ vlvgf %v0, %r0, -1 ++ vlvgf %v0, %r0, 4096 ++ vlvgf %v0, %r0, 0(%r0) ++ ++#CHECK: error: invalid operand ++#CHECK: vlvgg 
%v0, %r0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlvgg %v0, %r0, 4096 ++#CHECK: error: %r0 used in an address ++#CHECK: vlvgg %v0, %r0, 0(%r0) ++ ++ vlvgg %v0, %r0, -1 ++ vlvgg %v0, %r0, 4096 ++ vlvgg %v0, %r0, 0(%r0) ++ ++#CHECK: error: invalid operand ++#CHECK: vlvgh %v0, %r0, -1 ++#CHECK: error: invalid operand ++#CHECK: vlvgh %v0, %r0, 4096 ++#CHECK: error: %r0 used in an address ++#CHECK: vlvgh %v0, %r0, 0(%r0) ++ ++ vlvgh %v0, %r0, -1 ++ vlvgh %v0, %r0, 4096 ++ vlvgh %v0, %r0, 0(%r0) ++ ++#CHECK: error: invalid operand ++#CHECK: vpdi %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vpdi %v0, %v0, %v0, 16 ++ ++ vpdi %v0, %v0, %v0, -1 ++ vpdi %v0, %v0, %v0, 16 ++ ++#CHECK: error: invalid operand ++#CHECK: vrepb %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vrepb %v0, %v0, 65536 ++ ++ vrepb %v0, %v0, -1 ++ vrepb %v0, %v0, 65536 ++ ++#CHECK: error: invalid operand ++#CHECK: vrepf %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vrepf %v0, %v0, 65536 ++ ++ vrepf %v0, %v0, -1 ++ vrepf %v0, %v0, 65536 ++ ++#CHECK: error: invalid operand ++#CHECK: vrepg %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vrepg %v0, %v0, 65536 ++ ++ vrepg %v0, %v0, -1 ++ vrepg %v0, %v0, 65536 ++ ++#CHECK: error: invalid operand ++#CHECK: vreph %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vreph %v0, %v0, 65536 ++ ++ vreph %v0, %v0, -1 ++ vreph %v0, %v0, 65536 ++ ++#CHECK: error: invalid operand ++#CHECK: vrepib %v0, -32769 ++#CHECK: error: invalid operand ++#CHECK: vrepib %v0, 32768 ++ ++ vrepib %v0, -32769 ++ vrepib %v0, 32768 ++ ++#CHECK: error: invalid operand ++#CHECK: vrepif %v0, -32769 ++#CHECK: error: invalid operand ++#CHECK: vrepif %v0, 32768 ++ ++ vrepif %v0, -32769 ++ vrepif %v0, 32768 ++ ++#CHECK: error: invalid operand ++#CHECK: vrepig %v0, -32769 ++#CHECK: error: invalid operand ++#CHECK: vrepig %v0, 32768 ++ ++ vrepig %v0, -32769 ++ vrepig %v0, 32768 ++ ++#CHECK: error: invalid operand ++#CHECK: vrepih %v0, -32769 ++#CHECK: error: invalid operand ++#CHECK: vrepih %v0, 32768 ++ ++ vrepih %v0, -32769 ++ vrepih %v0, 32768 ++ ++#CHECK: error: vector index required ++#CHECK: vscef %v0, 0(%r1), 0 ++#CHECK: error: vector index required ++#CHECK: vscef %v0, 0(%r2,%r1), 0 ++#CHECK: error: invalid operand ++#CHECK: vscef %v0, 0(%v0,%r1), -1 ++#CHECK: error: invalid operand ++#CHECK: vscef %v0, 0(%v0,%r1), 4 ++#CHECK: error: invalid operand ++#CHECK: vscef %v0, -1(%v0,%r1), 0 ++#CHECK: error: invalid operand ++#CHECK: vscef %v0, 4096(%v0,%r1), 0 ++ ++ vscef %v0, 0(%r1), 0 ++ vscef %v0, 0(%r2,%r1), 0 ++ vscef %v0, 0(%v0,%r1), -1 ++ vscef %v0, 0(%v0,%r1), 4 ++ vscef %v0, -1(%v0,%r1), 0 ++ vscef %v0, 4096(%v0,%r1), 0 ++ ++#CHECK: error: vector index required ++#CHECK: vsceg %v0, 0(%r1), 0 ++#CHECK: error: vector index required ++#CHECK: vsceg %v0, 0(%r2,%r1), 0 ++#CHECK: error: invalid operand ++#CHECK: vsceg %v0, 0(%v0,%r1), -1 ++#CHECK: error: invalid operand ++#CHECK: vsceg %v0, 0(%v0,%r1), 2 ++#CHECK: error: invalid operand ++#CHECK: vsceg %v0, -1(%v0,%r1), 0 ++#CHECK: error: invalid operand ++#CHECK: vsceg %v0, 4096(%v0,%r1), 0 ++ ++ vsceg %v0, 0(%r1), 0 ++ vsceg %v0, 0(%r2,%r1), 0 ++ vsceg %v0, 0(%v0,%r1), -1 ++ vsceg %v0, 0(%v0,%r1), 2 ++ vsceg %v0, -1(%v0,%r1), 0 ++ vsceg %v0, 4096(%v0,%r1), 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vsldb %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vsldb %v0, %v0, %v0, 256 ++ ++ vsldb %v0, %v0, %v0, -1 ++ vsldb %v0, %v0, %v0, 256 ++ ++#CHECK: error: invalid operand ++#CHECK: vst %v0, -1 
++#CHECK: error: invalid operand ++#CHECK: vst %v0, 4096 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vst %v0, 0(%v1,%r2) ++ ++ vst %v0, -1 ++ vst %v0, 4096 ++ vst %v0, 0(%v1,%r2) ++ ++#CHECK: error: invalid operand ++#CHECK: vsteb %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vsteb %v0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: vsteb %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vsteb %v0, 4096, 0 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vsteb %v0, 0(%v1,%r2), 0 ++ ++ vsteb %v0, 0, -1 ++ vsteb %v0, 0, 16 ++ vsteb %v0, -1, 0 ++ vsteb %v0, 4096, 0 ++ vsteb %v0, 0(%v1,%r2), 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vstef %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vstef %v0, 0, 4 ++#CHECK: error: invalid operand ++#CHECK: vstef %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vstef %v0, 4096, 0 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vstef %v0, 0(%v1,%r2), 0 ++ ++ vstef %v0, 0, -1 ++ vstef %v0, 0, 4 ++ vstef %v0, -1, 0 ++ vstef %v0, 4096, 0 ++ vstef %v0, 0(%v1,%r2), 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vsteg %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vsteg %v0, 0, 2 ++#CHECK: error: invalid operand ++#CHECK: vsteg %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vsteg %v0, 4096, 0 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vsteg %v0, 0(%v1,%r2), 0 ++ ++ vsteg %v0, 0, -1 ++ vsteg %v0, 0, 2 ++ vsteg %v0, -1, 0 ++ vsteg %v0, 4096, 0 ++ vsteg %v0, 0(%v1,%r2), 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vsteh %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: vsteh %v0, 0, 8 ++#CHECK: error: invalid operand ++#CHECK: vsteh %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: vsteh %v0, 4096, 0 ++#CHECK: error: invalid use of vector addressing ++#CHECK: vsteh %v0, 0(%v1,%r2), 0 ++ ++ vsteh %v0, 0, -1 ++ vsteh %v0, 0, 8 ++ vsteh %v0, -1, 0 ++ vsteh %v0, 4096, 0 ++ vsteh %v0, 0(%v1,%r2), 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vstl %v0, %r0, -1 ++#CHECK: error: invalid operand ++#CHECK: vstl %v0, %r0, 4096 ++#CHECK: error: %r0 used in an address ++#CHECK: vstl %v0, %r0, 0(%r0) ++ ++ vstl %v0, %r0, -1 ++ vstl %v0, %r0, 4096 ++ vstl %v0, %r0, 0(%r0) ++ ++#CHECK: error: invalid operand ++#CHECK: vstm %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vstm %v0, %v0, 4096 ++ ++ vstm %v0, %v0, -1 ++ vstm %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: vstrcb %v0, %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vstrcb %v0, %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vstrcb %v0, %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vstrcb %v0, %v0, %v0, %v0, 0, 0 ++ ++ vstrcb %v0, %v0, %v0, %v0, -1 ++ vstrcb %v0, %v0, %v0, %v0, 16 ++ vstrcb %v0, %v0, %v0 ++ vstrcb %v0, %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vstrcbs %v0, %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vstrcbs %v0, %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vstrcbs %v0, %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vstrcbs %v0, %v0, %v0, %v0, 0, 0 ++ ++ vstrcbs %v0, %v0, %v0, %v0, -1 ++ vstrcbs %v0, %v0, %v0, %v0, 16 ++ vstrcbs %v0, %v0, %v0 ++ vstrcbs %v0, %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vstrcf %v0, %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vstrcf %v0, %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vstrcf %v0, %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vstrcf %v0, %v0, %v0, %v0, 0, 0 ++ ++ vstrcf %v0, 
%v0, %v0, %v0, -1 ++ vstrcf %v0, %v0, %v0, %v0, 16 ++ vstrcf %v0, %v0, %v0 ++ vstrcf %v0, %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vstrch %v0, %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vstrch %v0, %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vstrch %v0, %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vstrch %v0, %v0, %v0, %v0, 0, 0 ++ ++ vstrch %v0, %v0, %v0, %v0, -1 ++ vstrch %v0, %v0, %v0, %v0, 16 ++ vstrch %v0, %v0, %v0 ++ vstrch %v0, %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vstrczh %v0, %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vstrczh %v0, %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vstrczh %v0, %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vstrczh %v0, %v0, %v0, %v0, 0, 0 ++ ++ vstrczh %v0, %v0, %v0, %v0, -1 ++ vstrczh %v0, %v0, %v0, %v0, 16 ++ vstrczh %v0, %v0, %v0 ++ vstrczh %v0, %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: vstrczfs %v0, %v0, %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: vstrczfs %v0, %v0, %v0, %v0, 16 ++#CHECK: error: too few operands ++#CHECK: vstrczfs %v0, %v0, %v0 ++#CHECK: error: invalid operand ++#CHECK: vstrczfs %v0, %v0, %v0, %v0, 0, 0 ++ ++ vstrczfs %v0, %v0, %v0, %v0, -1 ++ vstrczfs %v0, %v0, %v0, %v0, 16 ++ vstrczfs %v0, %v0, %v0 ++ vstrczfs %v0, %v0, %v0, %v0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: wcdgb %v0, %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: wcdgb %v0, %v0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: wcdgb %v0, %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: wcdgb %v0, %v0, 16, 0 ++ ++ wcdgb %v0, %v0, 0, -1 ++ wcdgb %v0, %v0, 0, 16 ++ wcdgb %v0, %v0, -1, 0 ++ wcdgb %v0, %v0, 16, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: wcdlgb %v0, %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: wcdlgb %v0, %v0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: wcdlgb %v0, %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: wcdlgb %v0, %v0, 16, 0 ++ ++ wcdlgb %v0, %v0, 0, -1 ++ wcdlgb %v0, %v0, 0, 16 ++ wcdlgb %v0, %v0, -1, 0 ++ wcdlgb %v0, %v0, 16, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: wcgdb %v0, %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: wcgdb %v0, %v0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: wcgdb %v0, %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: wcgdb %v0, %v0, 16, 0 ++ ++ wcgdb %v0, %v0, 0, -1 ++ wcgdb %v0, %v0, 0, 16 ++ wcgdb %v0, %v0, -1, 0 ++ wcgdb %v0, %v0, 16, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: wclgdb %v0, %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: wclgdb %v0, %v0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: wclgdb %v0, %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: wclgdb %v0, %v0, 16, 0 ++ ++ wclgdb %v0, %v0, 0, -1 ++ wclgdb %v0, %v0, 0, 16 ++ wclgdb %v0, %v0, -1, 0 ++ wclgdb %v0, %v0, 16, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: wfidb %v0, %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: wfidb %v0, %v0, 0, 16 ++#CHECK: error: invalid operand ++#CHECK: wfidb %v0, %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: wfidb %v0, %v0, 16, 0 ++ ++ wfidb %v0, %v0, 0, -1 ++ wfidb %v0, %v0, 0, 16 ++ wfidb %v0, %v0, -1, 0 ++ wfidb %v0, %v0, 16, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: wftcidb %v0, %v0, -1 ++#CHECK: error: invalid operand ++#CHECK: wftcidb %v0, %v0, 4096 ++ ++ wftcidb %v0, %v0, -1 ++ wftcidb %v0, %v0, 4096 ++ ++#CHECK: error: invalid operand ++#CHECK: wledb %v0, %v0, 0, -1 ++#CHECK: error: invalid operand ++#CHECK: wledb %v0, %v0, 0, 16 
++#CHECK: error: invalid operand ++#CHECK: wledb %v0, %v0, -1, 0 ++#CHECK: error: invalid operand ++#CHECK: wledb %v0, %v0, 16, 0 ++ ++ wledb %v0, %v0, 0, -1 ++ wledb %v0, %v0, 0, 16 ++ wledb %v0, %v0, -1, 0 ++ wledb %v0, %v0, 16, 0 +Index: llvm-36/test/MC/SystemZ/insn-bad-z196.s +=================================================================== +--- llvm-36.orig/test/MC/SystemZ/insn-bad-z196.s ++++ llvm-36/test/MC/SystemZ/insn-bad-z196.s +@@ -244,6 +244,11 @@ + cxlgbr %f0, 16, %r0, 0 + cxlgbr %f2, 0, %r0, 0 + ++#CHECK: error: {{(instruction requires: transactional-execution)?}} ++#CHECK: etnd %r7 ++ ++ etnd %r7 ++ + #CHECK: error: invalid operand + #CHECK: fidbra %f0, 0, %f0, -1 + #CHECK: error: invalid operand +@@ -546,6 +551,21 @@ + locr %r0,%r0,-1 + locr %r0,%r0,16 + ++#CHECK: error: {{(instruction requires: transactional-execution)?}} ++#CHECK: ntstg %r0, 524287(%r1,%r15) ++ ++ ntstg %r0, 524287(%r1,%r15) ++ ++#CHECK: error: {{(instruction requires: processor-assist)?}} ++#CHECK: ppa %r4, %r6, 7 ++ ++ ppa %r4, %r6, 7 ++ ++#CHECK: error: {{(instruction requires: miscellaneous-extensions)?}} ++#CHECK: risbgn %r1, %r2, 0, 0, 0 ++ ++ risbgn %r1, %r2, 0, 0, 0 ++ + #CHECK: error: invalid operand + #CHECK: risbhg %r0,%r0,0,0,-1 + #CHECK: error: invalid operand +@@ -685,3 +705,24 @@ + stocg %r0,-524289,1 + stocg %r0,524288,1 + stocg %r0,0(%r1,%r2),1 ++ ++#CHECK: error: {{(instruction requires: transactional-execution)?}} ++#CHECK: tabort 4095(%r1) ++ ++ tabort 4095(%r1) ++ ++#CHECK: error: {{(instruction requires: transactional-execution)?}} ++#CHECK: tbegin 4095(%r1), 42 ++ ++ tbegin 4095(%r1), 42 ++ ++#CHECK: error: {{(instruction requires: transactional-execution)?}} ++#CHECK: tbeginc 4095(%r1), 42 ++ ++ tbeginc 4095(%r1), 42 ++ ++#CHECK: error: {{(instruction requires: transactional-execution)?}} ++#CHECK: tend ++ ++ tend ++ +Index: llvm-36/test/MC/SystemZ/insn-bad-zEC12.s +=================================================================== +--- /dev/null ++++ llvm-36/test/MC/SystemZ/insn-bad-zEC12.s +@@ -0,0 +1,1578 @@ ++# For zEC12 only. 
++# RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=zEC12 < %s 2> %t ++# RUN: FileCheck < %t %s ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: lcbb %r0, 0, 0 ++ ++ lcbb %r0, 0, 0 ++ ++#CHECK: error: invalid operand ++#CHECK: ntstg %r0, -524289 ++#CHECK: error: invalid operand ++#CHECK: ntstg %r0, 524288 ++ ++ ntstg %r0, -524289 ++ ntstg %r0, 524288 ++ ++#CHECK: error: invalid operand ++#CHECK: ppa %r0, %r0, -1 ++#CHECK: error: invalid operand ++#CHECK: ppa %r0, %r0, 16 ++ ++ ppa %r0, %r0, -1 ++ ppa %r0, %r0, 16 ++ ++#CHECK: error: invalid operand ++#CHECK: risbgn %r0,%r0,0,0,-1 ++#CHECK: error: invalid operand ++#CHECK: risbgn %r0,%r0,0,0,64 ++#CHECK: error: invalid operand ++#CHECK: risbgn %r0,%r0,0,-1,0 ++#CHECK: error: invalid operand ++#CHECK: risbgn %r0,%r0,0,256,0 ++#CHECK: error: invalid operand ++#CHECK: risbgn %r0,%r0,-1,0,0 ++#CHECK: error: invalid operand ++#CHECK: risbgn %r0,%r0,256,0,0 ++ ++ risbgn %r0,%r0,0,0,-1 ++ risbgn %r0,%r0,0,0,64 ++ risbgn %r0,%r0,0,-1,0 ++ risbgn %r0,%r0,0,256,0 ++ risbgn %r0,%r0,-1,0,0 ++ risbgn %r0,%r0,256,0,0 ++ ++#CHECK: error: invalid operand ++#CHECK: tabort -1 ++#CHECK: error: invalid operand ++#CHECK: tabort 4096 ++#CHECK: error: invalid use of indexed addressing ++#CHECK: tabort 0(%r1,%r2) ++ ++ tabort -1 ++ tabort 4096 ++ tabort 0(%r1,%r2) ++ ++#CHECK: error: invalid operand ++#CHECK: tbegin -1, 0 ++#CHECK: error: invalid operand ++#CHECK: tbegin 4096, 0 ++#CHECK: error: invalid use of indexed addressing ++#CHECK: tbegin 0(%r1,%r2), 0 ++#CHECK: error: invalid operand ++#CHECK: tbegin 0, -1 ++#CHECK: error: invalid operand ++#CHECK: tbegin 0, 65536 ++ ++ tbegin -1, 0 ++ tbegin 4096, 0 ++ tbegin 0(%r1,%r2), 0 ++ tbegin 0, -1 ++ tbegin 0, 65536 ++ ++#CHECK: error: invalid operand ++#CHECK: tbeginc -1, 0 ++#CHECK: error: invalid operand ++#CHECK: tbeginc 4096, 0 ++#CHECK: error: invalid use of indexed addressing ++#CHECK: tbeginc 0(%r1,%r2), 0 ++#CHECK: error: invalid operand ++#CHECK: tbeginc 0, -1 ++#CHECK: error: invalid operand ++#CHECK: tbeginc 0, 65536 ++ ++ tbeginc -1, 0 ++ tbeginc 4096, 0 ++ tbeginc 0(%r1,%r2), 0 ++ tbeginc 0, -1 ++ tbeginc 0, 65536 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vab %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vaf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vag %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vah %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vaq %v0, %v0, %v0 ++ ++ vab %v0, %v0, %v0 ++ vaf %v0, %v0, %v0 ++ vag %v0, %v0, %v0 ++ vah %v0, %v0, %v0 ++ vaq %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vaccb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vaccf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vaccg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vacch %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vaccq %v0, %v0, %v0 ++ ++ vaccb %v0, %v0, %v0 ++ vaccf %v0, %v0, %v0 ++ vaccg %v0, %v0, %v0 ++ vacch %v0, %v0, %v0 ++ vaccq %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vacccq %v0, %v0, %v0 ++ ++ vacccq %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vacq %v0, %v0, %v0 ++ ++ vacq %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vavgb %v0, %v0, %v0 ++#CHECK: error: {{(instruction 
requires: vector)?}} ++#CHECK: vavgf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vavgg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vavgh %v0, %v0, %v0 ++ ++ vavgb %v0, %v0, %v0 ++ vavgf %v0, %v0, %v0 ++ vavgg %v0, %v0, %v0 ++ vavgh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vavglb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vavglf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vavglg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vavglh %v0, %v0, %v0 ++ ++ vavglb %v0, %v0, %v0 ++ vavglf %v0, %v0, %v0 ++ vavglg %v0, %v0, %v0 ++ vavglh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vcdgb %v0, %v0, 0, 0 ++ ++ vcdgb %v0, %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vcdlgb %v0, %v0, 0, 0 ++ ++ vcdlgb %v0, %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vceqb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vceqf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vceqg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vceqh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vceqbs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vceqhs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vceqfs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vceqgs %v0, %v0, %v0 ++ ++ vceqb %v0, %v0, %v0 ++ vceqf %v0, %v0, %v0 ++ vceqg %v0, %v0, %v0 ++ vceqh %v0, %v0, %v0 ++ vceqbs %v0, %v0, %v0 ++ vceqhs %v0, %v0, %v0 ++ vceqfs %v0, %v0, %v0 ++ vceqgs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vcgdb %v0, %v0, 0, 0 ++ ++ vcgdb %v0, %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchbs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchhs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchfs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchgs %v0, %v0, %v0 ++ ++ vchb %v0, %v0, %v0 ++ vchf %v0, %v0, %v0 ++ vchg %v0, %v0, %v0 ++ vchh %v0, %v0, %v0 ++ vchbs %v0, %v0, %v0 ++ vchhs %v0, %v0, %v0 ++ vchfs %v0, %v0, %v0 ++ vchgs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchlb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchlf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchlg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchlh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchlbs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchlhs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchlfs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vchlgs %v0, %v0, %v0 ++ ++ vchlb %v0, %v0, %v0 ++ vchlf %v0, %v0, %v0 ++ vchlg %v0, %v0, %v0 ++ vchlh 
%v0, %v0, %v0 ++ vchlbs %v0, %v0, %v0 ++ vchlhs %v0, %v0, %v0 ++ vchlfs %v0, %v0, %v0 ++ vchlgs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vcksm %v0, %v0, %v0 ++ ++ vcksm %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vclgdb %v0, %v0, 0, 0 ++ ++ vclgdb %v0, %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vclzb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vclzf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vclzg %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vclzh %v0, %v0 ++ ++ vclzb %v0, %v0 ++ vclzf %v0, %v0 ++ vclzg %v0, %v0 ++ vclzh %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vctzb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vctzf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vctzg %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vctzh %v0, %v0 ++ ++ vctzb %v0, %v0 ++ vctzf %v0, %v0 ++ vctzg %v0, %v0 ++ vctzh %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vecb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vecf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vecg %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vech %v0, %v0 ++ ++ vecb %v0, %v0 ++ vecf %v0, %v0 ++ vecg %v0, %v0 ++ vech %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verimb %v0, %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verimf %v0, %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verimg %v0, %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verimh %v0, %v0, %v0, 0 ++ ++ verimb %v0, %v0, %v0, 0 ++ verimf %v0, %v0, %v0, 0 ++ verimg %v0, %v0, %v0, 0 ++ verimh %v0, %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veclb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veclf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veclg %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veclh %v0, %v0 ++ ++ veclb %v0, %v0 ++ veclf %v0, %v0 ++ veclg %v0, %v0 ++ veclh %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verllvb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verllvf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verllvg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verllvh %v0, %v0, %v0 ++ ++ verllvb %v0, %v0, %v0 ++ verllvf %v0, %v0, %v0 ++ verllvg %v0, %v0, %v0 ++ verllvh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verllb %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verllf %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verllg %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: verllh %v0, %v0, 0 ++ ++ verllb %v0, %v0, 0 ++ verllf %v0, %v0, 0 ++ verllg %v0, %v0, 0 ++ verllh %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veslvb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veslvf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veslvg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veslvh %v0, %v0, 
%v0 ++ ++ veslvb %v0, %v0, %v0 ++ veslvf %v0, %v0, %v0 ++ veslvg %v0, %v0, %v0 ++ veslvh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veslb %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veslf %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veslg %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: veslh %v0, %v0, 0 ++ ++ veslb %v0, %v0, 0 ++ veslf %v0, %v0, 0 ++ veslg %v0, %v0, 0 ++ veslh %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesravb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesravf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesravg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesravh %v0, %v0, %v0 ++ ++ vesravb %v0, %v0, %v0 ++ vesravf %v0, %v0, %v0 ++ vesravg %v0, %v0, %v0 ++ vesravh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesrab %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesraf %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesrag %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesrah %v0, %v0, 0 ++ ++ vesrab %v0, %v0, 0 ++ vesraf %v0, %v0, 0 ++ vesrag %v0, %v0, 0 ++ vesrah %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesrlvb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesrlvf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesrlvg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesrlvh %v0, %v0, %v0 ++ ++ vesrlvb %v0, %v0, %v0 ++ vesrlvf %v0, %v0, %v0 ++ vesrlvg %v0, %v0, %v0 ++ vesrlvh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesrlb %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesrlf %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesrlg %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vesrlh %v0, %v0, 0 ++ ++ vesrlb %v0, %v0, 0 ++ vesrlf %v0, %v0, 0 ++ vesrlg %v0, %v0, 0 ++ vesrlh %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfadb %v0, %v0, %v0 ++ ++ vfadb %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfcedb %v0, %v0, %v0 ++#CHECK: vfcedbs %v0, %v0, %v0 ++ ++ vfcedb %v0, %v0, %v0 ++ vfcedbs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfchdb %v0, %v0, %v0 ++#CHECK: vfchdbs %v0, %v0, %v0 ++ ++ vfchdb %v0, %v0, %v0 ++ vfchdbs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfddb %v0, %v0, %v0 ++ ++ vfddb %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfaeb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfaezb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfaebs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfaezbs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfaeh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfaezh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfaehs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfaezhs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} 
++#CHECK: vfaef %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfaezf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfaefs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfaezfs %v0, %v0, %v0 ++ ++ vfaeb %v0, %v0, %v0 ++ vfaezb %v0, %v0, %v0 ++ vfaebs %v0, %v0, %v0 ++ vfaezbs %v0, %v0, %v0 ++ vfaeh %v0, %v0, %v0 ++ vfaezh %v0, %v0, %v0 ++ vfaehs %v0, %v0, %v0 ++ vfaezhs %v0, %v0, %v0 ++ vfaef %v0, %v0, %v0 ++ vfaezf %v0, %v0, %v0 ++ vfaefs %v0, %v0, %v0 ++ vfaezfs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeeb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeezb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeebs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeezbs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeeh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeezh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeehs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeezhs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeef %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeezf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeefs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeezfs %v0, %v0, %v0 ++ ++ vfeeb %v0, %v0, %v0 ++ vfeezb %v0, %v0, %v0 ++ vfeebs %v0, %v0, %v0 ++ vfeezbs %v0, %v0, %v0 ++ vfeeh %v0, %v0, %v0 ++ vfeezh %v0, %v0, %v0 ++ vfeehs %v0, %v0, %v0 ++ vfeezhs %v0, %v0, %v0 ++ vfeef %v0, %v0, %v0 ++ vfeezf %v0, %v0, %v0 ++ vfeefs %v0, %v0, %v0 ++ vfeezfs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeneb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfenezb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfenebs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfenezbs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfeneh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfenezh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfenehs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfenezhs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfenef %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfenezf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfenefs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfenezfs %v0, %v0, %v0 ++ ++ vfeneb %v0, %v0, %v0 ++ vfenezb %v0, %v0, %v0 ++ vfenebs %v0, %v0, %v0 ++ vfenezbs %v0, %v0, %v0 ++ vfeneh %v0, %v0, %v0 ++ vfenezh %v0, %v0, %v0 ++ vfenehs %v0, %v0, %v0 ++ vfenezhs %v0, %v0, %v0 ++ vfenef %v0, %v0, %v0 ++ vfenezf %v0, %v0, %v0 ++ vfenefs %v0, %v0, %v0 ++ vfenezfs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfidb %v0, %v0, 0, 0 ++ ++ vfidb %v0, %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vistrb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vistrbs %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vistrh %v0, 
%v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vistrhs %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vistrf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vistrfs %v0, %v0 ++ ++ vistrb %v0, %v0 ++ vistrbs %v0, %v0 ++ vistrh %v0, %v0 ++ vistrhs %v0, %v0 ++ vistrf %v0, %v0 ++ vistrfs %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vflcdb %v0, %v0 ++ ++ vflcdb %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vflndb %v0, %v0 ++ ++ vflndb %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vflpdb %v0, %v0 ++ ++ vflpdb %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfmadb %v0, %v0, %v0, %v0 ++ ++ vfmadb %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfmdb %v0, %v0, %v0 ++ ++ vfmdb %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfmsdb %v0, %v0, %v0, %v0 ++ ++ vfmsdb %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfsdb %v0, %v0, %v0 ++ ++ vfsdb %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vfsqdb %v0, %v0 ++ ++ vfsqdb %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vftcidb %v0, %v0, 0 ++ ++ vftcidb %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgbm %v0, 0 ++ ++ vgbm %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgef %v0, 0(%v0, %r1), 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgeg %v0, 0(%v0, %r1), 0 ++ ++ vgef %v0, 0(%v0, %r1), 0 ++ vgeg %v0, 0(%v0, %r1), 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgfmab %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgfmaf %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgfmag %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgfmah %v0, %v0, %v0, %v0 ++ ++ vgfmab %v0, %v0, %v0, %v0 ++ vgfmaf %v0, %v0, %v0, %v0 ++ vgfmag %v0, %v0, %v0, %v0 ++ vgfmah %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgfmb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgfmf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgfmg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgfmh %v0, %v0, %v0 ++ ++ vgfmb %v0, %v0, %v0 ++ vgfmf %v0, %v0, %v0 ++ vgfmg %v0, %v0, %v0 ++ vgfmh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgmb %v0, 0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgmf %v0, 0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgmg %v0, 0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vgmh %v0, 0, 0 ++ ++ vgmb %v0, 0, 0 ++ vgmf %v0, 0, 0 ++ vgmg %v0, 0, 0 ++ vgmh %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vl %v0, 0 ++ ++ vl %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlbb %v0, 0, 0 ++ ++ vlbb %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlcb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlcf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlcg %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlch %v0, %v0 ++ ++ vlcb %v0, %v0 ++ vlcf %v0, %v0 ++ vlcg %v0, %v0 
++ vlch %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vldeb %v0, %v0 ++ ++ vldeb %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vleb %v0, 0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlef %v0, 0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vleg %v0, 0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vleh %v0, 0, 0 ++ ++ vleb %v0, 0, 0 ++ vlef %v0, 0, 0 ++ vleg %v0, 0, 0 ++ vleh %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vledb %v0, %v0, 0, 0 ++ ++ vledb %v0, %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vleib %v0, 0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vleif %v0, 0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vleig %v0, 0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vleih %v0, 0, 0 ++ ++ vleib %v0, 0, 0 ++ vleif %v0, 0, 0 ++ vleig %v0, 0, 0 ++ vleih %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlgvb %r0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlgvf %r0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlgvg %r0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlgvh %r0, %v0, 0 ++ ++ vlgvb %r0, %v0, 0 ++ vlgvf %r0, %v0, 0 ++ vlgvg %r0, %v0, 0 ++ vlgvh %r0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vll %v0, %r0, 0 ++ ++ vll %v0, %r0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vllezb %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vllezf %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vllezg %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vllezh %v0, 0 ++ ++ vllezb %v0, 0 ++ vllezf %v0, 0 ++ vllezg %v0, 0 ++ vllezh %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlm %v0, %v0, 0 ++ ++ vlm %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlpb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlpf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlpg %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlph %v0, %v0 ++ ++ vlpb %v0, %v0 ++ vlpf %v0, %v0 ++ vlpg %v0, %v0 ++ vlph %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlr %v0, %v0 ++ ++ vlr %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlrepb %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlrepf %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlrepg %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlreph %v0, 0 ++ ++ vlrepb %v0, 0 ++ vlrepf %v0, 0 ++ vlrepg %v0, 0 ++ vlreph %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlvgb %v0, %r0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlvgf %v0, %r0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlvgg %v0, %r0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlvgh %v0, %r0, 0 ++ ++ vlvgb %v0, %r0, 0 ++ vlvgf %v0, %r0, 0 ++ vlvgg %v0, %r0, 0 ++ vlvgh %v0, %r0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vlvgp %v0, %r0, %r0 ++ ++ vlvgp %v0, %r0, %r0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmaeb %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: 
vector)?}} ++#CHECK: vmaef %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmaeh %v0, %v0, %v0, %v0 ++ ++ vmaeb %v0, %v0, %v0, %v0 ++ vmaef %v0, %v0, %v0, %v0 ++ vmaeh %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmahb %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmahf %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmahh %v0, %v0, %v0, %v0 ++ ++ vmahb %v0, %v0, %v0, %v0 ++ vmahf %v0, %v0, %v0, %v0 ++ vmahh %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmalb %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmalf %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmalhw %v0, %v0, %v0, %v0 ++ ++ vmalb %v0, %v0, %v0, %v0 ++ vmalf %v0, %v0, %v0, %v0 ++ vmalhw %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmaleb %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmalef %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmaleh %v0, %v0, %v0, %v0 ++ ++ vmaleb %v0, %v0, %v0, %v0 ++ vmalef %v0, %v0, %v0, %v0 ++ vmaleh %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmalhb %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmalhf %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmalhh %v0, %v0, %v0, %v0 ++ ++ vmalhb %v0, %v0, %v0, %v0 ++ vmalhf %v0, %v0, %v0, %v0 ++ vmalhh %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmalob %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmalof %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmaloh %v0, %v0, %v0, %v0 ++ ++ vmalob %v0, %v0, %v0, %v0 ++ vmalof %v0, %v0, %v0, %v0 ++ vmaloh %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmaob %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmaof %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmaoh %v0, %v0, %v0, %v0 ++ ++ vmaob %v0, %v0, %v0, %v0 ++ vmaof %v0, %v0, %v0, %v0 ++ vmaoh %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmeb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmef %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmeh %v0, %v0, %v0 ++ ++ vmeb %v0, %v0, %v0 ++ vmef %v0, %v0, %v0 ++ vmeh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmhb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmhf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmhh %v0, %v0, %v0 ++ ++ vmhb %v0, %v0, %v0 ++ vmhf %v0, %v0, %v0 ++ vmhh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmlb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmlf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmlh %v0, %v0, %v0 ++ ++ vmlb %v0, %v0, %v0 ++ vmlf %v0, %v0, %v0 ++ vmlh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmleb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmlef %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: 
vmleh %v0, %v0, %v0 ++ ++ vmleb %v0, %v0, %v0 ++ vmlef %v0, %v0, %v0 ++ vmleh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmlhb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmlhf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmlhh %v0, %v0, %v0 ++ ++ vmlhb %v0, %v0, %v0 ++ vmlhf %v0, %v0, %v0 ++ vmlhh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmlob %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmlof %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmloh %v0, %v0, %v0 ++ ++ vmlob %v0, %v0, %v0 ++ vmlof %v0, %v0, %v0 ++ vmloh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmnb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmnf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmng %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmnh %v0, %v0, %v0 ++ ++ vmnb %v0, %v0, %v0 ++ vmnf %v0, %v0, %v0 ++ vmng %v0, %v0, %v0 ++ vmnh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmnlb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmnlf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmnlg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmnlh %v0, %v0, %v0 ++ ++ vmnlb %v0, %v0, %v0 ++ vmnlf %v0, %v0, %v0 ++ vmnlg %v0, %v0, %v0 ++ vmnlh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmob %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmof %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmoh %v0, %v0, %v0 ++ ++ vmob %v0, %v0, %v0 ++ vmof %v0, %v0, %v0 ++ vmoh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmrhb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmrhf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmrhg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmrhh %v0, %v0, %v0 ++ ++ vmrhb %v0, %v0, %v0 ++ vmrhf %v0, %v0, %v0 ++ vmrhg %v0, %v0, %v0 ++ vmrhh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmrlb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmrlf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmrlg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmrlh %v0, %v0, %v0 ++ ++ vmrlb %v0, %v0, %v0 ++ vmrlf %v0, %v0, %v0 ++ vmrlg %v0, %v0, %v0 ++ vmrlh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmxb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmxf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmxg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmxh %v0, %v0, %v0 ++ ++ vmxb %v0, %v0, %v0 ++ vmxf %v0, %v0, %v0 ++ vmxg %v0, %v0, %v0 ++ vmxh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmxlb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmxlf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmxlg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vmxlh %v0, %v0, %v0 ++ ++ vmxlb %v0, %v0, %v0 
++ vmxlf %v0, %v0, %v0 ++ vmxlg %v0, %v0, %v0 ++ vmxlh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vn %v0, %v0, %v0 ++ ++ vn %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vnc %v0, %v0, %v0 ++ ++ vnc %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vno %v0, %v0, %v0 ++ ++ vno %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vo %v0, %v0, %v0 ++ ++ vo %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vone %v0 ++ ++ vone %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpdi %v0, %v0, %v0, 0 ++ ++ vpdi %v0, %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vperm %v0, %v0, %v0, %v0 ++ ++ vperm %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpkf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpkg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpkh %v0, %v0, %v0 ++ ++ vpkf %v0, %v0, %v0 ++ vpkg %v0, %v0, %v0 ++ vpkh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpksf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpksg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpksh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpksfs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpksgs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpkshs %v0, %v0, %v0 ++ ++ vpksf %v0, %v0, %v0 ++ vpksg %v0, %v0, %v0 ++ vpksh %v0, %v0, %v0 ++ vpksfs %v0, %v0, %v0 ++ vpksgs %v0, %v0, %v0 ++ vpkshs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpklsf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpklsg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpklsh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpklsfs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpklsgs %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpklshs %v0, %v0, %v0 ++ ++ vpklsf %v0, %v0, %v0 ++ vpklsg %v0, %v0, %v0 ++ vpklsh %v0, %v0, %v0 ++ vpklsfs %v0, %v0, %v0 ++ vpklsgs %v0, %v0, %v0 ++ vpklshs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vpopct %v0, %v0 ++ ++ vpopct %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vrepb %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vrepf %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vrepg %v0, %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vreph %v0, %v0, 0 ++ ++ vrepb %v0, %v0, 0 ++ vrepf %v0, %v0, 0 ++ vrepg %v0, %v0, 0 ++ vreph %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vrepib %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vrepif %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vrepig %v0, 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vrepih %v0, 0 ++ ++ vrepib %v0, 0 ++ vrepif %v0, 0 ++ vrepig %v0, 0 ++ vrepih %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsf %v0, %v0, %v0 ++#CHECK: error: {{(instruction 
requires: vector)?}} ++#CHECK: vsg %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsq %v0, %v0, %v0 ++ ++ vsb %v0, %v0, %v0 ++ vsf %v0, %v0, %v0 ++ vsg %v0, %v0, %v0 ++ vsh %v0, %v0, %v0 ++ vsq %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsbcbiq %v0, %v0, %v0 ++ ++ vsbcbiq %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsbiq %v0, %v0, %v0 ++ ++ vsbiq %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vscbib %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vscbif %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vscbig %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vscbih %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vscbiq %v0, %v0, %v0 ++ ++ vscbib %v0, %v0, %v0 ++ vscbif %v0, %v0, %v0 ++ vscbig %v0, %v0, %v0 ++ vscbih %v0, %v0, %v0 ++ vscbiq %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vscef %v0, 0(%v0, %r1), 0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsceg %v0, 0(%v0, %r1), 0 ++ ++ vscef %v0, 0(%v0, %r1), 0 ++ vsceg %v0, 0(%v0, %r1), 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsegb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsegf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsegh %v0, %v0 ++ ++ vsegb %v0, %v0 ++ vsegf %v0, %v0 ++ vsegh %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsel %v0, %v0, %v0, %v0 ++ ++ vsel %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsl %v0, %v0, %v0 ++ ++ vsl %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vslb %v0, %v0, %v0 ++ ++ vslb %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsldb %v0, %v0, %v0, 0 ++ ++ vsldb %v0, %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsra %v0, %v0, %v0 ++ ++ vsra %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsrab %v0, %v0, %v0 ++ ++ vsrab %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsrl %v0, %v0, %v0 ++ ++ vsrl %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsrlb %v0, %v0, %v0 ++ ++ vsrlb %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vst %v0, 0 ++ ++ vst %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstl %v0, %r0, 0 ++ ++ vstl %v0, %r0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstm %v0, %v0, 0 ++ ++ vstm %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrcb %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrczb %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrcbs %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrczbs %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrch %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrczh %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrchs %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrczhs %v0, %v0, %v0, 
%v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrcf %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrczf %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrcfs %v0, %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vstrczfs %v0, %v0, %v0, %v0 ++ ++ vstrcb %v0, %v0, %v0, %v0 ++ vstrczb %v0, %v0, %v0, %v0 ++ vstrcbs %v0, %v0, %v0, %v0 ++ vstrczbs %v0, %v0, %v0, %v0 ++ vstrch %v0, %v0, %v0, %v0 ++ vstrczh %v0, %v0, %v0, %v0 ++ vstrchs %v0, %v0, %v0, %v0 ++ vstrczhs %v0, %v0, %v0, %v0 ++ vstrcf %v0, %v0, %v0, %v0 ++ vstrczf %v0, %v0, %v0, %v0 ++ vstrcfs %v0, %v0, %v0, %v0 ++ vstrczfs %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsumgh %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsumgf %v0, %v0, %v0 ++ ++ vsumgh %v0, %v0, %v0 ++ vsumgf %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsumqf %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsumqg %v0, %v0, %v0 ++ ++ vsumqf %v0, %v0, %v0 ++ vsumqg %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsumb %v0, %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vsumh %v0, %v0, %v0 ++ ++ vsumb %v0, %v0, %v0 ++ vsumh %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vtm %v0, %v0 ++ ++ vtm %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vuphb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vuphf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vuphh %v0, %v0 ++ ++ vuphb %v0, %v0 ++ vuphf %v0, %v0 ++ vuphh %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vuplhb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vuplhf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vuplhh %v0, %v0 ++ ++ vuplhb %v0, %v0 ++ vuplhf %v0, %v0 ++ vuplhh %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vuplb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vuplf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vuplhw %v0, %v0 ++ ++ vuplb %v0, %v0 ++ vuplf %v0, %v0 ++ vuplhw %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vupllb %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vupllf %v0, %v0 ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vupllh %v0, %v0 ++ ++ vupllb %v0, %v0 ++ vupllf %v0, %v0 ++ vupllh %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vx %v0, %v0, %v0 ++ ++ vx %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: vzero %v0 ++ ++ vzero %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wcdgb %v0, %v0, 0, 0 ++ ++ wcdgb %v0, %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wcdlgb %v0, %v0, 0, 0 ++ ++ wcdlgb %v0, %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wcgdb %v0, %v0, 0, 0 ++ ++ wcgdb %v0, %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wclgdb %v0, %v0, 0, 0 ++ ++ wclgdb %v0, %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfadb %v0, %v0, %v0 ++ ++ wfadb %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfcdb %v0, %v0 ++ ++ wfcdb %v0, %v0 ++ 
++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfcedb %v0, %v0, %v0 ++#CHECK: wfcedbs %v0, %v0, %v0 ++ ++ wfcedb %v0, %v0, %v0 ++ wfcedbs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfchdb %v0, %v0, %v0 ++#CHECK: wfchdbs %v0, %v0, %v0 ++ ++ wfchdb %v0, %v0, %v0 ++ wfchdbs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfchedb %v0, %v0, %v0 ++#CHECK: wfchedbs %v0, %v0, %v0 ++ ++ wfchedb %v0, %v0, %v0 ++ wfchedbs %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfddb %v0, %v0, %v0 ++ ++ wfddb %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfidb %v0, %v0, 0, 0 ++ ++ wfidb %v0, %v0, 0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfkdb %v0, %v0 ++ ++ wfkdb %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wflcdb %v0, %v0 ++ ++ wflcdb %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wflndb %v0, %v0 ++ ++ wflndb %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wflpdb %v0, %v0 ++ ++ wflpdb %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfmadb %v0, %v0, %v0, %v0 ++ ++ wfmadb %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfmdb %v0, %v0, %v0 ++ ++ wfmdb %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfmsdb %v0, %v0, %v0, %v0 ++ ++ wfmsdb %v0, %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfsdb %v0, %v0, %v0 ++ ++ wfsdb %v0, %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wfsqdb %v0, %v0 ++ ++ wfsqdb %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wftcidb %v0, %v0, 0 ++ ++ wftcidb %v0, %v0, 0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wldeb %v0, %v0 ++ ++ wldeb %v0, %v0 ++ ++#CHECK: error: {{(instruction requires: vector)?}} ++#CHECK: wledb %v0, %v0, 0, 0 ++ ++ wledb %v0, %v0, 0, 0 +Index: llvm-36/test/MC/SystemZ/insn-bad.s +=================================================================== +--- llvm-36.orig/test/MC/SystemZ/insn-bad.s ++++ llvm-36/test/MC/SystemZ/insn-bad.s +@@ -2666,6 +2666,11 @@ + pfdrl 1, 1 + pfdrl 1, 0x100000000 + ++#CHECK: error: {{(instruction requires: population-count)?}} ++#CHECK: popcnt %r0, %r0 ++ ++ popcnt %r0, %r0 ++ + #CHECK: error: invalid operand + #CHECK: risbg %r0,%r0,0,0,-1 + #CHECK: error: invalid operand +Index: llvm-36/test/MC/SystemZ/insn-good-z13.s +=================================================================== +--- /dev/null ++++ llvm-36/test/MC/SystemZ/insn-good-z13.s +@@ -0,0 +1,5039 @@ ++# For z13 and above. 
++# RUN: llvm-mc -triple s390x-linux-gnu -mcpu=z13 -show-encoding %s \ ++# RUN: | FileCheck %s ++ ++#CHECK: lcbb %r0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x27] ++#CHECK: lcbb %r0, 0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x27] ++#CHECK: lcbb %r0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x27] ++#CHECK: lcbb %r0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x27] ++#CHECK: lcbb %r0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x27] ++#CHECK: lcbb %r15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x27] ++#CHECK: lcbb %r2, 1383(%r3,%r4), 8 # encoding: [0xe7,0x23,0x45,0x67,0x80,0x27] ++ ++ lcbb %r0, 0, 0 ++ lcbb %r0, 0, 15 ++ lcbb %r0, 4095, 0 ++ lcbb %r0, 0(%r15), 0 ++ lcbb %r0, 0(%r15,%r1), 0 ++ lcbb %r15, 0, 0 ++ lcbb %r2, 1383(%r3,%r4), 8 ++ ++#CHECK: vab %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xf3] ++#CHECK: vab %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xf3] ++#CHECK: vab %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xf3] ++#CHECK: vab %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xf3] ++#CHECK: vab %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xf3] ++ ++ vab %v0, %v0, %v0 ++ vab %v0, %v0, %v31 ++ vab %v0, %v31, %v0 ++ vab %v31, %v0, %v0 ++ vab %v18, %v3, %v20 ++ ++#CHECK: vaccb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xf1] ++#CHECK: vaccb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xf1] ++#CHECK: vaccb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xf1] ++#CHECK: vaccb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xf1] ++#CHECK: vaccb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xf1] ++ ++ vaccb %v0, %v0, %v0 ++ vaccb %v0, %v0, %v31 ++ vaccb %v0, %v31, %v0 ++ vaccb %v31, %v0, %v0 ++ vaccb %v18, %v3, %v20 ++ ++#CHECK: vacccq %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x00,0x00,0xb9] ++#CHECK: vacccq %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x00,0xf1,0xb9] ++#CHECK: vacccq %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x00,0x02,0xb9] ++#CHECK: vacccq %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x00,0x04,0xb9] ++#CHECK: vacccq %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x00,0x08,0xb9] ++#CHECK: vacccq %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x00,0x97,0xb9] ++ ++ vacccq %v0, %v0, %v0, %v0 ++ vacccq %v0, %v0, %v0, %v31 ++ vacccq %v0, %v0, %v31, %v0 ++ vacccq %v0, %v31, %v0, %v0 ++ vacccq %v31, %v0, %v0, %v0 ++ vacccq %v13, %v17, %v21, %v25 ++ ++#CHECK: vaccf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xf1] ++#CHECK: vaccf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xf1] ++#CHECK: vaccf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xf1] ++#CHECK: vaccf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xf1] ++#CHECK: vaccf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xf1] ++ ++ vaccf %v0, %v0, %v0 ++ vaccf %v0, %v0, %v31 ++ vaccf %v0, %v31, %v0 ++ vaccf %v31, %v0, %v0 ++ vaccf %v18, %v3, %v20 ++ ++#CHECK: vaccg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xf1] ++#CHECK: vaccg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xf1] ++#CHECK: vaccg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xf1] ++#CHECK: vaccg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xf1] ++#CHECK: vaccg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xf1] ++ ++ vaccg %v0, %v0, %v0 ++ vaccg %v0, %v0, %v31 ++ vaccg %v0, %v31, %v0 ++ vaccg %v31, %v0, %v0 ++ vaccg %v18, %v3, %v20 ++ ++#CHECK: vacch %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xf1] ++#CHECK: vacch %v0, %v0, %v31 # 
encoding: [0xe7,0x00,0xf0,0x00,0x12,0xf1] ++#CHECK: vacch %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xf1] ++#CHECK: vacch %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xf1] ++#CHECK: vacch %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xf1] ++ ++ vacch %v0, %v0, %v0 ++ vacch %v0, %v0, %v31 ++ vacch %v0, %v31, %v0 ++ vacch %v31, %v0, %v0 ++ vacch %v18, %v3, %v20 ++ ++#CHECK: vaccq %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x40,0xf1] ++#CHECK: vaccq %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x42,0xf1] ++#CHECK: vaccq %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x44,0xf1] ++#CHECK: vaccq %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x48,0xf1] ++#CHECK: vaccq %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x4a,0xf1] ++ ++ vaccq %v0, %v0, %v0 ++ vaccq %v0, %v0, %v31 ++ vaccq %v0, %v31, %v0 ++ vaccq %v31, %v0, %v0 ++ vaccq %v18, %v3, %v20 ++ ++#CHECK: vacq %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x00,0x00,0xbb] ++#CHECK: vacq %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x00,0xf1,0xbb] ++#CHECK: vacq %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x00,0x02,0xbb] ++#CHECK: vacq %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x00,0x04,0xbb] ++#CHECK: vacq %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x00,0x08,0xbb] ++#CHECK: vacq %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x00,0x97,0xbb] ++ ++ vacq %v0, %v0, %v0, %v0 ++ vacq %v0, %v0, %v0, %v31 ++ vacq %v0, %v0, %v31, %v0 ++ vacq %v0, %v31, %v0, %v0 ++ vacq %v31, %v0, %v0, %v0 ++ vacq %v13, %v17, %v21, %v25 ++ ++#CHECK: vaf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xf3] ++#CHECK: vaf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xf3] ++#CHECK: vaf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xf3] ++#CHECK: vaf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xf3] ++#CHECK: vaf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xf3] ++ ++ vaf %v0, %v0, %v0 ++ vaf %v0, %v0, %v31 ++ vaf %v0, %v31, %v0 ++ vaf %v31, %v0, %v0 ++ vaf %v18, %v3, %v20 ++ ++#CHECK: vag %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xf3] ++#CHECK: vag %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xf3] ++#CHECK: vag %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xf3] ++#CHECK: vag %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xf3] ++#CHECK: vag %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xf3] ++ ++ vag %v0, %v0, %v0 ++ vag %v0, %v0, %v31 ++ vag %v0, %v31, %v0 ++ vag %v31, %v0, %v0 ++ vag %v18, %v3, %v20 ++ ++#CHECK: vah %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xf3] ++#CHECK: vah %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xf3] ++#CHECK: vah %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xf3] ++#CHECK: vah %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xf3] ++#CHECK: vah %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xf3] ++ ++ vah %v0, %v0, %v0 ++ vah %v0, %v0, %v31 ++ vah %v0, %v31, %v0 ++ vah %v31, %v0, %v0 ++ vah %v18, %v3, %v20 ++ ++#CHECK: vaq %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x40,0xf3] ++#CHECK: vaq %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x42,0xf3] ++#CHECK: vaq %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x44,0xf3] ++#CHECK: vaq %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x48,0xf3] ++#CHECK: vaq %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x4a,0xf3] ++ ++ vaq %v0, %v0, %v0 ++ vaq %v0, %v0, %v31 ++ vaq %v0, %v31, %v0 ++ vaq %v31, %v0, %v0 ++ vaq %v18, %v3, %v20 ++ ++#CHECK: vavgb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xf2] ++#CHECK: 
vavgb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xf2] ++#CHECK: vavgb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xf2] ++#CHECK: vavgb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xf2] ++#CHECK: vavgb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xf2] ++ ++ vavgb %v0, %v0, %v0 ++ vavgb %v0, %v0, %v31 ++ vavgb %v0, %v31, %v0 ++ vavgb %v31, %v0, %v0 ++ vavgb %v18, %v3, %v20 ++ ++#CHECK: vavgf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xf2] ++#CHECK: vavgf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xf2] ++#CHECK: vavgf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xf2] ++#CHECK: vavgf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xf2] ++#CHECK: vavgf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xf2] ++ ++ vavgf %v0, %v0, %v0 ++ vavgf %v0, %v0, %v31 ++ vavgf %v0, %v31, %v0 ++ vavgf %v31, %v0, %v0 ++ vavgf %v18, %v3, %v20 ++ ++#CHECK: vavgg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xf2] ++#CHECK: vavgg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xf2] ++#CHECK: vavgg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xf2] ++#CHECK: vavgg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xf2] ++#CHECK: vavgg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xf2] ++ ++ vavgg %v0, %v0, %v0 ++ vavgg %v0, %v0, %v31 ++ vavgg %v0, %v31, %v0 ++ vavgg %v31, %v0, %v0 ++ vavgg %v18, %v3, %v20 ++ ++#CHECK: vavgh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xf2] ++#CHECK: vavgh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xf2] ++#CHECK: vavgh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xf2] ++#CHECK: vavgh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xf2] ++#CHECK: vavgh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xf2] ++ ++ vavgh %v0, %v0, %v0 ++ vavgh %v0, %v0, %v31 ++ vavgh %v0, %v31, %v0 ++ vavgh %v31, %v0, %v0 ++ vavgh %v18, %v3, %v20 ++ ++#CHECK: vavglb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xf0] ++#CHECK: vavglb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xf0] ++#CHECK: vavglb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xf0] ++#CHECK: vavglb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xf0] ++#CHECK: vavglb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xf0] ++ ++ vavglb %v0, %v0, %v0 ++ vavglb %v0, %v0, %v31 ++ vavglb %v0, %v31, %v0 ++ vavglb %v31, %v0, %v0 ++ vavglb %v18, %v3, %v20 ++ ++#CHECK: vavglf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xf0] ++#CHECK: vavglf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xf0] ++#CHECK: vavglf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xf0] ++#CHECK: vavglf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xf0] ++#CHECK: vavglf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xf0] ++ ++ vavglf %v0, %v0, %v0 ++ vavglf %v0, %v0, %v31 ++ vavglf %v0, %v31, %v0 ++ vavglf %v31, %v0, %v0 ++ vavglf %v18, %v3, %v20 ++ ++#CHECK: vavglg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xf0] ++#CHECK: vavglg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xf0] ++#CHECK: vavglg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xf0] ++#CHECK: vavglg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xf0] ++#CHECK: vavglg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xf0] ++ ++ vavglg %v0, %v0, %v0 ++ vavglg %v0, %v0, %v31 ++ vavglg %v0, %v31, %v0 ++ vavglg %v31, %v0, %v0 ++ vavglg %v18, %v3, %v20 ++ ++#CHECK: vavglh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xf0] ++#CHECK: vavglh %v0, %v0, 
%v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xf0] ++#CHECK: vavglh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xf0] ++#CHECK: vavglh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xf0] ++#CHECK: vavglh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xf0] ++ ++ vavglh %v0, %v0, %v0 ++ vavglh %v0, %v0, %v31 ++ vavglh %v0, %v31, %v0 ++ vavglh %v31, %v0, %v0 ++ vavglh %v18, %v3, %v20 ++ ++#CHECK: vcdgb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xc3] ++#CHECK: vcdgb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x30,0xc3] ++#CHECK: vcdgb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xc3] ++#CHECK: vcdgb %v0, %v0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc3] ++#CHECK: vcdgb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xc3] ++#CHECK: vcdgb %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xc3] ++#CHECK: vcdgb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x34,0xc3] ++ ++ vcdgb %v0, %v0, 0, 0 ++ vcdgb %v0, %v0, 0, 15 ++ vcdgb %v0, %v0, 4, 0 ++ vcdgb %v0, %v0, 12, 0 ++ vcdgb %v0, %v31, 0, 0 ++ vcdgb %v31, %v0, 0, 0 ++ vcdgb %v14, %v17, 4, 10 ++ ++#CHECK: vcdlgb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xc1] ++#CHECK: vcdlgb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x30,0xc1] ++#CHECK: vcdlgb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xc1] ++#CHECK: vcdlgb %v0, %v0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc1] ++#CHECK: vcdlgb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xc1] ++#CHECK: vcdlgb %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xc1] ++#CHECK: vcdlgb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x34,0xc1] ++ ++ vcdlgb %v0, %v0, 0, 0 ++ vcdlgb %v0, %v0, 0, 15 ++ vcdlgb %v0, %v0, 4, 0 ++ vcdlgb %v0, %v0, 12, 0 ++ vcdlgb %v0, %v31, 0, 0 ++ vcdlgb %v31, %v0, 0, 0 ++ vcdlgb %v14, %v17, 4, 10 ++ ++#CHECK: vcksm %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x66] ++#CHECK: vcksm %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x66] ++#CHECK: vcksm %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x66] ++#CHECK: vcksm %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x66] ++#CHECK: vcksm %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x66] ++ ++ vcksm %v0, %v0, %v0 ++ vcksm %v0, %v0, %v31 ++ vcksm %v0, %v31, %v0 ++ vcksm %v31, %v0, %v0 ++ vcksm %v18, %v3, %v20 ++ ++#CHECK: vceqb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xf8] ++#CHECK: vceqb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xf8] ++#CHECK: vceqb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xf8] ++#CHECK: vceqb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xf8] ++#CHECK: vceqb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xf8] ++#CHECK: vceqbs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x04,0xf8] ++ ++ vceqb %v0, %v0, %v0 ++ vceqb %v0, %v0, %v31 ++ vceqb %v0, %v31, %v0 ++ vceqb %v31, %v0, %v0 ++ vceqb %v18, %v3, %v20 ++ vceqbs %v5, %v22, %v7 ++ ++#CHECK: vceqf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xf8] ++#CHECK: vceqf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xf8] ++#CHECK: vceqf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xf8] ++#CHECK: vceqf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xf8] ++#CHECK: vceqf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xf8] ++#CHECK: vceqfs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x24,0xf8] ++ ++ vceqf %v0, %v0, %v0 ++ vceqf %v0, %v0, %v31 ++ vceqf %v0, %v31, %v0 ++ vceqf %v31, %v0, %v0 ++ vceqf %v18, %v3, %v20 ++ vceqfs %v5, %v22, %v7 ++ 
++#CHECK: vceqg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xf8] ++#CHECK: vceqg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xf8] ++#CHECK: vceqg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xf8] ++#CHECK: vceqg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xf8] ++#CHECK: vceqg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xf8] ++#CHECK: vceqgs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x34,0xf8] ++ ++ vceqg %v0, %v0, %v0 ++ vceqg %v0, %v0, %v31 ++ vceqg %v0, %v31, %v0 ++ vceqg %v31, %v0, %v0 ++ vceqg %v18, %v3, %v20 ++ vceqgs %v5, %v22, %v7 ++ ++#CHECK: vceqh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xf8] ++#CHECK: vceqh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xf8] ++#CHECK: vceqh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xf8] ++#CHECK: vceqh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xf8] ++#CHECK: vceqh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xf8] ++#CHECK: vceqhs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x14,0xf8] ++ ++ vceqh %v0, %v0, %v0 ++ vceqh %v0, %v0, %v31 ++ vceqh %v0, %v31, %v0 ++ vceqh %v31, %v0, %v0 ++ vceqh %v18, %v3, %v20 ++ vceqhs %v5, %v22, %v7 ++ ++#CHECK: vcgdb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xc2] ++#CHECK: vcgdb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x30,0xc2] ++#CHECK: vcgdb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xc2] ++#CHECK: vcgdb %v0, %v0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc2] ++#CHECK: vcgdb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xc2] ++#CHECK: vcgdb %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xc2] ++#CHECK: vcgdb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x34,0xc2] ++ ++ vcgdb %v0, %v0, 0, 0 ++ vcgdb %v0, %v0, 0, 15 ++ vcgdb %v0, %v0, 4, 0 ++ vcgdb %v0, %v0, 12, 0 ++ vcgdb %v0, %v31, 0, 0 ++ vcgdb %v31, %v0, 0, 0 ++ vcgdb %v14, %v17, 4, 10 ++ ++#CHECK: vchb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xfb] ++#CHECK: vchb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xfb] ++#CHECK: vchb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xfb] ++#CHECK: vchb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xfb] ++#CHECK: vchb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xfb] ++#CHECK: vchbs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x04,0xfb] ++ ++ vchb %v0, %v0, %v0 ++ vchb %v0, %v0, %v31 ++ vchb %v0, %v31, %v0 ++ vchb %v31, %v0, %v0 ++ vchb %v18, %v3, %v20 ++ vchbs %v5, %v22, %v7 ++ ++#CHECK: vchf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xfb] ++#CHECK: vchf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xfb] ++#CHECK: vchf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xfb] ++#CHECK: vchf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xfb] ++#CHECK: vchf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xfb] ++#CHECK: vchfs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x24,0xfb] ++ ++ vchf %v0, %v0, %v0 ++ vchf %v0, %v0, %v31 ++ vchf %v0, %v31, %v0 ++ vchf %v31, %v0, %v0 ++ vchf %v18, %v3, %v20 ++ vchfs %v5, %v22, %v7 ++ ++#CHECK: vchg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xfb] ++#CHECK: vchg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xfb] ++#CHECK: vchg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xfb] ++#CHECK: vchg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xfb] ++#CHECK: vchg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xfb] ++#CHECK: vchgs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x34,0xfb] ++ ++ vchg 
%v0, %v0, %v0 ++ vchg %v0, %v0, %v31 ++ vchg %v0, %v31, %v0 ++ vchg %v31, %v0, %v0 ++ vchg %v18, %v3, %v20 ++ vchgs %v5, %v22, %v7 ++ ++#CHECK: vchh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xfb] ++#CHECK: vchh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xfb] ++#CHECK: vchh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xfb] ++#CHECK: vchh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xfb] ++#CHECK: vchh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xfb] ++#CHECK: vchhs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x14,0xfb] ++ ++ vchh %v0, %v0, %v0 ++ vchh %v0, %v0, %v31 ++ vchh %v0, %v31, %v0 ++ vchh %v31, %v0, %v0 ++ vchh %v18, %v3, %v20 ++ vchhs %v5, %v22, %v7 ++ ++#CHECK: vchlb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xf9] ++#CHECK: vchlb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xf9] ++#CHECK: vchlb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xf9] ++#CHECK: vchlb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xf9] ++#CHECK: vchlb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xf9] ++#CHECK: vchlbs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x04,0xf9] ++ ++ vchlb %v0, %v0, %v0 ++ vchlb %v0, %v0, %v31 ++ vchlb %v0, %v31, %v0 ++ vchlb %v31, %v0, %v0 ++ vchlb %v18, %v3, %v20 ++ vchlbs %v5, %v22, %v7 ++ ++#CHECK: vchlf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xf9] ++#CHECK: vchlf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xf9] ++#CHECK: vchlf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xf9] ++#CHECK: vchlf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xf9] ++#CHECK: vchlf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xf9] ++#CHECK: vchlfs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x24,0xf9] ++ ++ vchlf %v0, %v0, %v0 ++ vchlf %v0, %v0, %v31 ++ vchlf %v0, %v31, %v0 ++ vchlf %v31, %v0, %v0 ++ vchlf %v18, %v3, %v20 ++ vchlfs %v5, %v22, %v7 ++ ++#CHECK: vchlg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xf9] ++#CHECK: vchlg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xf9] ++#CHECK: vchlg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xf9] ++#CHECK: vchlg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xf9] ++#CHECK: vchlg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xf9] ++#CHECK: vchlgs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x34,0xf9] ++ ++ vchlg %v0, %v0, %v0 ++ vchlg %v0, %v0, %v31 ++ vchlg %v0, %v31, %v0 ++ vchlg %v31, %v0, %v0 ++ vchlg %v18, %v3, %v20 ++ vchlgs %v5, %v22, %v7 ++ ++#CHECK: vchlh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xf9] ++#CHECK: vchlh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xf9] ++#CHECK: vchlh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xf9] ++#CHECK: vchlh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xf9] ++#CHECK: vchlh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xf9] ++#CHECK: vchlhs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x14,0xf9] ++ ++ vchlh %v0, %v0, %v0 ++ vchlh %v0, %v0, %v31 ++ vchlh %v0, %v31, %v0 ++ vchlh %v31, %v0, %v0 ++ vchlh %v18, %v3, %v20 ++ vchlhs %v5, %v22, %v7 ++ ++#CHECK: vclgdb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xc0] ++#CHECK: vclgdb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x30,0xc0] ++#CHECK: vclgdb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xc0] ++#CHECK: vclgdb %v0, %v0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc0] ++#CHECK: vclgdb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xc0] ++#CHECK: vclgdb %v31, %v0, 0, 0 # 
encoding: [0xe7,0xf0,0x00,0x00,0x38,0xc0] ++#CHECK: vclgdb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x34,0xc0] ++ ++ vclgdb %v0, %v0, 0, 0 ++ vclgdb %v0, %v0, 0, 15 ++ vclgdb %v0, %v0, 4, 0 ++ vclgdb %v0, %v0, 12, 0 ++ vclgdb %v0, %v31, 0, 0 ++ vclgdb %v31, %v0, 0, 0 ++ vclgdb %v14, %v17, 4, 10 ++ ++#CHECK: vclzb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x53] ++#CHECK: vclzb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x53] ++#CHECK: vclzb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x53] ++#CHECK: vclzb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x53] ++#CHECK: vclzb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x53] ++#CHECK: vclzb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0x53] ++ ++ vclzb %v0, %v0 ++ vclzb %v0, %v15 ++ vclzb %v0, %v31 ++ vclzb %v15, %v0 ++ vclzb %v31, %v0 ++ vclzb %v14, %v17 ++ ++#CHECK: vclzf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x53] ++#CHECK: vclzf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x53] ++#CHECK: vclzf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x53] ++#CHECK: vclzf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x53] ++#CHECK: vclzf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x53] ++#CHECK: vclzf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0x53] ++ ++ vclzf %v0, %v0 ++ vclzf %v0, %v15 ++ vclzf %v0, %v31 ++ vclzf %v15, %v0 ++ vclzf %v31, %v0 ++ vclzf %v14, %v17 ++ ++#CHECK: vclzg %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x53] ++#CHECK: vclzg %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x53] ++#CHECK: vclzg %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x53] ++#CHECK: vclzg %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x53] ++#CHECK: vclzg %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x53] ++#CHECK: vclzg %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0x53] ++ ++ vclzg %v0, %v0 ++ vclzg %v0, %v15 ++ vclzg %v0, %v31 ++ vclzg %v15, %v0 ++ vclzg %v31, %v0 ++ vclzg %v14, %v17 ++ ++#CHECK: vclzh %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x53] ++#CHECK: vclzh %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x53] ++#CHECK: vclzh %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x53] ++#CHECK: vclzh %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x53] ++#CHECK: vclzh %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x53] ++#CHECK: vclzh %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0x53] ++ ++ vclzh %v0, %v0 ++ vclzh %v0, %v15 ++ vclzh %v0, %v31 ++ vclzh %v15, %v0 ++ vclzh %v31, %v0 ++ vclzh %v14, %v17 ++ ++#CHECK: vctzb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x52] ++#CHECK: vctzb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x52] ++#CHECK: vctzb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x52] ++#CHECK: vctzb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x52] ++#CHECK: vctzb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x52] ++#CHECK: vctzb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0x52] ++ ++ vctzb %v0, %v0 ++ vctzb %v0, %v15 ++ vctzb %v0, %v31 ++ vctzb %v15, %v0 ++ vctzb %v31, %v0 ++ vctzb %v14, %v17 ++ ++#CHECK: vctzf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x52] ++#CHECK: vctzf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x52] ++#CHECK: vctzf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x52] ++#CHECK: vctzf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x52] ++#CHECK: vctzf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x52] ++#CHECK: vctzf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0x52] ++ ++ vctzf %v0, %v0 ++ vctzf %v0, %v15 ++ vctzf %v0, %v31 ++ vctzf %v15, %v0 ++ vctzf %v31, %v0 ++ vctzf %v14, %v17 ++ 
++#CHECK: vctzg %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x52] ++#CHECK: vctzg %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x52] ++#CHECK: vctzg %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x52] ++#CHECK: vctzg %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x52] ++#CHECK: vctzg %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x52] ++#CHECK: vctzg %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0x52] ++ ++ vctzg %v0, %v0 ++ vctzg %v0, %v15 ++ vctzg %v0, %v31 ++ vctzg %v15, %v0 ++ vctzg %v31, %v0 ++ vctzg %v14, %v17 ++ ++#CHECK: vctzh %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x52] ++#CHECK: vctzh %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x52] ++#CHECK: vctzh %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x52] ++#CHECK: vctzh %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x52] ++#CHECK: vctzh %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x52] ++#CHECK: vctzh %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0x52] ++ ++ vctzh %v0, %v0 ++ vctzh %v0, %v15 ++ vctzh %v0, %v31 ++ vctzh %v15, %v0 ++ vctzh %v31, %v0 ++ vctzh %v14, %v17 ++ ++#CHECK: vecb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xdb] ++#CHECK: vecb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xdb] ++#CHECK: vecb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xdb] ++#CHECK: vecb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xdb] ++#CHECK: vecb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xdb] ++#CHECK: vecb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0xdb] ++ ++ vecb %v0, %v0 ++ vecb %v0, %v15 ++ vecb %v0, %v31 ++ vecb %v15, %v0 ++ vecb %v31, %v0 ++ vecb %v14, %v17 ++ ++#CHECK: vecf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xdb] ++#CHECK: vecf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xdb] ++#CHECK: vecf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xdb] ++#CHECK: vecf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xdb] ++#CHECK: vecf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xdb] ++#CHECK: vecf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xdb] ++ ++ vecf %v0, %v0 ++ vecf %v0, %v15 ++ vecf %v0, %v31 ++ vecf %v15, %v0 ++ vecf %v31, %v0 ++ vecf %v14, %v17 ++ ++#CHECK: vecg %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xdb] ++#CHECK: vecg %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0xdb] ++#CHECK: vecg %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xdb] ++#CHECK: vecg %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0xdb] ++#CHECK: vecg %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xdb] ++#CHECK: vecg %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0xdb] ++ ++ vecg %v0, %v0 ++ vecg %v0, %v15 ++ vecg %v0, %v31 ++ vecg %v15, %v0 ++ vecg %v31, %v0 ++ vecg %v14, %v17 ++ ++#CHECK: vech %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xdb] ++#CHECK: vech %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0xdb] ++#CHECK: vech %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xdb] ++#CHECK: vech %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0xdb] ++#CHECK: vech %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xdb] ++#CHECK: vech %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0xdb] ++ ++ vech %v0, %v0 ++ vech %v0, %v15 ++ vech %v0, %v31 ++ vech %v15, %v0 ++ vech %v31, %v0 ++ vech %v14, %v17 ++ ++#CHECK: veclb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd9] ++#CHECK: veclb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd9] ++#CHECK: veclb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xd9] ++#CHECK: veclb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xd9] ++#CHECK: veclb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xd9] ++#CHECK: 
veclb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0xd9] ++ ++ veclb %v0, %v0 ++ veclb %v0, %v15 ++ veclb %v0, %v31 ++ veclb %v15, %v0 ++ veclb %v31, %v0 ++ veclb %v14, %v17 ++ ++#CHECK: veclf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xd9] ++#CHECK: veclf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xd9] ++#CHECK: veclf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xd9] ++#CHECK: veclf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xd9] ++#CHECK: veclf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xd9] ++#CHECK: veclf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xd9] ++ ++ veclf %v0, %v0 ++ veclf %v0, %v15 ++ veclf %v0, %v31 ++ veclf %v15, %v0 ++ veclf %v31, %v0 ++ veclf %v14, %v17 ++ ++#CHECK: veclg %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xd9] ++#CHECK: veclg %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0xd9] ++#CHECK: veclg %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xd9] ++#CHECK: veclg %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0xd9] ++#CHECK: veclg %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xd9] ++#CHECK: veclg %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0xd9] ++ ++ veclg %v0, %v0 ++ veclg %v0, %v15 ++ veclg %v0, %v31 ++ veclg %v15, %v0 ++ veclg %v31, %v0 ++ veclg %v14, %v17 ++ ++#CHECK: veclh %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xd9] ++#CHECK: veclh %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0xd9] ++#CHECK: veclh %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xd9] ++#CHECK: veclh %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0xd9] ++#CHECK: veclh %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xd9] ++#CHECK: veclh %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0xd9] ++ ++ veclh %v0, %v0 ++ veclh %v0, %v15 ++ veclh %v0, %v31 ++ veclh %v15, %v0 ++ veclh %v31, %v0 ++ veclh %v14, %v17 ++ ++#CHECK: verimb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x72] ++#CHECK: verimb %v0, %v0, %v0, 255 # encoding: [0xe7,0x00,0x00,0xff,0x00,0x72] ++#CHECK: verimb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x72] ++#CHECK: verimb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x72] ++#CHECK: verimb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x72] ++#CHECK: verimb %v13, %v17, %v21, 121 # encoding: [0xe7,0xd1,0x50,0x79,0x06,0x72] ++ ++ verimb %v0, %v0, %v0, 0 ++ verimb %v0, %v0, %v0, 255 ++ verimb %v0, %v0, %v31, 0 ++ verimb %v0, %v31, %v0, 0 ++ verimb %v31, %v0, %v0, 0 ++ verimb %v13, %v17, %v21, 0x79 ++ ++#CHECK: verimf %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x72] ++#CHECK: verimf %v0, %v0, %v0, 255 # encoding: [0xe7,0x00,0x00,0xff,0x20,0x72] ++#CHECK: verimf %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x72] ++#CHECK: verimf %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x72] ++#CHECK: verimf %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x72] ++#CHECK: verimf %v13, %v17, %v21, 121 # encoding: [0xe7,0xd1,0x50,0x79,0x26,0x72] ++ ++ verimf %v0, %v0, %v0, 0 ++ verimf %v0, %v0, %v0, 255 ++ verimf %v0, %v0, %v31, 0 ++ verimf %v0, %v31, %v0, 0 ++ verimf %v31, %v0, %v0, 0 ++ verimf %v13, %v17, %v21, 0x79 ++ ++#CHECK: verimg %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x72] ++#CHECK: verimg %v0, %v0, %v0, 255 # encoding: [0xe7,0x00,0x00,0xff,0x30,0x72] ++#CHECK: verimg %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x72] ++#CHECK: verimg %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x72] ++#CHECK: verimg %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x72] ++#CHECK: verimg %v13, %v17, %v21, 121 # 
encoding: [0xe7,0xd1,0x50,0x79,0x36,0x72] ++ ++ verimg %v0, %v0, %v0, 0 ++ verimg %v0, %v0, %v0, 255 ++ verimg %v0, %v0, %v31, 0 ++ verimg %v0, %v31, %v0, 0 ++ verimg %v31, %v0, %v0, 0 ++ verimg %v13, %v17, %v21, 0x79 ++ ++#CHECK: verimh %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x72] ++#CHECK: verimh %v0, %v0, %v0, 255 # encoding: [0xe7,0x00,0x00,0xff,0x10,0x72] ++#CHECK: verimh %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x72] ++#CHECK: verimh %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x72] ++#CHECK: verimh %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x72] ++#CHECK: verimh %v13, %v17, %v21, 121 # encoding: [0xe7,0xd1,0x50,0x79,0x16,0x72] ++ ++ verimh %v0, %v0, %v0, 0 ++ verimh %v0, %v0, %v0, 255 ++ verimh %v0, %v0, %v31, 0 ++ verimh %v0, %v31, %v0, 0 ++ verimh %v31, %v0, %v0, 0 ++ verimh %v13, %v17, %v21, 0x79 ++ ++#CHECK: verllvb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x73] ++#CHECK: verllvb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x73] ++#CHECK: verllvb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x73] ++#CHECK: verllvb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x73] ++#CHECK: verllvb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x73] ++ ++ verllvb %v0, %v0, %v0 ++ verllvb %v0, %v0, %v31 ++ verllvb %v0, %v31, %v0 ++ verllvb %v31, %v0, %v0 ++ verllvb %v18, %v3, %v20 ++ ++#CHECK: verllvf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x73] ++#CHECK: verllvf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x73] ++#CHECK: verllvf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x73] ++#CHECK: verllvf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x73] ++#CHECK: verllvf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x73] ++ ++ verllvf %v0, %v0, %v0 ++ verllvf %v0, %v0, %v31 ++ verllvf %v0, %v31, %v0 ++ verllvf %v31, %v0, %v0 ++ verllvf %v18, %v3, %v20 ++ ++#CHECK: verllvg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x73] ++#CHECK: verllvg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x73] ++#CHECK: verllvg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x73] ++#CHECK: verllvg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x73] ++#CHECK: verllvg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x73] ++ ++ verllvg %v0, %v0, %v0 ++ verllvg %v0, %v0, %v31 ++ verllvg %v0, %v31, %v0 ++ verllvg %v31, %v0, %v0 ++ verllvg %v18, %v3, %v20 ++ ++#CHECK: verllvh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x73] ++#CHECK: verllvh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x73] ++#CHECK: verllvh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x73] ++#CHECK: verllvh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x73] ++#CHECK: verllvh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x73] ++ ++ verllvh %v0, %v0, %v0 ++ verllvh %v0, %v0, %v31 ++ verllvh %v0, %v31, %v0 ++ verllvh %v31, %v0, %v0 ++ verllvh %v18, %v3, %v20 ++ ++#CHECK: verllb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x33] ++#CHECK: verllb %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x33] ++#CHECK: verllb %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x33] ++#CHECK: verllb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x33] ++#CHECK: verllb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x33] ++#CHECK: verllb %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x04,0x33] ++ ++ verllb %v0, %v0, 0 ++ verllb %v0, %v0, 4095 ++ verllb %v0, %v0, 0(%r15) ++ verllb %v0, %v31, 0 ++ verllb %v31, %v0, 0 ++ verllb 
%v14, %v17, 1074(%r5) ++ ++#CHECK: verllf %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x33] ++#CHECK: verllf %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x20,0x33] ++#CHECK: verllf %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x33] ++#CHECK: verllf %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x33] ++#CHECK: verllf %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x33] ++#CHECK: verllf %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x24,0x33] ++ ++ verllf %v0, %v0, 0 ++ verllf %v0, %v0, 4095 ++ verllf %v0, %v0, 0(%r15) ++ verllf %v0, %v31, 0 ++ verllf %v31, %v0, 0 ++ verllf %v14, %v17, 1074(%r5) ++ ++#CHECK: verllg %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x33] ++#CHECK: verllg %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x30,0x33] ++#CHECK: verllg %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x33] ++#CHECK: verllg %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x33] ++#CHECK: verllg %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x33] ++#CHECK: verllg %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x34,0x33] ++ ++ verllg %v0, %v0, 0 ++ verllg %v0, %v0, 4095 ++ verllg %v0, %v0, 0(%r15) ++ verllg %v0, %v31, 0 ++ verllg %v31, %v0, 0 ++ verllg %v14, %v17, 1074(%r5) ++ ++#CHECK: verllh %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x33] ++#CHECK: verllh %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x10,0x33] ++#CHECK: verllh %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x33] ++#CHECK: verllh %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x33] ++#CHECK: verllh %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x33] ++#CHECK: verllh %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x14,0x33] ++ ++ verllh %v0, %v0, 0 ++ verllh %v0, %v0, 4095 ++ verllh %v0, %v0, 0(%r15) ++ verllh %v0, %v31, 0 ++ verllh %v31, %v0, 0 ++ verllh %v14, %v17, 1074(%r5) ++ ++#CHECK: veslvb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x70] ++#CHECK: veslvb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x70] ++#CHECK: veslvb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x70] ++#CHECK: veslvb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x70] ++#CHECK: veslvb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x70] ++ ++ veslvb %v0, %v0, %v0 ++ veslvb %v0, %v0, %v31 ++ veslvb %v0, %v31, %v0 ++ veslvb %v31, %v0, %v0 ++ veslvb %v18, %v3, %v20 ++ ++#CHECK: veslvf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x70] ++#CHECK: veslvf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x70] ++#CHECK: veslvf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x70] ++#CHECK: veslvf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x70] ++#CHECK: veslvf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x70] ++ ++ veslvf %v0, %v0, %v0 ++ veslvf %v0, %v0, %v31 ++ veslvf %v0, %v31, %v0 ++ veslvf %v31, %v0, %v0 ++ veslvf %v18, %v3, %v20 ++ ++#CHECK: veslvg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x70] ++#CHECK: veslvg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x70] ++#CHECK: veslvg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x70] ++#CHECK: veslvg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x70] ++#CHECK: veslvg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x70] ++ ++ veslvg %v0, %v0, %v0 ++ veslvg %v0, %v0, %v31 ++ veslvg %v0, %v31, %v0 ++ veslvg %v31, %v0, %v0 ++ veslvg %v18, %v3, %v20 ++ ++#CHECK: veslvh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x70] ++#CHECK: veslvh %v0, %v0, %v31 # encoding: 
[0xe7,0x00,0xf0,0x00,0x12,0x70] ++#CHECK: veslvh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x70] ++#CHECK: veslvh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x70] ++#CHECK: veslvh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x70] ++ ++ veslvh %v0, %v0, %v0 ++ veslvh %v0, %v0, %v31 ++ veslvh %v0, %v31, %v0 ++ veslvh %v31, %v0, %v0 ++ veslvh %v18, %v3, %v20 ++ ++#CHECK: veslb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x30] ++#CHECK: veslb %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x30] ++#CHECK: veslb %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x30] ++#CHECK: veslb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x30] ++#CHECK: veslb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x30] ++#CHECK: veslb %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x04,0x30] ++ ++ veslb %v0, %v0, 0 ++ veslb %v0, %v0, 4095 ++ veslb %v0, %v0, 0(%r15) ++ veslb %v0, %v31, 0 ++ veslb %v31, %v0, 0 ++ veslb %v14, %v17, 1074(%r5) ++ ++#CHECK: veslf %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x30] ++#CHECK: veslf %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x20,0x30] ++#CHECK: veslf %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x30] ++#CHECK: veslf %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x30] ++#CHECK: veslf %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x30] ++#CHECK: veslf %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x24,0x30] ++ ++ veslf %v0, %v0, 0 ++ veslf %v0, %v0, 4095 ++ veslf %v0, %v0, 0(%r15) ++ veslf %v0, %v31, 0 ++ veslf %v31, %v0, 0 ++ veslf %v14, %v17, 1074(%r5) ++ ++#CHECK: veslg %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x30] ++#CHECK: veslg %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x30,0x30] ++#CHECK: veslg %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x30] ++#CHECK: veslg %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x30] ++#CHECK: veslg %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x30] ++#CHECK: veslg %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x34,0x30] ++ ++ veslg %v0, %v0, 0 ++ veslg %v0, %v0, 4095 ++ veslg %v0, %v0, 0(%r15) ++ veslg %v0, %v31, 0 ++ veslg %v31, %v0, 0 ++ veslg %v14, %v17, 1074(%r5) ++ ++#CHECK: veslh %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x30] ++#CHECK: veslh %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x10,0x30] ++#CHECK: veslh %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x30] ++#CHECK: veslh %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x30] ++#CHECK: veslh %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x30] ++#CHECK: veslh %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x14,0x30] ++ ++ veslh %v0, %v0, 0 ++ veslh %v0, %v0, 4095 ++ veslh %v0, %v0, 0(%r15) ++ veslh %v0, %v31, 0 ++ veslh %v31, %v0, 0 ++ veslh %v14, %v17, 1074(%r5) ++ ++#CHECK: vesravb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x7a] ++#CHECK: vesravb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x7a] ++#CHECK: vesravb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x7a] ++#CHECK: vesravb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x7a] ++#CHECK: vesravb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x7a] ++ ++ vesravb %v0, %v0, %v0 ++ vesravb %v0, %v0, %v31 ++ vesravb %v0, %v31, %v0 ++ vesravb %v31, %v0, %v0 ++ vesravb %v18, %v3, %v20 ++ ++#CHECK: vesravf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x7a] ++#CHECK: vesravf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x7a] ++#CHECK: vesravf %v0, %v31, %v0 # encoding: 
[0xe7,0x0f,0x00,0x00,0x24,0x7a] ++#CHECK: vesravf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x7a] ++#CHECK: vesravf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x7a] ++ ++ vesravf %v0, %v0, %v0 ++ vesravf %v0, %v0, %v31 ++ vesravf %v0, %v31, %v0 ++ vesravf %v31, %v0, %v0 ++ vesravf %v18, %v3, %v20 ++ ++#CHECK: vesravg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x7a] ++#CHECK: vesravg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x7a] ++#CHECK: vesravg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x7a] ++#CHECK: vesravg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x7a] ++#CHECK: vesravg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x7a] ++ ++ vesravg %v0, %v0, %v0 ++ vesravg %v0, %v0, %v31 ++ vesravg %v0, %v31, %v0 ++ vesravg %v31, %v0, %v0 ++ vesravg %v18, %v3, %v20 ++ ++#CHECK: vesravh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x7a] ++#CHECK: vesravh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x7a] ++#CHECK: vesravh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x7a] ++#CHECK: vesravh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x7a] ++#CHECK: vesravh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x7a] ++ ++ vesravh %v0, %v0, %v0 ++ vesravh %v0, %v0, %v31 ++ vesravh %v0, %v31, %v0 ++ vesravh %v31, %v0, %v0 ++ vesravh %v18, %v3, %v20 ++ ++#CHECK: vesrab %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x3a] ++#CHECK: vesrab %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x3a] ++#CHECK: vesrab %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x3a] ++#CHECK: vesrab %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x3a] ++#CHECK: vesrab %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x3a] ++#CHECK: vesrab %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x04,0x3a] ++ ++ vesrab %v0, %v0, 0 ++ vesrab %v0, %v0, 4095 ++ vesrab %v0, %v0, 0(%r15) ++ vesrab %v0, %v31, 0 ++ vesrab %v31, %v0, 0 ++ vesrab %v14, %v17, 1074(%r5) ++ ++#CHECK: vesraf %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x3a] ++#CHECK: vesraf %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x20,0x3a] ++#CHECK: vesraf %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x3a] ++#CHECK: vesraf %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x3a] ++#CHECK: vesraf %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x3a] ++#CHECK: vesraf %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x24,0x3a] ++ ++ vesraf %v0, %v0, 0 ++ vesraf %v0, %v0, 4095 ++ vesraf %v0, %v0, 0(%r15) ++ vesraf %v0, %v31, 0 ++ vesraf %v31, %v0, 0 ++ vesraf %v14, %v17, 1074(%r5) ++ ++#CHECK: vesrag %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x3a] ++#CHECK: vesrag %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x30,0x3a] ++#CHECK: vesrag %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x3a] ++#CHECK: vesrag %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x3a] ++#CHECK: vesrag %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x3a] ++#CHECK: vesrag %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x34,0x3a] ++ ++ vesrag %v0, %v0, 0 ++ vesrag %v0, %v0, 4095 ++ vesrag %v0, %v0, 0(%r15) ++ vesrag %v0, %v31, 0 ++ vesrag %v31, %v0, 0 ++ vesrag %v14, %v17, 1074(%r5) ++ ++#CHECK: vesrah %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x3a] ++#CHECK: vesrah %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x10,0x3a] ++#CHECK: vesrah %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x3a] ++#CHECK: vesrah %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x3a] ++#CHECK: vesrah %v31, %v0, 0 # 
encoding: [0xe7,0xf0,0x00,0x00,0x18,0x3a] ++#CHECK: vesrah %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x14,0x3a] ++ ++ vesrah %v0, %v0, 0 ++ vesrah %v0, %v0, 4095 ++ vesrah %v0, %v0, 0(%r15) ++ vesrah %v0, %v31, 0 ++ vesrah %v31, %v0, 0 ++ vesrah %v14, %v17, 1074(%r5) ++ ++#CHECK: vesrlvb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x78] ++#CHECK: vesrlvb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x78] ++#CHECK: vesrlvb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x78] ++#CHECK: vesrlvb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x78] ++#CHECK: vesrlvb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x78] ++ ++ vesrlvb %v0, %v0, %v0 ++ vesrlvb %v0, %v0, %v31 ++ vesrlvb %v0, %v31, %v0 ++ vesrlvb %v31, %v0, %v0 ++ vesrlvb %v18, %v3, %v20 ++ ++#CHECK: vesrlvf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x78] ++#CHECK: vesrlvf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x78] ++#CHECK: vesrlvf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x78] ++#CHECK: vesrlvf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x78] ++#CHECK: vesrlvf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x78] ++ ++ vesrlvf %v0, %v0, %v0 ++ vesrlvf %v0, %v0, %v31 ++ vesrlvf %v0, %v31, %v0 ++ vesrlvf %v31, %v0, %v0 ++ vesrlvf %v18, %v3, %v20 ++ ++#CHECK: vesrlvg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x78] ++#CHECK: vesrlvg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x78] ++#CHECK: vesrlvg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x78] ++#CHECK: vesrlvg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x78] ++#CHECK: vesrlvg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x78] ++ ++ vesrlvg %v0, %v0, %v0 ++ vesrlvg %v0, %v0, %v31 ++ vesrlvg %v0, %v31, %v0 ++ vesrlvg %v31, %v0, %v0 ++ vesrlvg %v18, %v3, %v20 ++ ++#CHECK: vesrlvh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x78] ++#CHECK: vesrlvh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x78] ++#CHECK: vesrlvh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x78] ++#CHECK: vesrlvh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x78] ++#CHECK: vesrlvh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x78] ++ ++ vesrlvh %v0, %v0, %v0 ++ vesrlvh %v0, %v0, %v31 ++ vesrlvh %v0, %v31, %v0 ++ vesrlvh %v31, %v0, %v0 ++ vesrlvh %v18, %v3, %v20 ++ ++#CHECK: vesrlb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x38] ++#CHECK: vesrlb %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x38] ++#CHECK: vesrlb %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x38] ++#CHECK: vesrlb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x38] ++#CHECK: vesrlb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x38] ++#CHECK: vesrlb %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x04,0x38] ++ ++ vesrlb %v0, %v0, 0 ++ vesrlb %v0, %v0, 4095 ++ vesrlb %v0, %v0, 0(%r15) ++ vesrlb %v0, %v31, 0 ++ vesrlb %v31, %v0, 0 ++ vesrlb %v14, %v17, 1074(%r5) ++ ++#CHECK: vesrlf %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x38] ++#CHECK: vesrlf %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x20,0x38] ++#CHECK: vesrlf %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x38] ++#CHECK: vesrlf %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x38] ++#CHECK: vesrlf %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x38] ++#CHECK: vesrlf %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x24,0x38] ++ ++ vesrlf %v0, %v0, 0 ++ vesrlf %v0, %v0, 4095 ++ vesrlf %v0, %v0, 0(%r15) ++ vesrlf %v0, %v31, 0 ++ 
vesrlf %v31, %v0, 0 ++ vesrlf %v14, %v17, 1074(%r5) ++ ++#CHECK: vesrlg %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x38] ++#CHECK: vesrlg %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x30,0x38] ++#CHECK: vesrlg %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x38] ++#CHECK: vesrlg %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x38] ++#CHECK: vesrlg %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x38] ++#CHECK: vesrlg %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x34,0x38] ++ ++ vesrlg %v0, %v0, 0 ++ vesrlg %v0, %v0, 4095 ++ vesrlg %v0, %v0, 0(%r15) ++ vesrlg %v0, %v31, 0 ++ vesrlg %v31, %v0, 0 ++ vesrlg %v14, %v17, 1074(%r5) ++ ++#CHECK: vesrlh %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x38] ++#CHECK: vesrlh %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x10,0x38] ++#CHECK: vesrlh %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x38] ++#CHECK: vesrlh %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x38] ++#CHECK: vesrlh %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x38] ++#CHECK: vesrlh %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x14,0x38] ++ ++ vesrlh %v0, %v0, 0 ++ vesrlh %v0, %v0, 4095 ++ vesrlh %v0, %v0, 0(%r15) ++ vesrlh %v0, %v31, 0 ++ vesrlh %v31, %v0, 0 ++ vesrlh %v14, %v17, 1074(%r5) ++ ++#CHECK: vfadb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xe3] ++#CHECK: vfadb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xe3] ++#CHECK: vfadb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xe3] ++#CHECK: vfadb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xe3] ++#CHECK: vfadb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xe3] ++ ++ vfadb %v0, %v0, %v0 ++ vfadb %v0, %v0, %v31 ++ vfadb %v0, %v31, %v0 ++ vfadb %v31, %v0, %v0 ++ vfadb %v18, %v3, %v20 ++ ++#CHECK: vfaeb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x82] ++#CHECK: vfaeb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x82] ++#CHECK: vfaeb %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x00,0x82] ++#CHECK: vfaeb %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x82] ++#CHECK: vfaeb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x82] ++#CHECK: vfaeb %v0, %v15, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x82] ++#CHECK: vfaeb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x82] ++#CHECK: vfaeb %v15, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x82] ++#CHECK: vfaeb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x82] ++#CHECK: vfaeb %v18, %v3, %v20, 4 # encoding: [0xe7,0x23,0x40,0x40,0x0a,0x82] ++#CHECK: vfaeb %v18, %v3, %v20, 15 # encoding: [0xe7,0x23,0x40,0xf0,0x0a,0x82] ++#CHECK: vfaebs %v18, %v3, %v20, 8 # encoding: [0xe7,0x23,0x40,0x90,0x0a,0x82] ++#CHECK: vfaezb %v18, %v3, %v20, 4 # encoding: [0xe7,0x23,0x40,0x60,0x0a,0x82] ++#CHECK: vfaezbs %v18, %v3, %v20, 8 # encoding: [0xe7,0x23,0x40,0xb0,0x0a,0x82] ++#CHECK: vfaezbs %v18, %v3, %v20, 15 # encoding: [0xe7,0x23,0x40,0xf0,0x0a,0x82] ++ ++ vfaeb %v0, %v0, %v0 ++ vfaeb %v0, %v0, %v0, 0 ++ vfaeb %v0, %v0, %v0, 12 ++ vfaeb %v0, %v0, %v15 ++ vfaeb %v0, %v0, %v31 ++ vfaeb %v0, %v15, %v0 ++ vfaeb %v0, %v31, %v0 ++ vfaeb %v15, %v0, %v0 ++ vfaeb %v31, %v0, %v0 ++ vfaeb %v18, %v3, %v20, 4 ++ vfaeb %v18, %v3, %v20, 15 ++ vfaebs %v18, %v3, %v20, 8 ++ vfaezb %v18, %v3, %v20, 4 ++ vfaezbs %v18, %v3, %v20, 8 ++ vfaezbs %v18, %v3, %v20, 15 ++ ++#CHECK: vfaef %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x82] ++#CHECK: vfaef %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x82] ++#CHECK: vfaef %v0, 
%v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x20,0x82] ++#CHECK: vfaef %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x82] ++#CHECK: vfaef %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x82] ++#CHECK: vfaef %v0, %v15, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x82] ++#CHECK: vfaef %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x82] ++#CHECK: vfaef %v15, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x82] ++#CHECK: vfaef %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x82] ++#CHECK: vfaef %v18, %v3, %v20, 4 # encoding: [0xe7,0x23,0x40,0x40,0x2a,0x82] ++#CHECK: vfaef %v18, %v3, %v20, 15 # encoding: [0xe7,0x23,0x40,0xf0,0x2a,0x82] ++#CHECK: vfaefs %v18, %v3, %v20, 8 # encoding: [0xe7,0x23,0x40,0x90,0x2a,0x82] ++#CHECK: vfaezf %v18, %v3, %v20, 4 # encoding: [0xe7,0x23,0x40,0x60,0x2a,0x82] ++#CHECK: vfaezfs %v18, %v3, %v20, 8 # encoding: [0xe7,0x23,0x40,0xb0,0x2a,0x82] ++#CHECK: vfaezfs %v18, %v3, %v20, 15 # encoding: [0xe7,0x23,0x40,0xf0,0x2a,0x82] ++ ++ vfaef %v0, %v0, %v0 ++ vfaef %v0, %v0, %v0, 0 ++ vfaef %v0, %v0, %v0, 12 ++ vfaef %v0, %v0, %v15 ++ vfaef %v0, %v0, %v31 ++ vfaef %v0, %v15, %v0 ++ vfaef %v0, %v31, %v0 ++ vfaef %v15, %v0, %v0 ++ vfaef %v31, %v0, %v0 ++ vfaef %v18, %v3, %v20, 4 ++ vfaef %v18, %v3, %v20, 15 ++ vfaefs %v18, %v3, %v20, 8 ++ vfaezf %v18, %v3, %v20, 4 ++ vfaezfs %v18, %v3, %v20, 8 ++ vfaezfs %v18, %v3, %v20, 15 ++ ++#CHECK: vfaeh %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x82] ++#CHECK: vfaeh %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x82] ++#CHECK: vfaeh %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x10,0x82] ++#CHECK: vfaeh %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x82] ++#CHECK: vfaeh %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x82] ++#CHECK: vfaeh %v0, %v15, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x82] ++#CHECK: vfaeh %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x82] ++#CHECK: vfaeh %v15, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x82] ++#CHECK: vfaeh %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x82] ++#CHECK: vfaeh %v18, %v3, %v20, 4 # encoding: [0xe7,0x23,0x40,0x40,0x1a,0x82] ++#CHECK: vfaeh %v18, %v3, %v20, 15 # encoding: [0xe7,0x23,0x40,0xf0,0x1a,0x82] ++#CHECK: vfaehs %v18, %v3, %v20, 8 # encoding: [0xe7,0x23,0x40,0x90,0x1a,0x82] ++#CHECK: vfaezh %v18, %v3, %v20, 4 # encoding: [0xe7,0x23,0x40,0x60,0x1a,0x82] ++#CHECK: vfaezhs %v18, %v3, %v20, 8 # encoding: [0xe7,0x23,0x40,0xb0,0x1a,0x82] ++#CHECK: vfaezhs %v18, %v3, %v20, 15 # encoding: [0xe7,0x23,0x40,0xf0,0x1a,0x82] ++ ++ vfaeh %v0, %v0, %v0 ++ vfaeh %v0, %v0, %v0, 0 ++ vfaeh %v0, %v0, %v0, 12 ++ vfaeh %v0, %v0, %v15 ++ vfaeh %v0, %v0, %v31 ++ vfaeh %v0, %v15, %v0 ++ vfaeh %v0, %v31, %v0 ++ vfaeh %v15, %v0, %v0 ++ vfaeh %v31, %v0, %v0 ++ vfaeh %v18, %v3, %v20, 4 ++ vfaeh %v18, %v3, %v20, 15 ++ vfaehs %v18, %v3, %v20, 8 ++ vfaezh %v18, %v3, %v20, 4 ++ vfaezhs %v18, %v3, %v20, 8 ++ vfaezhs %v18, %v3, %v20, 15 ++ ++#CHECK: vfcedb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xe8] ++#CHECK: vfcedb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xe8] ++#CHECK: vfcedb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xe8] ++#CHECK: vfcedb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xe8] ++#CHECK: vfcedb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xe8] ++ ++ vfcedb %v0, %v0, %v0 ++ vfcedb %v0, %v0, %v31 ++ vfcedb %v0, %v31, %v0 ++ vfcedb %v31, %v0, %v0 ++ vfcedb %v18, %v3, %v20 ++ ++#CHECK: vfcedbs %v0, %v0, %v0 # encoding: 
[0xe7,0x00,0x00,0x10,0x30,0xe8] ++#CHECK: vfcedbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x10,0x32,0xe8] ++#CHECK: vfcedbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x34,0xe8] ++#CHECK: vfcedbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x38,0xe8] ++#CHECK: vfcedbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x10,0x3a,0xe8] ++ ++ vfcedbs %v0, %v0, %v0 ++ vfcedbs %v0, %v0, %v31 ++ vfcedbs %v0, %v31, %v0 ++ vfcedbs %v31, %v0, %v0 ++ vfcedbs %v18, %v3, %v20 ++ ++#CHECK: vfchdb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xeb] ++#CHECK: vfchdb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xeb] ++#CHECK: vfchdb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xeb] ++#CHECK: vfchdb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xeb] ++#CHECK: vfchdb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xeb] ++ ++ vfchdb %v0, %v0, %v0 ++ vfchdb %v0, %v0, %v31 ++ vfchdb %v0, %v31, %v0 ++ vfchdb %v31, %v0, %v0 ++ vfchdb %v18, %v3, %v20 ++ ++#CHECK: vfchdbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x30,0xeb] ++#CHECK: vfchdbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x10,0x32,0xeb] ++#CHECK: vfchdbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x34,0xeb] ++#CHECK: vfchdbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x38,0xeb] ++#CHECK: vfchdbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x10,0x3a,0xeb] ++ ++ vfchdbs %v0, %v0, %v0 ++ vfchdbs %v0, %v0, %v31 ++ vfchdbs %v0, %v31, %v0 ++ vfchdbs %v31, %v0, %v0 ++ vfchdbs %v18, %v3, %v20 ++ ++#CHECK: vfchedb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xea] ++#CHECK: vfchedb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xea] ++#CHECK: vfchedb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xea] ++#CHECK: vfchedb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xea] ++#CHECK: vfchedb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xea] ++ ++ vfchedb %v0, %v0, %v0 ++ vfchedb %v0, %v0, %v31 ++ vfchedb %v0, %v31, %v0 ++ vfchedb %v31, %v0, %v0 ++ vfchedb %v18, %v3, %v20 ++ ++#CHECK: vfchedbs %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x30,0xea] ++#CHECK: vfchedbs %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x10,0x32,0xea] ++#CHECK: vfchedbs %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x10,0x34,0xea] ++#CHECK: vfchedbs %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x38,0xea] ++#CHECK: vfchedbs %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x10,0x3a,0xea] ++ ++ vfchedbs %v0, %v0, %v0 ++ vfchedbs %v0, %v0, %v31 ++ vfchedbs %v0, %v31, %v0 ++ vfchedbs %v31, %v0, %v0 ++ vfchedbs %v18, %v3, %v20 ++ ++#CHECK: vfddb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xe5] ++#CHECK: vfddb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xe5] ++#CHECK: vfddb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xe5] ++#CHECK: vfddb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xe5] ++#CHECK: vfddb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xe5] ++ ++ vfddb %v0, %v0, %v0 ++ vfddb %v0, %v0, %v31 ++ vfddb %v0, %v31, %v0 ++ vfddb %v31, %v0, %v0 ++ vfddb %v18, %v3, %v20 ++ ++#CHECK: vfeeb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x80] ++#CHECK: vfeeb %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x80] ++#CHECK: vfeeb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x80] ++#CHECK: vfeeb %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x80] ++#CHECK: vfeeb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x80] ++#CHECK: vfeeb %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x80] ++#CHECK: vfeeb %v31, %v0, %v0 # encoding: 
[0xe7,0xf0,0x00,0x00,0x08,0x80] ++#CHECK: vfeeb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x80] ++#CHECK: vfeebs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x04,0x80] ++#CHECK: vfeezb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x20,0x0a,0x80] ++#CHECK: vfeezbs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x30,0x04,0x80] ++ ++ vfeeb %v0, %v0, %v0 ++ vfeeb %v0, %v0, %v15 ++ vfeeb %v0, %v0, %v31 ++ vfeeb %v0, %v15, %v0 ++ vfeeb %v0, %v31, %v0 ++ vfeeb %v15, %v0, %v0 ++ vfeeb %v31, %v0, %v0 ++ vfeeb %v18, %v3, %v20 ++ vfeebs %v5, %v22, %v7 ++ vfeezb %v18, %v3, %v20 ++ vfeezbs %v5, %v22, %v7 ++ ++#CHECK: vfeef %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x80] ++#CHECK: vfeef %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x80] ++#CHECK: vfeef %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x80] ++#CHECK: vfeef %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x80] ++#CHECK: vfeef %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x80] ++#CHECK: vfeef %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x80] ++#CHECK: vfeef %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x80] ++#CHECK: vfeef %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x80] ++#CHECK: vfeefs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x24,0x80] ++#CHECK: vfeezf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x20,0x2a,0x80] ++#CHECK: vfeezfs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x30,0x24,0x80] ++ ++ vfeef %v0, %v0, %v0 ++ vfeef %v0, %v0, %v15 ++ vfeef %v0, %v0, %v31 ++ vfeef %v0, %v15, %v0 ++ vfeef %v0, %v31, %v0 ++ vfeef %v15, %v0, %v0 ++ vfeef %v31, %v0, %v0 ++ vfeef %v18, %v3, %v20 ++ vfeefs %v5, %v22, %v7 ++ vfeezf %v18, %v3, %v20 ++ vfeezfs %v5, %v22, %v7 ++ ++#CHECK: vfeeh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x80] ++#CHECK: vfeeh %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x80] ++#CHECK: vfeeh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x80] ++#CHECK: vfeeh %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x80] ++#CHECK: vfeeh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x80] ++#CHECK: vfeeh %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x80] ++#CHECK: vfeeh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x80] ++#CHECK: vfeeh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x80] ++#CHECK: vfeehs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x14,0x80] ++#CHECK: vfeezh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x20,0x1a,0x80] ++#CHECK: vfeezhs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x30,0x14,0x80] ++ ++ vfeeh %v0, %v0, %v0 ++ vfeeh %v0, %v0, %v15 ++ vfeeh %v0, %v0, %v31 ++ vfeeh %v0, %v15, %v0 ++ vfeeh %v0, %v31, %v0 ++ vfeeh %v15, %v0, %v0 ++ vfeeh %v31, %v0, %v0 ++ vfeeh %v18, %v3, %v20 ++ vfeehs %v5, %v22, %v7 ++ vfeezh %v18, %v3, %v20 ++ vfeezhs %v5, %v22, %v7 ++ ++#CHECK: vfeneb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x81] ++#CHECK: vfeneb %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x81] ++#CHECK: vfeneb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x81] ++#CHECK: vfeneb %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x81] ++#CHECK: vfeneb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x81] ++#CHECK: vfeneb %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x81] ++#CHECK: vfeneb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x81] ++#CHECK: vfeneb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x81] ++#CHECK: vfenebs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x04,0x81] ++#CHECK: vfenezb %v18, %v3, %v20 # encoding:
[0xe7,0x23,0x40,0x20,0x0a,0x81] ++#CHECK: vfenezbs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x30,0x04,0x81] ++ ++ vfeneb %v0, %v0, %v0 ++ vfeneb %v0, %v0, %v15 ++ vfeneb %v0, %v0, %v31 ++ vfeneb %v0, %v15, %v0 ++ vfeneb %v0, %v31, %v0 ++ vfeneb %v15, %v0, %v0 ++ vfeneb %v31, %v0, %v0 ++ vfeneb %v18, %v3, %v20 ++ vfenebs %v5, %v22, %v7 ++ vfenezb %v18, %v3, %v20 ++ vfenezbs %v5, %v22, %v7 ++ ++#CHECK: vfenef %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x81] ++#CHECK: vfenef %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x81] ++#CHECK: vfenef %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x81] ++#CHECK: vfenef %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x81] ++#CHECK: vfenef %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x81] ++#CHECK: vfenef %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x81] ++#CHECK: vfenef %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x81] ++#CHECK: vfenef %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x81] ++#CHECK: vfenefs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x24,0x81] ++#CHECK: vfenezf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x20,0x2a,0x81] ++#CHECK: vfenezfs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x30,0x24,0x81] ++ ++ vfenef %v0, %v0, %v0 ++ vfenef %v0, %v0, %v15 ++ vfenef %v0, %v0, %v31 ++ vfenef %v0, %v15, %v0 ++ vfenef %v0, %v31, %v0 ++ vfenef %v15, %v0, %v0 ++ vfenef %v31, %v0, %v0 ++ vfenef %v18, %v3, %v20 ++ vfenefs %v5, %v22, %v7 ++ vfenezf %v18, %v3, %v20 ++ vfenezfs %v5, %v22, %v7 ++ ++#CHECK: vfeneh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x81] ++#CHECK: vfeneh %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x81] ++#CHECK: vfeneh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x81] ++#CHECK: vfeneh %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x81] ++#CHECK: vfeneh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x81] ++#CHECK: vfeneh %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x81] ++#CHECK: vfeneh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x81] ++#CHECK: vfeneh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x81] ++#CHECK: vfenehs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x14,0x81] ++#CHECK: vfenezh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x20,0x1a,0x81] ++#CHECK: vfenezhs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x30,0x14,0x81] ++ ++ vfeneh %v0, %v0, %v0 ++ vfeneh %v0, %v0, %v15 ++ vfeneh %v0, %v0, %v31 ++ vfeneh %v0, %v15, %v0 ++ vfeneh %v0, %v31, %v0 ++ vfeneh %v15, %v0, %v0 ++ vfeneh %v31, %v0, %v0 ++ vfeneh %v18, %v3, %v20 ++ vfenehs %v5, %v22, %v7 ++ vfenezh %v18, %v3, %v20 ++ vfenezhs %v5, %v22, %v7 ++ ++#CHECK: vfidb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xc7] ++#CHECK: vfidb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x30,0xc7] ++#CHECK: vfidb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xc7] ++#CHECK: vfidb %v0, %v0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc7] ++#CHECK: vfidb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xc7] ++#CHECK: vfidb %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xc7] ++#CHECK: vfidb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x34,0xc7] ++ ++ vfidb %v0, %v0, 0, 0 ++ vfidb %v0, %v0, 0, 15 ++ vfidb %v0, %v0, 4, 0 ++ vfidb %v0, %v0, 12, 0 ++ vfidb %v0, %v31, 0, 0 ++ vfidb %v31, %v0, 0, 0 ++ vfidb %v14, %v17, 4, 10 ++ ++#CHECK: vistrb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x5c] ++#CHECK: vistrb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x5c] ++#CHECK: vistrb %v0, %v31 # encoding:
[0xe7,0x0f,0x00,0x00,0x04,0x5c] ++#CHECK: vistrb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x5c] ++#CHECK: vistrb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x5c] ++#CHECK: vistrb %v18, %v3 # encoding: [0xe7,0x23,0x00,0x00,0x08,0x5c] ++#CHECK: vistrbs %v5, %v22 # encoding: [0xe7,0x56,0x00,0x10,0x04,0x5c] ++ ++ vistrb %v0, %v0 ++ vistrb %v0, %v15 ++ vistrb %v0, %v31 ++ vistrb %v15, %v0 ++ vistrb %v31, %v0 ++ vistrb %v18, %v3 ++ vistrbs %v5, %v22 ++ ++#CHECK: vistrf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x5c] ++#CHECK: vistrf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x5c] ++#CHECK: vistrf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x5c] ++#CHECK: vistrf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x5c] ++#CHECK: vistrf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x5c] ++#CHECK: vistrf %v18, %v3 # encoding: [0xe7,0x23,0x00,0x00,0x28,0x5c] ++#CHECK: vistrfs %v5, %v22 # encoding: [0xe7,0x56,0x00,0x10,0x24,0x5c] ++ ++ vistrf %v0, %v0 ++ vistrf %v0, %v15 ++ vistrf %v0, %v31 ++ vistrf %v15, %v0 ++ vistrf %v31, %v0 ++ vistrf %v18, %v3 ++ vistrfs %v5, %v22 ++ ++#CHECK: vistrh %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x5c] ++#CHECK: vistrh %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x5c] ++#CHECK: vistrh %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x5c] ++#CHECK: vistrh %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x5c] ++#CHECK: vistrh %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x5c] ++#CHECK: vistrh %v18, %v3 # encoding: [0xe7,0x23,0x00,0x00,0x18,0x5c] ++#CHECK: vistrhs %v5, %v22 # encoding: [0xe7,0x56,0x00,0x10,0x14,0x5c] ++ ++ vistrh %v0, %v0 ++ vistrh %v0, %v15 ++ vistrh %v0, %v31 ++ vistrh %v15, %v0 ++ vistrh %v31, %v0 ++ vistrh %v18, %v3 ++ vistrhs %v5, %v22 ++ ++#CHECK: vflcdb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xcc] ++#CHECK: vflcdb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0xcc] ++#CHECK: vflcdb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xcc] ++#CHECK: vflcdb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0xcc] ++#CHECK: vflcdb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xcc] ++#CHECK: vflcdb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0xcc] ++ ++ vflcdb %v0, %v0 ++ vflcdb %v0, %v15 ++ vflcdb %v0, %v31 ++ vflcdb %v15, %v0 ++ vflcdb %v31, %v0 ++ vflcdb %v14, %v17 ++ ++#CHECK: vflndb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x10,0x30,0xcc] ++#CHECK: vflndb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x10,0x30,0xcc] ++#CHECK: vflndb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x10,0x34,0xcc] ++#CHECK: vflndb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x30,0xcc] ++#CHECK: vflndb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x10,0x38,0xcc] ++#CHECK: vflndb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x10,0x34,0xcc] ++ ++ vflndb %v0, %v0 ++ vflndb %v0, %v15 ++ vflndb %v0, %v31 ++ vflndb %v15, %v0 ++ vflndb %v31, %v0 ++ vflndb %v14, %v17 ++ ++#CHECK: vflpdb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x20,0x30,0xcc] ++#CHECK: vflpdb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x20,0x30,0xcc] ++#CHECK: vflpdb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x20,0x34,0xcc] ++#CHECK: vflpdb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x20,0x30,0xcc] ++#CHECK: vflpdb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x20,0x38,0xcc] ++#CHECK: vflpdb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x20,0x34,0xcc] ++ ++ vflpdb %v0, %v0 ++ vflpdb %v0, %v15 ++ vflpdb %v0, %v31 ++ vflpdb %v15, %v0 ++ vflpdb %v31, %v0 ++ vflpdb %v14, %v17 ++ ++#CHECK: vfmadb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0x8f] ++#CHECK: vfmadb %v0, %v0, %v0, %v31 # encoding:
[0xe7,0x00,0x03,0x00,0xf1,0x8f] ++#CHECK: vfmadb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0x8f] ++#CHECK: vfmadb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0x8f] ++#CHECK: vfmadb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0x8f] ++#CHECK: vfmadb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x00,0x97,0x8f] ++ ++ vfmadb %v0, %v0, %v0, %v0 ++ vfmadb %v0, %v0, %v0, %v31 ++ vfmadb %v0, %v0, %v31, %v0 ++ vfmadb %v0, %v31, %v0, %v0 ++ vfmadb %v31, %v0, %v0, %v0 ++ vfmadb %v13, %v17, %v21, %v25 ++ ++#CHECK: vfmdb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xe7] ++#CHECK: vfmdb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xe7] ++#CHECK: vfmdb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xe7] ++#CHECK: vfmdb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xe7] ++#CHECK: vfmdb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xe7] ++ ++ vfmdb %v0, %v0, %v0 ++ vfmdb %v0, %v0, %v31 ++ vfmdb %v0, %v31, %v0 ++ vfmdb %v31, %v0, %v0 ++ vfmdb %v18, %v3, %v20 ++ ++#CHECK: vfmsdb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0x8e] ++#CHECK: vfmsdb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x03,0x00,0xf1,0x8e] ++#CHECK: vfmsdb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0x8e] ++#CHECK: vfmsdb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0x8e] ++#CHECK: vfmsdb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0x8e] ++#CHECK: vfmsdb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x00,0x97,0x8e] ++ ++ vfmsdb %v0, %v0, %v0, %v0 ++ vfmsdb %v0, %v0, %v0, %v31 ++ vfmsdb %v0, %v0, %v31, %v0 ++ vfmsdb %v0, %v31, %v0, %v0 ++ vfmsdb %v31, %v0, %v0, %v0 ++ vfmsdb %v13, %v17, %v21, %v25 ++ ++#CHECK: vfsdb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xe2] ++#CHECK: vfsdb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xe2] ++#CHECK: vfsdb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xe2] ++#CHECK: vfsdb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xe2] ++#CHECK: vfsdb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xe2] ++ ++ vfsdb %v0, %v0, %v0 ++ vfsdb %v0, %v0, %v31 ++ vfsdb %v0, %v31, %v0 ++ vfsdb %v31, %v0, %v0 ++ vfsdb %v18, %v3, %v20 ++ ++#CHECK: vfsqdb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xce] ++#CHECK: vfsqdb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0xce] ++#CHECK: vfsqdb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xce] ++#CHECK: vfsqdb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0xce] ++#CHECK: vfsqdb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xce] ++#CHECK: vfsqdb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0xce] ++ ++ vfsqdb %v0, %v0 ++ vfsqdb %v0, %v15 ++ vfsqdb %v0, %v31 ++ vfsqdb %v15, %v0 ++ vfsqdb %v31, %v0 ++ vfsqdb %v14, %v17 ++ ++#CHECK: vftcidb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x4a] ++#CHECK: vftcidb %v0, %v0, 4095 # encoding: [0xe7,0x00,0xff,0xf0,0x30,0x4a] ++#CHECK: vftcidb %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x4a] ++#CHECK: vftcidb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x4a] ++#CHECK: vftcidb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x4a] ++#CHECK: vftcidb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x4a] ++#CHECK: vftcidb %v4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x80,0x34,0x4a] ++ ++ vftcidb %v0, %v0, 0 ++ vftcidb %v0, %v0, 4095 ++ vftcidb %v0, %v15, 0 ++ vftcidb %v0, %v31, 0 ++ vftcidb %v15, %v0, 0 ++ vftcidb %v31, %v0, 0 ++ vftcidb %v4, %v21, 0x678 ++ ++#CHECK: vgbm %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x44] 
++#CHECK: vgbm %v0, 65535 # encoding: [0xe7,0x00,0xff,0xff,0x00,0x44] ++#CHECK: vgbm %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x44] ++#CHECK: vgbm %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x44] ++#CHECK: vgbm %v17, 4660 # encoding: [0xe7,0x10,0x12,0x34,0x08,0x44] ++ ++ vgbm %v0, 0 ++ vgbm %v0, 0xffff ++ vgbm %v15, 0 ++ vgbm %v31, 0 ++ vgbm %v17, 0x1234 ++ ++#CHECK: vgef %v0, 0(%v0), 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x13] ++#CHECK: vgef %v0, 0(%v0,%r1), 0 # encoding: [0xe7,0x00,0x10,0x00,0x00,0x13] ++#CHECK: vgef %v0, 0(%v0,%r1), 3 # encoding: [0xe7,0x00,0x10,0x00,0x30,0x13] ++#CHECK: vgef %v0, 0(%v0,%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x13] ++#CHECK: vgef %v0, 0(%v15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x13] ++#CHECK: vgef %v0, 0(%v31,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x04,0x13] ++#CHECK: vgef %v0, 4095(%v0,%r1), 0 # encoding: [0xe7,0x00,0x1f,0xff,0x00,0x13] ++#CHECK: vgef %v15, 0(%v0,%r1), 0 # encoding: [0xe7,0xf0,0x10,0x00,0x00,0x13] ++#CHECK: vgef %v31, 0(%v0,%r1), 0 # encoding: [0xe7,0xf0,0x10,0x00,0x08,0x13] ++#CHECK: vgef %v10, 1000(%v19,%r7), 1 # encoding: [0xe7,0xa3,0x73,0xe8,0x14,0x13] ++ ++ vgef %v0, 0(%v0), 0 ++ vgef %v0, 0(%v0,%r1), 0 ++ vgef %v0, 0(%v0,%r1), 3 ++ vgef %v0, 0(%v0,%r15), 0 ++ vgef %v0, 0(%v15,%r1), 0 ++ vgef %v0, 0(%v31,%r1), 0 ++ vgef %v0, 4095(%v0, %r1), 0 ++ vgef %v15, 0(%v0,%r1), 0 ++ vgef %v31, 0(%v0,%r1), 0 ++ vgef %v10, 1000(%v19,%r7), 1 ++ ++#CHECK: vgeg %v0, 0(%v0), 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x12] ++#CHECK: vgeg %v0, 0(%v0,%r1), 0 # encoding: [0xe7,0x00,0x10,0x00,0x00,0x12] ++#CHECK: vgeg %v0, 0(%v0,%r1), 1 # encoding: [0xe7,0x00,0x10,0x00,0x10,0x12] ++#CHECK: vgeg %v0, 0(%v0,%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x12] ++#CHECK: vgeg %v0, 0(%v15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x12] ++#CHECK: vgeg %v0, 0(%v31,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x04,0x12] ++#CHECK: vgeg %v0, 4095(%v0,%r1), 0 # encoding: [0xe7,0x00,0x1f,0xff,0x00,0x12] ++#CHECK: vgeg %v15, 0(%v0,%r1), 0 # encoding: [0xe7,0xf0,0x10,0x00,0x00,0x12] ++#CHECK: vgeg %v31, 0(%v0,%r1), 0 # encoding: [0xe7,0xf0,0x10,0x00,0x08,0x12] ++#CHECK: vgeg %v10, 1000(%v19,%r7), 1 # encoding: [0xe7,0xa3,0x73,0xe8,0x14,0x12] ++ ++ vgeg %v0, 0(%v0), 0 ++ vgeg %v0, 0(%v0,%r1), 0 ++ vgeg %v0, 0(%v0,%r1), 1 ++ vgeg %v0, 0(%v0,%r15), 0 ++ vgeg %v0, 0(%v15,%r1), 0 ++ vgeg %v0, 0(%v31,%r1), 0 ++ vgeg %v0, 4095(%v0,%r1), 0 ++ vgeg %v15, 0(%v0,%r1), 0 ++ vgeg %v31, 0(%v0,%r1), 0 ++ vgeg %v10, 1000(%v19,%r7), 1 ++ ++#CHECK: vgfmab %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xbc] ++#CHECK: vgfmab %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xbc] ++#CHECK: vgfmab %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xbc] ++#CHECK: vgfmab %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xbc] ++#CHECK: vgfmab %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xbc] ++#CHECK: vgfmab %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0xbc] ++ ++ vgfmab %v0, %v0, %v0, %v0 ++ vgfmab %v0, %v0, %v0, %v31 ++ vgfmab %v0, %v0, %v31, %v0 ++ vgfmab %v0, %v31, %v0, %v0 ++ vgfmab %v31, %v0, %v0, %v0 ++ vgfmab %v13, %v17, %v21, %v25 ++ ++#CHECK: vgfmaf %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0xbc] ++#CHECK: vgfmaf %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0xbc] ++#CHECK: vgfmaf %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0xbc] ++#CHECK: vgfmaf %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0xbc] ++#CHECK: vgfmaf %v31, %v0, %v0, %v0 # 
encoding: [0xe7,0xf0,0x02,0x00,0x08,0xbc] ++#CHECK: vgfmaf %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0xbc] ++ ++ vgfmaf %v0, %v0, %v0, %v0 ++ vgfmaf %v0, %v0, %v0, %v31 ++ vgfmaf %v0, %v0, %v31, %v0 ++ vgfmaf %v0, %v31, %v0, %v0 ++ vgfmaf %v31, %v0, %v0, %v0 ++ vgfmaf %v13, %v17, %v21, %v25 ++ ++#CHECK: vgfmag %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x03,0x00,0x00,0xbc] ++#CHECK: vgfmag %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x03,0x00,0xf1,0xbc] ++#CHECK: vgfmag %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf3,0x00,0x02,0xbc] ++#CHECK: vgfmag %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x03,0x00,0x04,0xbc] ++#CHECK: vgfmag %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x03,0x00,0x08,0xbc] ++#CHECK: vgfmag %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x00,0x97,0xbc] ++ ++ vgfmag %v0, %v0, %v0, %v0 ++ vgfmag %v0, %v0, %v0, %v31 ++ vgfmag %v0, %v0, %v31, %v0 ++ vgfmag %v0, %v31, %v0, %v0 ++ vgfmag %v31, %v0, %v0, %v0 ++ vgfmag %v13, %v17, %v21, %v25 ++ ++#CHECK: vgfmah %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x01,0x00,0x00,0xbc] ++#CHECK: vgfmah %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x01,0x00,0xf1,0xbc] ++#CHECK: vgfmah %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf1,0x00,0x02,0xbc] ++#CHECK: vgfmah %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x01,0x00,0x04,0xbc] ++#CHECK: vgfmah %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x01,0x00,0x08,0xbc] ++#CHECK: vgfmah %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x51,0x00,0x97,0xbc] ++ ++ vgfmah %v0, %v0, %v0, %v0 ++ vgfmah %v0, %v0, %v0, %v31 ++ vgfmah %v0, %v0, %v31, %v0 ++ vgfmah %v0, %v31, %v0, %v0 ++ vgfmah %v31, %v0, %v0, %v0 ++ vgfmah %v13, %v17, %v21, %v25 ++ ++#CHECK: vgfmb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xb4] ++#CHECK: vgfmb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xb4] ++#CHECK: vgfmb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xb4] ++#CHECK: vgfmb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xb4] ++#CHECK: vgfmb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xb4] ++ ++ vgfmb %v0, %v0, %v0 ++ vgfmb %v0, %v0, %v31 ++ vgfmb %v0, %v31, %v0 ++ vgfmb %v31, %v0, %v0 ++ vgfmb %v18, %v3, %v20 ++ ++#CHECK: vgfmf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xb4] ++#CHECK: vgfmf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xb4] ++#CHECK: vgfmf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xb4] ++#CHECK: vgfmf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xb4] ++#CHECK: vgfmf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xb4] ++ ++ vgfmf %v0, %v0, %v0 ++ vgfmf %v0, %v0, %v31 ++ vgfmf %v0, %v31, %v0 ++ vgfmf %v31, %v0, %v0 ++ vgfmf %v18, %v3, %v20 ++ ++#CHECK: vgfmg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xb4] ++#CHECK: vgfmg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xb4] ++#CHECK: vgfmg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xb4] ++#CHECK: vgfmg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xb4] ++#CHECK: vgfmg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xb4] ++ ++ vgfmg %v0, %v0, %v0 ++ vgfmg %v0, %v0, %v31 ++ vgfmg %v0, %v31, %v0 ++ vgfmg %v31, %v0, %v0 ++ vgfmg %v18, %v3, %v20 ++ ++#CHECK: vgfmh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xb4] ++#CHECK: vgfmh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xb4] ++#CHECK: vgfmh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xb4] ++#CHECK: vgfmh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xb4] ++#CHECK: vgfmh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xb4] ++ ++ 
vgfmh %v0, %v0, %v0 ++ vgfmh %v0, %v0, %v31 ++ vgfmh %v0, %v31, %v0 ++ vgfmh %v31, %v0, %v0 ++ vgfmh %v18, %v3, %v20 ++ ++#CHECK: vgmb %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x46] ++#CHECK: vgmb %v0, 0, 255 # encoding: [0xe7,0x00,0x00,0xff,0x00,0x46] ++#CHECK: vgmb %v0, 255, 0 # encoding: [0xe7,0x00,0xff,0x00,0x00,0x46] ++#CHECK: vgmb %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x46] ++#CHECK: vgmb %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x46] ++#CHECK: vgmb %v21, 2, 3 # encoding: [0xe7,0x50,0x02,0x03,0x08,0x46] ++ ++ vgmb %v0, 0, 0 ++ vgmb %v0, 0, 255 ++ vgmb %v0, 255, 0 ++ vgmb %v15, 0, 0 ++ vgmb %v31, 0, 0 ++ vgmb %v21, 2, 3 ++ ++#CHECK: vgmf %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x46] ++#CHECK: vgmf %v0, 0, 255 # encoding: [0xe7,0x00,0x00,0xff,0x20,0x46] ++#CHECK: vgmf %v0, 255, 0 # encoding: [0xe7,0x00,0xff,0x00,0x20,0x46] ++#CHECK: vgmf %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x46] ++#CHECK: vgmf %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x46] ++#CHECK: vgmf %v21, 2, 3 # encoding: [0xe7,0x50,0x02,0x03,0x28,0x46] ++ ++ vgmf %v0, 0, 0 ++ vgmf %v0, 0, 255 ++ vgmf %v0, 255, 0 ++ vgmf %v15, 0, 0 ++ vgmf %v31, 0, 0 ++ vgmf %v21, 2, 3 ++ ++#CHECK: vgmg %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x46] ++#CHECK: vgmg %v0, 0, 255 # encoding: [0xe7,0x00,0x00,0xff,0x30,0x46] ++#CHECK: vgmg %v0, 255, 0 # encoding: [0xe7,0x00,0xff,0x00,0x30,0x46] ++#CHECK: vgmg %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x46] ++#CHECK: vgmg %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x46] ++#CHECK: vgmg %v21, 2, 3 # encoding: [0xe7,0x50,0x02,0x03,0x38,0x46] ++ ++ vgmg %v0, 0, 0 ++ vgmg %v0, 0, 255 ++ vgmg %v0, 255, 0 ++ vgmg %v15, 0, 0 ++ vgmg %v31, 0, 0 ++ vgmg %v21, 2, 3 ++ ++#CHECK: vgmh %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x46] ++#CHECK: vgmh %v0, 0, 255 # encoding: [0xe7,0x00,0x00,0xff,0x10,0x46] ++#CHECK: vgmh %v0, 255, 0 # encoding: [0xe7,0x00,0xff,0x00,0x10,0x46] ++#CHECK: vgmh %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x46] ++#CHECK: vgmh %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x46] ++#CHECK: vgmh %v21, 2, 3 # encoding: [0xe7,0x50,0x02,0x03,0x18,0x46] ++ ++ vgmh %v0, 0, 0 ++ vgmh %v0, 0, 255 ++ vgmh %v0, 255, 0 ++ vgmh %v15, 0, 0 ++ vgmh %v31, 0, 0 ++ vgmh %v21, 2, 3 ++ ++#CHECK: vl %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x06] ++#CHECK: vl %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x06] ++#CHECK: vl %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x06] ++#CHECK: vl %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x06] ++#CHECK: vl %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x06] ++#CHECK: vl %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x06] ++#CHECK: vl %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x08,0x06] ++ ++ vl %v0, 0 ++ vl %v0, 4095 ++ vl %v0, 0(%r15) ++ vl %v0, 0(%r15,%r1) ++ vl %v15, 0 ++ vl %v31, 0 ++ vl %v18, 0x567(%r3,%r4) ++ ++#CHECK: vlbb %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x07] ++#CHECK: vlbb %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x07] ++#CHECK: vlbb %v0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x07] ++#CHECK: vlbb %v0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x07] ++#CHECK: vlbb %v0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x07] ++#CHECK: vlbb %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x07] ++#CHECK: vlbb %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x07] ++#CHECK: vlbb %v18, 1383(%r3,%r4), 8 # encoding: [0xe7,0x23,0x45,0x67,0x88,0x07] ++ ++ vlbb %v0, 0, 0 ++ vlbb %v0, 0, 15 ++ vlbb %v0, 4095, 0 ++ 
vlbb %v0, 0(%r15), 0 ++ vlbb %v0, 0(%r15,%r1), 0 ++ vlbb %v15, 0, 0 ++ vlbb %v31, 0, 0 ++ vlbb %v18, 1383(%r3,%r4), 8 ++ ++#CHECK: vlcb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xde] ++#CHECK: vlcb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xde] ++#CHECK: vlcb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xde] ++#CHECK: vlcb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xde] ++#CHECK: vlcb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xde] ++#CHECK: vlcb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0xde] ++ ++ vlcb %v0, %v0 ++ vlcb %v0, %v15 ++ vlcb %v0, %v31 ++ vlcb %v15, %v0 ++ vlcb %v31, %v0 ++ vlcb %v14, %v17 ++ ++#CHECK: vlcf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xde] ++#CHECK: vlcf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xde] ++#CHECK: vlcf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xde] ++#CHECK: vlcf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xde] ++#CHECK: vlcf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xde] ++#CHECK: vlcf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xde] ++ ++ vlcf %v0, %v0 ++ vlcf %v0, %v15 ++ vlcf %v0, %v31 ++ vlcf %v15, %v0 ++ vlcf %v31, %v0 ++ vlcf %v14, %v17 ++ ++#CHECK: vlcg %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xde] ++#CHECK: vlcg %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0xde] ++#CHECK: vlcg %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xde] ++#CHECK: vlcg %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0xde] ++#CHECK: vlcg %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xde] ++#CHECK: vlcg %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0xde] ++ ++ vlcg %v0, %v0 ++ vlcg %v0, %v15 ++ vlcg %v0, %v31 ++ vlcg %v15, %v0 ++ vlcg %v31, %v0 ++ vlcg %v14, %v17 ++ ++#CHECK: vlch %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xde] ++#CHECK: vlch %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0xde] ++#CHECK: vlch %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xde] ++#CHECK: vlch %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0xde] ++#CHECK: vlch %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xde] ++#CHECK: vlch %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0xde] ++ ++ vlch %v0, %v0 ++ vlch %v0, %v15 ++ vlch %v0, %v31 ++ vlch %v15, %v0 ++ vlch %v31, %v0 ++ vlch %v14, %v17 ++ ++#CHECK: vldeb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xc4] ++#CHECK: vldeb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xc4] ++#CHECK: vldeb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xc4] ++#CHECK: vldeb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xc4] ++#CHECK: vldeb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xc4] ++#CHECK: vldeb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xc4] ++ ++ vldeb %v0, %v0 ++ vldeb %v0, %v15 ++ vldeb %v0, %v31 ++ vldeb %v15, %v0 ++ vldeb %v31, %v0 ++ vldeb %v14, %v17 ++ ++#CHECK: vleb %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x00] ++#CHECK: vleb %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x00] ++#CHECK: vleb %v0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x00] ++#CHECK: vleb %v0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x00] ++#CHECK: vleb %v0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x00] ++#CHECK: vleb %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x00] ++#CHECK: vleb %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x00] ++#CHECK: vleb %v18, 1383(%r3,%r4), 8 # encoding: [0xe7,0x23,0x45,0x67,0x88,0x00] ++ ++ vleb %v0, 0, 0 ++ vleb %v0, 0, 15 ++ vleb %v0, 4095, 0 ++ vleb %v0, 0(%r15), 0 ++ vleb %v0, 0(%r15,%r1), 0 ++ vleb %v15, 0, 0 ++ vleb %v31, 0, 0 ++ vleb %v18, 1383(%r3,%r4), 8 ++ ++#CHECK: 
vledb %v0, %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xc5] ++#CHECK: vledb %v0, %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf0,0x30,0xc5] ++#CHECK: vledb %v0, %v0, 4, 0 # encoding: [0xe7,0x00,0x00,0x04,0x30,0xc5] ++#CHECK: vledb %v0, %v0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc5] ++#CHECK: vledb %v0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xc5] ++#CHECK: vledb %v31, %v0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xc5] ++#CHECK: vledb %v14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xa4,0x34,0xc5] ++ ++ vledb %v0, %v0, 0, 0 ++ vledb %v0, %v0, 0, 15 ++ vledb %v0, %v0, 4, 0 ++ vledb %v0, %v0, 12, 0 ++ vledb %v0, %v31, 0, 0 ++ vledb %v31, %v0, 0, 0 ++ vledb %v14, %v17, 4, 10 ++ ++#CHECK: vlef %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x03] ++#CHECK: vlef %v0, 0, 3 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x03] ++#CHECK: vlef %v0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x03] ++#CHECK: vlef %v0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x03] ++#CHECK: vlef %v0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x03] ++#CHECK: vlef %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x03] ++#CHECK: vlef %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x03] ++#CHECK: vlef %v18, 1383(%r3,%r4), 2 # encoding: [0xe7,0x23,0x45,0x67,0x28,0x03] ++ ++ vlef %v0, 0, 0 ++ vlef %v0, 0, 3 ++ vlef %v0, 4095, 0 ++ vlef %v0, 0(%r15), 0 ++ vlef %v0, 0(%r15,%r1), 0 ++ vlef %v15, 0, 0 ++ vlef %v31, 0, 0 ++ vlef %v18, 1383(%r3,%r4), 2 ++ ++#CHECK: vleg %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x02] ++#CHECK: vleg %v0, 0, 1 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x02] ++#CHECK: vleg %v0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x02] ++#CHECK: vleg %v0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x02] ++#CHECK: vleg %v0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x02] ++#CHECK: vleg %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x02] ++#CHECK: vleg %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x02] ++#CHECK: vleg %v18, 1383(%r3,%r4), 1 # encoding: [0xe7,0x23,0x45,0x67,0x18,0x02] ++ ++ vleg %v0, 0, 0 ++ vleg %v0, 0, 1 ++ vleg %v0, 4095, 0 ++ vleg %v0, 0(%r15), 0 ++ vleg %v0, 0(%r15,%r1), 0 ++ vleg %v15, 0, 0 ++ vleg %v31, 0, 0 ++ vleg %v18, 1383(%r3,%r4), 1 ++ ++#CHECK: vleh %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x01] ++#CHECK: vleh %v0, 0, 7 # encoding: [0xe7,0x00,0x00,0x00,0x70,0x01] ++#CHECK: vleh %v0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x01] ++#CHECK: vleh %v0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x01] ++#CHECK: vleh %v0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x01] ++#CHECK: vleh %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x01] ++#CHECK: vleh %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x01] ++#CHECK: vleh %v18, 1383(%r3,%r4), 4 # encoding: [0xe7,0x23,0x45,0x67,0x48,0x01] ++ ++ vleh %v0, 0, 0 ++ vleh %v0, 0, 7 ++ vleh %v0, 4095, 0 ++ vleh %v0, 0(%r15), 0 ++ vleh %v0, 0(%r15,%r1), 0 ++ vleh %v15, 0, 0 ++ vleh %v31, 0, 0 ++ vleh %v18, 1383(%r3,%r4), 4 ++ ++#CHECK: vleib %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x40] ++#CHECK: vleib %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x40] ++#CHECK: vleib %v0, -32768, 0 # encoding: [0xe7,0x00,0x80,0x00,0x00,0x40] ++#CHECK: vleib %v0, 32767, 0 # encoding: [0xe7,0x00,0x7f,0xff,0x00,0x40] ++#CHECK: vleib %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x40] ++#CHECK: vleib %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x40] ++#CHECK: vleib %v18, 13398, 11 # encoding: [0xe7,0x20,0x34,0x56,0xb8,0x40] ++ ++ vleib %v0, 0, 0 ++ vleib 
%v0, 0, 15 ++ vleib %v0, -32768, 0 ++ vleib %v0, 32767, 0 ++ vleib %v15, 0, 0 ++ vleib %v31, 0, 0 ++ vleib %v18, 0x3456, 11 ++ ++#CHECK: vleif %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x43] ++#CHECK: vleif %v0, 0, 3 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x43] ++#CHECK: vleif %v0, -32768, 0 # encoding: [0xe7,0x00,0x80,0x00,0x00,0x43] ++#CHECK: vleif %v0, 32767, 0 # encoding: [0xe7,0x00,0x7f,0xff,0x00,0x43] ++#CHECK: vleif %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x43] ++#CHECK: vleif %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x43] ++#CHECK: vleif %v18, 13398, 3 # encoding: [0xe7,0x20,0x34,0x56,0x38,0x43] ++ ++ vleif %v0, 0, 0 ++ vleif %v0, 0, 3 ++ vleif %v0, -32768, 0 ++ vleif %v0, 32767, 0 ++ vleif %v15, 0, 0 ++ vleif %v31, 0, 0 ++ vleif %v18, 0x3456, 3 ++ ++#CHECK: vleig %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x42] ++#CHECK: vleig %v0, 0, 1 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x42] ++#CHECK: vleig %v0, -32768, 0 # encoding: [0xe7,0x00,0x80,0x00,0x00,0x42] ++#CHECK: vleig %v0, 32767, 0 # encoding: [0xe7,0x00,0x7f,0xff,0x00,0x42] ++#CHECK: vleig %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x42] ++#CHECK: vleig %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x42] ++#CHECK: vleig %v18, 13398, 1 # encoding: [0xe7,0x20,0x34,0x56,0x18,0x42] ++ ++ vleig %v0, 0, 0 ++ vleig %v0, 0, 1 ++ vleig %v0, -32768, 0 ++ vleig %v0, 32767, 0 ++ vleig %v15, 0, 0 ++ vleig %v31, 0, 0 ++ vleig %v18, 0x3456, 1 ++ ++#CHECK: vleih %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x41] ++#CHECK: vleih %v0, 0, 7 # encoding: [0xe7,0x00,0x00,0x00,0x70,0x41] ++#CHECK: vleih %v0, -32768, 0 # encoding: [0xe7,0x00,0x80,0x00,0x00,0x41] ++#CHECK: vleih %v0, 32767, 0 # encoding: [0xe7,0x00,0x7f,0xff,0x00,0x41] ++#CHECK: vleih %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x41] ++#CHECK: vleih %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x41] ++#CHECK: vleih %v18, 13398, 7 # encoding: [0xe7,0x20,0x34,0x56,0x78,0x41] ++ ++ vleih %v0, 0, 0 ++ vleih %v0, 0, 7 ++ vleih %v0, -32768, 0 ++ vleih %v0, 32767, 0 ++ vleih %v15, 0, 0 ++ vleih %v31, 0, 0 ++ vleih %v18, 0x3456, 7 ++ ++#CHECK: vlgvb %r0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x21] ++#CHECK: vlgvb %r0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x21] ++#CHECK: vlgvb %r0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x21] ++#CHECK: vlgvb %r0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x21] ++#CHECK: vlgvb %r0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x21] ++#CHECK: vlgvb %r15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x21] ++#CHECK: vlgvb %r2, %v19, 1383(%r4) # encoding: [0xe7,0x23,0x45,0x67,0x04,0x21] ++ ++ vlgvb %r0, %v0, 0 ++ vlgvb %r0, %v0, 4095 ++ vlgvb %r0, %v0, 0(%r15) ++ vlgvb %r0, %v15, 0 ++ vlgvb %r0, %v31, 0 ++ vlgvb %r15, %v0, 0 ++ vlgvb %r2, %v19, 1383(%r4) ++ ++#CHECK: vlgvf %r0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x21] ++#CHECK: vlgvf %r0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x20,0x21] ++#CHECK: vlgvf %r0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x21] ++#CHECK: vlgvf %r0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x21] ++#CHECK: vlgvf %r0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x21] ++#CHECK: vlgvf %r15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x21] ++#CHECK: vlgvf %r2, %v19, 1383(%r4) # encoding: [0xe7,0x23,0x45,0x67,0x24,0x21] ++ ++ vlgvf %r0, %v0, 0 ++ vlgvf %r0, %v0, 4095 ++ vlgvf %r0, %v0, 0(%r15) ++ vlgvf %r0, %v15, 0 ++ vlgvf %r0, %v31, 0 ++ vlgvf %r15, %v0, 0 ++ vlgvf %r2, %v19, 1383(%r4) ++ ++#CHECK: vlgvg %r0, %v0, 0 # encoding: 
[0xe7,0x00,0x00,0x00,0x30,0x21] ++#CHECK: vlgvg %r0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x30,0x21] ++#CHECK: vlgvg %r0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x21] ++#CHECK: vlgvg %r0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x21] ++#CHECK: vlgvg %r0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x21] ++#CHECK: vlgvg %r15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x21] ++#CHECK: vlgvg %r2, %v19, 1383(%r4) # encoding: [0xe7,0x23,0x45,0x67,0x34,0x21] ++ ++ vlgvg %r0, %v0, 0 ++ vlgvg %r0, %v0, 4095 ++ vlgvg %r0, %v0, 0(%r15) ++ vlgvg %r0, %v15, 0 ++ vlgvg %r0, %v31, 0 ++ vlgvg %r15, %v0, 0 ++ vlgvg %r2, %v19, 1383(%r4) ++ ++#CHECK: vlgvh %r0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x21] ++#CHECK: vlgvh %r0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x10,0x21] ++#CHECK: vlgvh %r0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x21] ++#CHECK: vlgvh %r0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x21] ++#CHECK: vlgvh %r0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x21] ++#CHECK: vlgvh %r15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x21] ++#CHECK: vlgvh %r2, %v19, 1383(%r4) # encoding: [0xe7,0x23,0x45,0x67,0x14,0x21] ++ ++ vlgvh %r0, %v0, 0 ++ vlgvh %r0, %v0, 4095 ++ vlgvh %r0, %v0, 0(%r15) ++ vlgvh %r0, %v15, 0 ++ vlgvh %r0, %v31, 0 ++ vlgvh %r15, %v0, 0 ++ vlgvh %r2, %v19, 1383(%r4) ++ ++#CHECK: vll %v0, %r0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x37] ++#CHECK: vll %v0, %r0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x37] ++#CHECK: vll %v0, %r0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x37] ++#CHECK: vll %v0, %r15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x37] ++#CHECK: vll %v15, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x37] ++#CHECK: vll %v31, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x37] ++#CHECK: vll %v18, %r3, 1383(%r4) # encoding: [0xe7,0x23,0x45,0x67,0x08,0x37] ++ ++ vll %v0, %r0, 0 ++ vll %v0, %r0, 4095 ++ vll %v0, %r0, 0(%r15) ++ vll %v0, %r15, 0 ++ vll %v15, %r0, 0 ++ vll %v31, %r0, 0 ++ vll %v18, %r3, 1383(%r4) ++ ++#CHECK: vllezb %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x04] ++#CHECK: vllezb %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x04] ++#CHECK: vllezb %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x04] ++#CHECK: vllezb %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x04] ++#CHECK: vllezb %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x04] ++#CHECK: vllezb %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x04] ++#CHECK: vllezb %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x08,0x04] ++ ++ vllezb %v0, 0 ++ vllezb %v0, 4095 ++ vllezb %v0, 0(%r15) ++ vllezb %v0, 0(%r15,%r1) ++ vllezb %v15, 0 ++ vllezb %v31, 0 ++ vllezb %v18, 0x567(%r3,%r4) ++ ++#CHECK: vllezf %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x04] ++#CHECK: vllezf %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x20,0x04] ++#CHECK: vllezf %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x04] ++#CHECK: vllezf %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x20,0x04] ++#CHECK: vllezf %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x04] ++#CHECK: vllezf %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x04] ++#CHECK: vllezf %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x28,0x04] ++ ++ vllezf %v0, 0 ++ vllezf %v0, 4095 ++ vllezf %v0, 0(%r15) ++ vllezf %v0, 0(%r15,%r1) ++ vllezf %v15, 0 ++ vllezf %v31, 0 ++ vllezf %v18, 0x567(%r3,%r4) ++ ++#CHECK: vllezg %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x04] ++#CHECK: vllezg %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x30,0x04] ++#CHECK: vllezg %v0, 0(%r15) # encoding: 
[0xe7,0x00,0xf0,0x00,0x30,0x04] ++#CHECK: vllezg %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x30,0x04] ++#CHECK: vllezg %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x04] ++#CHECK: vllezg %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x04] ++#CHECK: vllezg %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x38,0x04] ++ ++ vllezg %v0, 0 ++ vllezg %v0, 4095 ++ vllezg %v0, 0(%r15) ++ vllezg %v0, 0(%r15,%r1) ++ vllezg %v15, 0 ++ vllezg %v31, 0 ++ vllezg %v18, 0x567(%r3,%r4) ++ ++#CHECK: vllezh %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x04] ++#CHECK: vllezh %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x10,0x04] ++#CHECK: vllezh %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x04] ++#CHECK: vllezh %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x10,0x04] ++#CHECK: vllezh %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x04] ++#CHECK: vllezh %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x04] ++#CHECK: vllezh %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x18,0x04] ++ ++ vllezh %v0, 0 ++ vllezh %v0, 4095 ++ vllezh %v0, 0(%r15) ++ vllezh %v0, 0(%r15,%r1) ++ vllezh %v15, 0 ++ vllezh %v31, 0 ++ vllezh %v18, 0x567(%r3,%r4) ++ ++#CHECK: vlm %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x36] ++#CHECK: vlm %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x36] ++#CHECK: vlm %v0, %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x36] ++#CHECK: vlm %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x36] ++#CHECK: vlm %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x36] ++#CHECK: vlm %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x04,0x36] ++ ++ vlm %v0, %v0, 0 ++ vlm %v0, %v0, 4095 ++ vlm %v0, %v0, 0(%r15) ++ vlm %v0, %v31, 0 ++ vlm %v31, %v0, 0 ++ vlm %v14, %v17, 1074(%r5) ++ ++#CHECK: vlpb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xdf] ++#CHECK: vlpb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xdf] ++#CHECK: vlpb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xdf] ++#CHECK: vlpb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xdf] ++#CHECK: vlpb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xdf] ++#CHECK: vlpb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0xdf] ++ ++ vlpb %v0, %v0 ++ vlpb %v0, %v15 ++ vlpb %v0, %v31 ++ vlpb %v15, %v0 ++ vlpb %v31, %v0 ++ vlpb %v14, %v17 ++ ++#CHECK: vlpf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xdf] ++#CHECK: vlpf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xdf] ++#CHECK: vlpf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xdf] ++#CHECK: vlpf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xdf] ++#CHECK: vlpf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xdf] ++#CHECK: vlpf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xdf] ++ ++ vlpf %v0, %v0 ++ vlpf %v0, %v15 ++ vlpf %v0, %v31 ++ vlpf %v15, %v0 ++ vlpf %v31, %v0 ++ vlpf %v14, %v17 ++ ++#CHECK: vlpg %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xdf] ++#CHECK: vlpg %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0xdf] ++#CHECK: vlpg %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xdf] ++#CHECK: vlpg %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0xdf] ++#CHECK: vlpg %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xdf] ++#CHECK: vlpg %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0xdf] ++ ++ vlpg %v0, %v0 ++ vlpg %v0, %v15 ++ vlpg %v0, %v31 ++ vlpg %v15, %v0 ++ vlpg %v31, %v0 ++ vlpg %v14, %v17 ++ ++#CHECK: vlph %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xdf] ++#CHECK: vlph %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0xdf] ++#CHECK: vlph %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xdf] ++#CHECK: vlph %v15, 
%v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0xdf] ++#CHECK: vlph %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xdf] ++#CHECK: vlph %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0xdf] ++ ++ vlph %v0, %v0 ++ vlph %v0, %v15 ++ vlph %v0, %v31 ++ vlph %v15, %v0 ++ vlph %v31, %v0 ++ vlph %v14, %v17 ++ ++#CHECK: vlr %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x56] ++#CHECK: vlr %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x56] ++#CHECK: vlr %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x56] ++#CHECK: vlr %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x56] ++#CHECK: vlr %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x56] ++#CHECK: vlr %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0x56] ++ ++ vlr %v0, %v0 ++ vlr %v0, %v15 ++ vlr %v0, %v31 ++ vlr %v15, %v0 ++ vlr %v31, %v0 ++ vlr %v14, %v17 ++ ++#CHECK: vlrepb %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x05] ++#CHECK: vlrepb %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x05] ++#CHECK: vlrepb %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x05] ++#CHECK: vlrepb %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x05] ++#CHECK: vlrepb %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x05] ++#CHECK: vlrepb %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x05] ++#CHECK: vlrepb %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x08,0x05] ++ ++ vlrepb %v0, 0 ++ vlrepb %v0, 4095 ++ vlrepb %v0, 0(%r15) ++ vlrepb %v0, 0(%r15,%r1) ++ vlrepb %v15, 0 ++ vlrepb %v31, 0 ++ vlrepb %v18, 0x567(%r3,%r4) ++ ++#CHECK: vlrepf %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x05] ++#CHECK: vlrepf %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x20,0x05] ++#CHECK: vlrepf %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x05] ++#CHECK: vlrepf %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x20,0x05] ++#CHECK: vlrepf %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x05] ++#CHECK: vlrepf %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x05] ++#CHECK: vlrepf %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x28,0x05] ++ ++ vlrepf %v0, 0 ++ vlrepf %v0, 4095 ++ vlrepf %v0, 0(%r15) ++ vlrepf %v0, 0(%r15,%r1) ++ vlrepf %v15, 0 ++ vlrepf %v31, 0 ++ vlrepf %v18, 0x567(%r3,%r4) ++ ++#CHECK: vlrepg %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x05] ++#CHECK: vlrepg %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x30,0x05] ++#CHECK: vlrepg %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x05] ++#CHECK: vlrepg %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x30,0x05] ++#CHECK: vlrepg %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x05] ++#CHECK: vlrepg %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x05] ++#CHECK: vlrepg %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x38,0x05] ++ ++ vlrepg %v0, 0 ++ vlrepg %v0, 4095 ++ vlrepg %v0, 0(%r15) ++ vlrepg %v0, 0(%r15,%r1) ++ vlrepg %v15, 0 ++ vlrepg %v31, 0 ++ vlrepg %v18, 0x567(%r3,%r4) ++ ++#CHECK: vlreph %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x05] ++#CHECK: vlreph %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x10,0x05] ++#CHECK: vlreph %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x05] ++#CHECK: vlreph %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x10,0x05] ++#CHECK: vlreph %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x05] ++#CHECK: vlreph %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x05] ++#CHECK: vlreph %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x18,0x05] ++ ++ vlreph %v0, 0 ++ vlreph %v0, 4095 ++ vlreph %v0, 0(%r15) ++ vlreph %v0, 0(%r15,%r1) ++ vlreph %v15, 0 ++ vlreph %v31, 0 ++ vlreph %v18, 0x567(%r3,%r4) ++ ++#CHECK: vlvgb %v0, %r0, 0 # encoding: 
[0xe7,0x00,0x00,0x00,0x00,0x22] ++#CHECK: vlvgb %v0, %r0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x22] ++#CHECK: vlvgb %v0, %r0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x22] ++#CHECK: vlvgb %v0, %r15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x22] ++#CHECK: vlvgb %v15, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x22] ++#CHECK: vlvgb %v31, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x22] ++#CHECK: vlvgb %v18, %r3, 1383(%r4) # encoding: [0xe7,0x23,0x45,0x67,0x08,0x22] ++ ++ vlvgb %v0, %r0, 0 ++ vlvgb %v0, %r0, 4095 ++ vlvgb %v0, %r0, 0(%r15) ++ vlvgb %v0, %r15, 0 ++ vlvgb %v15, %r0, 0 ++ vlvgb %v31, %r0, 0 ++ vlvgb %v18, %r3, 1383(%r4) ++ ++#CHECK: vlvgf %v0, %r0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x22] ++#CHECK: vlvgf %v0, %r0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x20,0x22] ++#CHECK: vlvgf %v0, %r0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x22] ++#CHECK: vlvgf %v0, %r15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x22] ++#CHECK: vlvgf %v15, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x22] ++#CHECK: vlvgf %v31, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x22] ++#CHECK: vlvgf %v18, %r3, 1383(%r4) # encoding: [0xe7,0x23,0x45,0x67,0x28,0x22] ++ ++ vlvgf %v0, %r0, 0 ++ vlvgf %v0, %r0, 4095 ++ vlvgf %v0, %r0, 0(%r15) ++ vlvgf %v0, %r15, 0 ++ vlvgf %v15, %r0, 0 ++ vlvgf %v31, %r0, 0 ++ vlvgf %v18, %r3, 1383(%r4) ++ ++#CHECK: vlvgg %v0, %r0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x22] ++#CHECK: vlvgg %v0, %r0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x30,0x22] ++#CHECK: vlvgg %v0, %r0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x22] ++#CHECK: vlvgg %v0, %r15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x22] ++#CHECK: vlvgg %v15, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x22] ++#CHECK: vlvgg %v31, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x22] ++#CHECK: vlvgg %v18, %r3, 1383(%r4) # encoding: [0xe7,0x23,0x45,0x67,0x38,0x22] ++ ++ vlvgg %v0, %r0, 0 ++ vlvgg %v0, %r0, 4095 ++ vlvgg %v0, %r0, 0(%r15) ++ vlvgg %v0, %r15, 0 ++ vlvgg %v15, %r0, 0 ++ vlvgg %v31, %r0, 0 ++ vlvgg %v18, %r3, 1383(%r4) ++ ++#CHECK: vlvgh %v0, %r0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x22] ++#CHECK: vlvgh %v0, %r0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x10,0x22] ++#CHECK: vlvgh %v0, %r0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x22] ++#CHECK: vlvgh %v0, %r15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x22] ++#CHECK: vlvgh %v15, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x22] ++#CHECK: vlvgh %v31, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x22] ++#CHECK: vlvgh %v18, %r3, 1383(%r4) # encoding: [0xe7,0x23,0x45,0x67,0x18,0x22] ++ ++ vlvgh %v0, %r0, 0 ++ vlvgh %v0, %r0, 4095 ++ vlvgh %v0, %r0, 0(%r15) ++ vlvgh %v0, %r15, 0 ++ vlvgh %v15, %r0, 0 ++ vlvgh %v31, %r0, 0 ++ vlvgh %v18, %r3, 1383(%r4) ++ ++#CHECK: vlvgp %v0, %r0, %r0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x62] ++#CHECK: vlvgp %v0, %r0, %r15 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x62] ++#CHECK: vlvgp %v0, %r15, %r0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x62] ++#CHECK: vlvgp %v15, %r0, %r0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x62] ++#CHECK: vlvgp %v31, %r0, %r0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x62] ++#CHECK: vlvgp %v18, %r3, %r4 # encoding: [0xe7,0x23,0x40,0x00,0x08,0x62] ++ ++ vlvgp %v0, %r0, %r0 ++ vlvgp %v0, %r0, %r15 ++ vlvgp %v0, %r15, %r0 ++ vlvgp %v15, %r0, %r0 ++ vlvgp %v31, %r0, %r0 ++ vlvgp %v18, %r3, %r4 ++ ++#CHECK: vmaeb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xae] ++#CHECK: vmaeb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xae] ++#CHECK: vmaeb %v0, %v0, %v31, %v0 # 
encoding: [0xe7,0x00,0xf0,0x00,0x02,0xae] ++#CHECK: vmaeb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xae] ++#CHECK: vmaeb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xae] ++#CHECK: vmaeb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0xae] ++ ++ vmaeb %v0, %v0, %v0, %v0 ++ vmaeb %v0, %v0, %v0, %v31 ++ vmaeb %v0, %v0, %v31, %v0 ++ vmaeb %v0, %v31, %v0, %v0 ++ vmaeb %v31, %v0, %v0, %v0 ++ vmaeb %v13, %v17, %v21, %v25 ++ ++#CHECK: vmaef %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0xae] ++#CHECK: vmaef %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0xae] ++#CHECK: vmaef %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0xae] ++#CHECK: vmaef %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0xae] ++#CHECK: vmaef %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0xae] ++#CHECK: vmaef %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0xae] ++ ++ vmaef %v0, %v0, %v0, %v0 ++ vmaef %v0, %v0, %v0, %v31 ++ vmaef %v0, %v0, %v31, %v0 ++ vmaef %v0, %v31, %v0, %v0 ++ vmaef %v31, %v0, %v0, %v0 ++ vmaef %v13, %v17, %v21, %v25 ++ ++#CHECK: vmaeh %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x01,0x00,0x00,0xae] ++#CHECK: vmaeh %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x01,0x00,0xf1,0xae] ++#CHECK: vmaeh %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf1,0x00,0x02,0xae] ++#CHECK: vmaeh %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x01,0x00,0x04,0xae] ++#CHECK: vmaeh %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x01,0x00,0x08,0xae] ++#CHECK: vmaeh %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x51,0x00,0x97,0xae] ++ ++ vmaeh %v0, %v0, %v0, %v0 ++ vmaeh %v0, %v0, %v0, %v31 ++ vmaeh %v0, %v0, %v31, %v0 ++ vmaeh %v0, %v31, %v0, %v0 ++ vmaeh %v31, %v0, %v0, %v0 ++ vmaeh %v13, %v17, %v21, %v25 ++ ++#CHECK: vmahb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xab] ++#CHECK: vmahb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xab] ++#CHECK: vmahb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xab] ++#CHECK: vmahb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xab] ++#CHECK: vmahb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xab] ++#CHECK: vmahb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0xab] ++ ++ vmahb %v0, %v0, %v0, %v0 ++ vmahb %v0, %v0, %v0, %v31 ++ vmahb %v0, %v0, %v31, %v0 ++ vmahb %v0, %v31, %v0, %v0 ++ vmahb %v31, %v0, %v0, %v0 ++ vmahb %v13, %v17, %v21, %v25 ++ ++#CHECK: vmahf %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0xab] ++#CHECK: vmahf %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0xab] ++#CHECK: vmahf %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0xab] ++#CHECK: vmahf %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0xab] ++#CHECK: vmahf %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0xab] ++#CHECK: vmahf %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0xab] ++ ++ vmahf %v0, %v0, %v0, %v0 ++ vmahf %v0, %v0, %v0, %v31 ++ vmahf %v0, %v0, %v31, %v0 ++ vmahf %v0, %v31, %v0, %v0 ++ vmahf %v31, %v0, %v0, %v0 ++ vmahf %v13, %v17, %v21, %v25 ++ ++#CHECK: vmahh %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x01,0x00,0x00,0xab] ++#CHECK: vmahh %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x01,0x00,0xf1,0xab] ++#CHECK: vmahh %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf1,0x00,0x02,0xab] ++#CHECK: vmahh %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x01,0x00,0x04,0xab] ++#CHECK: vmahh %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x01,0x00,0x08,0xab] ++#CHECK: vmahh %v13, %v17, 
%v21, %v25 # encoding: [0xe7,0xd1,0x51,0x00,0x97,0xab] ++ ++ vmahh %v0, %v0, %v0, %v0 ++ vmahh %v0, %v0, %v0, %v31 ++ vmahh %v0, %v0, %v31, %v0 ++ vmahh %v0, %v31, %v0, %v0 ++ vmahh %v31, %v0, %v0, %v0 ++ vmahh %v13, %v17, %v21, %v25 ++ ++#CHECK: vmalb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xaa] ++#CHECK: vmalb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xaa] ++#CHECK: vmalb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xaa] ++#CHECK: vmalb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xaa] ++#CHECK: vmalb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xaa] ++#CHECK: vmalb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0xaa] ++ ++ vmalb %v0, %v0, %v0, %v0 ++ vmalb %v0, %v0, %v0, %v31 ++ vmalb %v0, %v0, %v31, %v0 ++ vmalb %v0, %v31, %v0, %v0 ++ vmalb %v31, %v0, %v0, %v0 ++ vmalb %v13, %v17, %v21, %v25 ++ ++#CHECK: vmaleb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xac] ++#CHECK: vmaleb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xac] ++#CHECK: vmaleb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xac] ++#CHECK: vmaleb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xac] ++#CHECK: vmaleb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xac] ++#CHECK: vmaleb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0xac] ++ ++ vmaleb %v0, %v0, %v0, %v0 ++ vmaleb %v0, %v0, %v0, %v31 ++ vmaleb %v0, %v0, %v31, %v0 ++ vmaleb %v0, %v31, %v0, %v0 ++ vmaleb %v31, %v0, %v0, %v0 ++ vmaleb %v13, %v17, %v21, %v25 ++ ++#CHECK: vmalef %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0xac] ++#CHECK: vmalef %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0xac] ++#CHECK: vmalef %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0xac] ++#CHECK: vmalef %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0xac] ++#CHECK: vmalef %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0xac] ++#CHECK: vmalef %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0xac] ++ ++ vmalef %v0, %v0, %v0, %v0 ++ vmalef %v0, %v0, %v0, %v31 ++ vmalef %v0, %v0, %v31, %v0 ++ vmalef %v0, %v31, %v0, %v0 ++ vmalef %v31, %v0, %v0, %v0 ++ vmalef %v13, %v17, %v21, %v25 ++ ++#CHECK: vmaleh %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x01,0x00,0x00,0xac] ++#CHECK: vmaleh %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x01,0x00,0xf1,0xac] ++#CHECK: vmaleh %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf1,0x00,0x02,0xac] ++#CHECK: vmaleh %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x01,0x00,0x04,0xac] ++#CHECK: vmaleh %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x01,0x00,0x08,0xac] ++#CHECK: vmaleh %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x51,0x00,0x97,0xac] ++ ++ vmaleh %v0, %v0, %v0, %v0 ++ vmaleh %v0, %v0, %v0, %v31 ++ vmaleh %v0, %v0, %v31, %v0 ++ vmaleh %v0, %v31, %v0, %v0 ++ vmaleh %v31, %v0, %v0, %v0 ++ vmaleh %v13, %v17, %v21, %v25 ++ ++#CHECK: vmalf %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0xaa] ++#CHECK: vmalf %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0xaa] ++#CHECK: vmalf %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0xaa] ++#CHECK: vmalf %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0xaa] ++#CHECK: vmalf %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0xaa] ++#CHECK: vmalf %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0xaa] ++ ++ vmalf %v0, %v0, %v0, %v0 ++ vmalf %v0, %v0, %v0, %v31 ++ vmalf %v0, %v0, %v31, %v0 ++ vmalf %v0, %v31, %v0, %v0 ++ vmalf %v31, %v0, %v0, 
%v0 ++ vmalf %v13, %v17, %v21, %v25 ++ ++#CHECK: vmalhb %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xa9] ++#CHECK: vmalhb %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xa9] ++#CHECK: vmalhb %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xa9] ++#CHECK: vmalhb %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xa9] ++#CHECK: vmalhb %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xa9] ++#CHECK: vmalhb %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0xa9] ++ ++ vmalhb %v0, %v0, %v0, %v0 ++ vmalhb %v0, %v0, %v0, %v31 ++ vmalhb %v0, %v0, %v31, %v0 ++ vmalhb %v0, %v31, %v0, %v0 ++ vmalhb %v31, %v0, %v0, %v0 ++ vmalhb %v13, %v17, %v21, %v25 ++ ++#CHECK: vmalhf %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0xa9] ++#CHECK: vmalhf %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0xa9] ++#CHECK: vmalhf %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0xa9] ++#CHECK: vmalhf %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0xa9] ++#CHECK: vmalhf %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0xa9] ++#CHECK: vmalhf %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0xa9] ++ ++ vmalhf %v0, %v0, %v0, %v0 ++ vmalhf %v0, %v0, %v0, %v31 ++ vmalhf %v0, %v0, %v31, %v0 ++ vmalhf %v0, %v31, %v0, %v0 ++ vmalhf %v31, %v0, %v0, %v0 ++ vmalhf %v13, %v17, %v21, %v25 ++ ++#CHECK: vmalhh %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x01,0x00,0x00,0xa9] ++#CHECK: vmalhh %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x01,0x00,0xf1,0xa9] ++#CHECK: vmalhh %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf1,0x00,0x02,0xa9] ++#CHECK: vmalhh %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x01,0x00,0x04,0xa9] ++#CHECK: vmalhh %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x01,0x00,0x08,0xa9] ++#CHECK: vmalhh %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x51,0x00,0x97,0xa9] ++ ++ vmalhh %v0, %v0, %v0, %v0 ++ vmalhh %v0, %v0, %v0, %v31 ++ vmalhh %v0, %v0, %v31, %v0 ++ vmalhh %v0, %v31, %v0, %v0 ++ vmalhh %v31, %v0, %v0, %v0 ++ vmalhh %v13, %v17, %v21, %v25 ++ ++#CHECK: vmalhw %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x01,0x00,0x00,0xaa] ++#CHECK: vmalhw %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x01,0x00,0xf1,0xaa] ++#CHECK: vmalhw %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf1,0x00,0x02,0xaa] ++#CHECK: vmalhw %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x01,0x00,0x04,0xaa] ++#CHECK: vmalhw %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x01,0x00,0x08,0xaa] ++#CHECK: vmalhw %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x51,0x00,0x97,0xaa] ++ ++ vmalhw %v0, %v0, %v0, %v0 ++ vmalhw %v0, %v0, %v0, %v31 ++ vmalhw %v0, %v0, %v31, %v0 ++ vmalhw %v0, %v31, %v0, %v0 ++ vmalhw %v31, %v0, %v0, %v0 ++ vmalhw %v13, %v17, %v21, %v25 ++ ++#CHECK: vmalob %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xad] ++#CHECK: vmalob %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xad] ++#CHECK: vmalob %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xad] ++#CHECK: vmalob %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xad] ++#CHECK: vmalob %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xad] ++#CHECK: vmalob %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0xad] ++ ++ vmalob %v0, %v0, %v0, %v0 ++ vmalob %v0, %v0, %v0, %v31 ++ vmalob %v0, %v0, %v31, %v0 ++ vmalob %v0, %v31, %v0, %v0 ++ vmalob %v31, %v0, %v0, %v0 ++ vmalob %v13, %v17, %v21, %v25 ++ ++#CHECK: vmalof %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0xad] ++#CHECK: vmalof %v0, %v0, %v0, %v31 # encoding: 
[0xe7,0x00,0x02,0x00,0xf1,0xad] ++#CHECK: vmalof %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0xad] ++#CHECK: vmalof %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0xad] ++#CHECK: vmalof %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0xad] ++#CHECK: vmalof %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0xad] ++ ++ vmalof %v0, %v0, %v0, %v0 ++ vmalof %v0, %v0, %v0, %v31 ++ vmalof %v0, %v0, %v31, %v0 ++ vmalof %v0, %v31, %v0, %v0 ++ vmalof %v31, %v0, %v0, %v0 ++ vmalof %v13, %v17, %v21, %v25 ++ ++#CHECK: vmaloh %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x01,0x00,0x00,0xad] ++#CHECK: vmaloh %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x01,0x00,0xf1,0xad] ++#CHECK: vmaloh %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf1,0x00,0x02,0xad] ++#CHECK: vmaloh %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x01,0x00,0x04,0xad] ++#CHECK: vmaloh %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x01,0x00,0x08,0xad] ++#CHECK: vmaloh %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x51,0x00,0x97,0xad] ++ ++ vmaloh %v0, %v0, %v0, %v0 ++ vmaloh %v0, %v0, %v0, %v31 ++ vmaloh %v0, %v0, %v31, %v0 ++ vmaloh %v0, %v31, %v0, %v0 ++ vmaloh %v31, %v0, %v0, %v0 ++ vmaloh %v13, %v17, %v21, %v25 ++ ++#CHECK: vmaob %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xaf] ++#CHECK: vmaob %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0xaf] ++#CHECK: vmaob %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xaf] ++#CHECK: vmaob %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xaf] ++#CHECK: vmaob %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xaf] ++#CHECK: vmaob %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0xaf] ++ ++ vmaob %v0, %v0, %v0, %v0 ++ vmaob %v0, %v0, %v0, %v31 ++ vmaob %v0, %v0, %v31, %v0 ++ vmaob %v0, %v31, %v0, %v0 ++ vmaob %v31, %v0, %v0, %v0 ++ vmaob %v13, %v17, %v21, %v25 ++ ++#CHECK: vmaof %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0xaf] ++#CHECK: vmaof %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0xaf] ++#CHECK: vmaof %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0xaf] ++#CHECK: vmaof %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0xaf] ++#CHECK: vmaof %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0xaf] ++#CHECK: vmaof %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x52,0x00,0x97,0xaf] ++ ++ vmaof %v0, %v0, %v0, %v0 ++ vmaof %v0, %v0, %v0, %v31 ++ vmaof %v0, %v0, %v31, %v0 ++ vmaof %v0, %v31, %v0, %v0 ++ vmaof %v31, %v0, %v0, %v0 ++ vmaof %v13, %v17, %v21, %v25 ++ ++#CHECK: vmaoh %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x01,0x00,0x00,0xaf] ++#CHECK: vmaoh %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x01,0x00,0xf1,0xaf] ++#CHECK: vmaoh %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf1,0x00,0x02,0xaf] ++#CHECK: vmaoh %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x01,0x00,0x04,0xaf] ++#CHECK: vmaoh %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x01,0x00,0x08,0xaf] ++#CHECK: vmaoh %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x51,0x00,0x97,0xaf] ++ ++ vmaoh %v0, %v0, %v0, %v0 ++ vmaoh %v0, %v0, %v0, %v31 ++ vmaoh %v0, %v0, %v31, %v0 ++ vmaoh %v0, %v31, %v0, %v0 ++ vmaoh %v31, %v0, %v0, %v0 ++ vmaoh %v13, %v17, %v21, %v25 ++ ++#CHECK: vmeb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xa6] ++#CHECK: vmeb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xa6] ++#CHECK: vmeb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xa6] ++#CHECK: vmeb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xa6] ++#CHECK: vmeb %v18, %v3, %v20 # encoding: 
[0xe7,0x23,0x40,0x00,0x0a,0xa6] ++ ++ vmeb %v0, %v0, %v0 ++ vmeb %v0, %v0, %v31 ++ vmeb %v0, %v31, %v0 ++ vmeb %v31, %v0, %v0 ++ vmeb %v18, %v3, %v20 ++ ++#CHECK: vmef %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xa6] ++#CHECK: vmef %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xa6] ++#CHECK: vmef %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xa6] ++#CHECK: vmef %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xa6] ++#CHECK: vmef %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xa6] ++ ++ vmef %v0, %v0, %v0 ++ vmef %v0, %v0, %v31 ++ vmef %v0, %v31, %v0 ++ vmef %v31, %v0, %v0 ++ vmef %v18, %v3, %v20 ++ ++#CHECK: vmeh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xa6] ++#CHECK: vmeh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xa6] ++#CHECK: vmeh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xa6] ++#CHECK: vmeh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xa6] ++#CHECK: vmeh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xa6] ++ ++ vmeh %v0, %v0, %v0 ++ vmeh %v0, %v0, %v31 ++ vmeh %v0, %v31, %v0 ++ vmeh %v31, %v0, %v0 ++ vmeh %v18, %v3, %v20 ++ ++#CHECK: vmhb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xa3] ++#CHECK: vmhb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xa3] ++#CHECK: vmhb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xa3] ++#CHECK: vmhb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xa3] ++#CHECK: vmhb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xa3] ++ ++ vmhb %v0, %v0, %v0 ++ vmhb %v0, %v0, %v31 ++ vmhb %v0, %v31, %v0 ++ vmhb %v31, %v0, %v0 ++ vmhb %v18, %v3, %v20 ++ ++#CHECK: vmhf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xa3] ++#CHECK: vmhf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xa3] ++#CHECK: vmhf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xa3] ++#CHECK: vmhf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xa3] ++#CHECK: vmhf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xa3] ++ ++ vmhf %v0, %v0, %v0 ++ vmhf %v0, %v0, %v31 ++ vmhf %v0, %v31, %v0 ++ vmhf %v31, %v0, %v0 ++ vmhf %v18, %v3, %v20 ++ ++#CHECK: vmhh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xa3] ++#CHECK: vmhh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xa3] ++#CHECK: vmhh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xa3] ++#CHECK: vmhh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xa3] ++#CHECK: vmhh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xa3] ++ ++ vmhh %v0, %v0, %v0 ++ vmhh %v0, %v0, %v31 ++ vmhh %v0, %v31, %v0 ++ vmhh %v31, %v0, %v0 ++ vmhh %v18, %v3, %v20 ++ ++#CHECK: vmlb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xa2] ++#CHECK: vmlb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xa2] ++#CHECK: vmlb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xa2] ++#CHECK: vmlb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xa2] ++#CHECK: vmlb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xa2] ++ ++ vmlb %v0, %v0, %v0 ++ vmlb %v0, %v0, %v31 ++ vmlb %v0, %v31, %v0 ++ vmlb %v31, %v0, %v0 ++ vmlb %v18, %v3, %v20 ++ ++#CHECK: vmleb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xa4] ++#CHECK: vmleb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xa4] ++#CHECK: vmleb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xa4] ++#CHECK: vmleb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xa4] ++#CHECK: vmleb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xa4] ++ ++ vmleb %v0, %v0, %v0 ++ vmleb %v0, %v0, %v31 ++ vmleb %v0, %v31, %v0 ++ 
vmleb %v31, %v0, %v0 ++ vmleb %v18, %v3, %v20 ++ ++#CHECK: vmlef %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xa4] ++#CHECK: vmlef %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xa4] ++#CHECK: vmlef %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xa4] ++#CHECK: vmlef %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xa4] ++#CHECK: vmlef %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xa4] ++ ++ vmlef %v0, %v0, %v0 ++ vmlef %v0, %v0, %v31 ++ vmlef %v0, %v31, %v0 ++ vmlef %v31, %v0, %v0 ++ vmlef %v18, %v3, %v20 ++ ++#CHECK: vmleh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xa4] ++#CHECK: vmleh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xa4] ++#CHECK: vmleh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xa4] ++#CHECK: vmleh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xa4] ++#CHECK: vmleh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xa4] ++ ++ vmleh %v0, %v0, %v0 ++ vmleh %v0, %v0, %v31 ++ vmleh %v0, %v31, %v0 ++ vmleh %v31, %v0, %v0 ++ vmleh %v18, %v3, %v20 ++ ++#CHECK: vmlf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xa2] ++#CHECK: vmlf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xa2] ++#CHECK: vmlf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xa2] ++#CHECK: vmlf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xa2] ++#CHECK: vmlf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xa2] ++ ++ vmlf %v0, %v0, %v0 ++ vmlf %v0, %v0, %v31 ++ vmlf %v0, %v31, %v0 ++ vmlf %v31, %v0, %v0 ++ vmlf %v18, %v3, %v20 ++ ++#CHECK: vmlhb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xa1] ++#CHECK: vmlhb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xa1] ++#CHECK: vmlhb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xa1] ++#CHECK: vmlhb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xa1] ++#CHECK: vmlhb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xa1] ++ ++ vmlhb %v0, %v0, %v0 ++ vmlhb %v0, %v0, %v31 ++ vmlhb %v0, %v31, %v0 ++ vmlhb %v31, %v0, %v0 ++ vmlhb %v18, %v3, %v20 ++ ++#CHECK: vmlhf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xa1] ++#CHECK: vmlhf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xa1] ++#CHECK: vmlhf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xa1] ++#CHECK: vmlhf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xa1] ++#CHECK: vmlhf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xa1] ++ ++ vmlhf %v0, %v0, %v0 ++ vmlhf %v0, %v0, %v31 ++ vmlhf %v0, %v31, %v0 ++ vmlhf %v31, %v0, %v0 ++ vmlhf %v18, %v3, %v20 ++ ++#CHECK: vmlhh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xa1] ++#CHECK: vmlhh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xa1] ++#CHECK: vmlhh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xa1] ++#CHECK: vmlhh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xa1] ++#CHECK: vmlhh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xa1] ++ ++ vmlhh %v0, %v0, %v0 ++ vmlhh %v0, %v0, %v31 ++ vmlhh %v0, %v31, %v0 ++ vmlhh %v31, %v0, %v0 ++ vmlhh %v18, %v3, %v20 ++ ++#CHECK: vmlhw %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xa2] ++#CHECK: vmlhw %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xa2] ++#CHECK: vmlhw %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xa2] ++#CHECK: vmlhw %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xa2] ++#CHECK: vmlhw %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xa2] ++ ++ vmlhw %v0, %v0, %v0 ++ vmlhw %v0, %v0, %v31 ++ vmlhw %v0, %v31, %v0 ++ vmlhw %v31, %v0, %v0 ++ vmlhw %v18, %v3, %v20 ++ 
++#CHECK: vmlob %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xa5] ++#CHECK: vmlob %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xa5] ++#CHECK: vmlob %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xa5] ++#CHECK: vmlob %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xa5] ++#CHECK: vmlob %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xa5] ++ ++ vmlob %v0, %v0, %v0 ++ vmlob %v0, %v0, %v31 ++ vmlob %v0, %v31, %v0 ++ vmlob %v31, %v0, %v0 ++ vmlob %v18, %v3, %v20 ++ ++#CHECK: vmlof %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xa5] ++#CHECK: vmlof %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xa5] ++#CHECK: vmlof %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xa5] ++#CHECK: vmlof %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xa5] ++#CHECK: vmlof %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xa5] ++ ++ vmlof %v0, %v0, %v0 ++ vmlof %v0, %v0, %v31 ++ vmlof %v0, %v31, %v0 ++ vmlof %v31, %v0, %v0 ++ vmlof %v18, %v3, %v20 ++ ++#CHECK: vmloh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xa5] ++#CHECK: vmloh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xa5] ++#CHECK: vmloh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xa5] ++#CHECK: vmloh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xa5] ++#CHECK: vmloh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xa5] ++ ++ vmloh %v0, %v0, %v0 ++ vmloh %v0, %v0, %v31 ++ vmloh %v0, %v31, %v0 ++ vmloh %v31, %v0, %v0 ++ vmloh %v18, %v3, %v20 ++ ++#CHECK: vmnb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xfe] ++#CHECK: vmnb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xfe] ++#CHECK: vmnb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xfe] ++#CHECK: vmnb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xfe] ++#CHECK: vmnb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xfe] ++ ++ vmnb %v0, %v0, %v0 ++ vmnb %v0, %v0, %v31 ++ vmnb %v0, %v31, %v0 ++ vmnb %v31, %v0, %v0 ++ vmnb %v18, %v3, %v20 ++ ++#CHECK: vmnf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xfe] ++#CHECK: vmnf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xfe] ++#CHECK: vmnf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xfe] ++#CHECK: vmnf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xfe] ++#CHECK: vmnf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xfe] ++ ++ vmnf %v0, %v0, %v0 ++ vmnf %v0, %v0, %v31 ++ vmnf %v0, %v31, %v0 ++ vmnf %v31, %v0, %v0 ++ vmnf %v18, %v3, %v20 ++ ++#CHECK: vmng %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xfe] ++#CHECK: vmng %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xfe] ++#CHECK: vmng %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xfe] ++#CHECK: vmng %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xfe] ++#CHECK: vmng %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xfe] ++ ++ vmng %v0, %v0, %v0 ++ vmng %v0, %v0, %v31 ++ vmng %v0, %v31, %v0 ++ vmng %v31, %v0, %v0 ++ vmng %v18, %v3, %v20 ++ ++#CHECK: vmnh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xfe] ++#CHECK: vmnh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xfe] ++#CHECK: vmnh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xfe] ++#CHECK: vmnh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xfe] ++#CHECK: vmnh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xfe] ++ ++ vmnh %v0, %v0, %v0 ++ vmnh %v0, %v0, %v31 ++ vmnh %v0, %v31, %v0 ++ vmnh %v31, %v0, %v0 ++ vmnh %v18, %v3, %v20 ++ ++#CHECK: vmnlb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xfc] ++#CHECK: 
vmnlb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xfc] ++#CHECK: vmnlb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xfc] ++#CHECK: vmnlb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xfc] ++#CHECK: vmnlb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xfc] ++ ++ vmnlb %v0, %v0, %v0 ++ vmnlb %v0, %v0, %v31 ++ vmnlb %v0, %v31, %v0 ++ vmnlb %v31, %v0, %v0 ++ vmnlb %v18, %v3, %v20 ++ ++#CHECK: vmnlf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xfc] ++#CHECK: vmnlf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xfc] ++#CHECK: vmnlf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xfc] ++#CHECK: vmnlf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xfc] ++#CHECK: vmnlf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xfc] ++ ++ vmnlf %v0, %v0, %v0 ++ vmnlf %v0, %v0, %v31 ++ vmnlf %v0, %v31, %v0 ++ vmnlf %v31, %v0, %v0 ++ vmnlf %v18, %v3, %v20 ++ ++#CHECK: vmnlg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xfc] ++#CHECK: vmnlg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xfc] ++#CHECK: vmnlg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xfc] ++#CHECK: vmnlg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xfc] ++#CHECK: vmnlg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xfc] ++ ++ vmnlg %v0, %v0, %v0 ++ vmnlg %v0, %v0, %v31 ++ vmnlg %v0, %v31, %v0 ++ vmnlg %v31, %v0, %v0 ++ vmnlg %v18, %v3, %v20 ++ ++#CHECK: vmnlh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xfc] ++#CHECK: vmnlh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xfc] ++#CHECK: vmnlh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xfc] ++#CHECK: vmnlh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xfc] ++#CHECK: vmnlh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xfc] ++ ++ vmnlh %v0, %v0, %v0 ++ vmnlh %v0, %v0, %v31 ++ vmnlh %v0, %v31, %v0 ++ vmnlh %v31, %v0, %v0 ++ vmnlh %v18, %v3, %v20 ++ ++#CHECK: vmob %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xa7] ++#CHECK: vmob %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xa7] ++#CHECK: vmob %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xa7] ++#CHECK: vmob %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xa7] ++#CHECK: vmob %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xa7] ++ ++ vmob %v0, %v0, %v0 ++ vmob %v0, %v0, %v31 ++ vmob %v0, %v31, %v0 ++ vmob %v31, %v0, %v0 ++ vmob %v18, %v3, %v20 ++ ++#CHECK: vmof %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xa7] ++#CHECK: vmof %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xa7] ++#CHECK: vmof %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xa7] ++#CHECK: vmof %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xa7] ++#CHECK: vmof %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xa7] ++ ++ vmof %v0, %v0, %v0 ++ vmof %v0, %v0, %v31 ++ vmof %v0, %v31, %v0 ++ vmof %v31, %v0, %v0 ++ vmof %v18, %v3, %v20 ++ ++#CHECK: vmoh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xa7] ++#CHECK: vmoh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xa7] ++#CHECK: vmoh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xa7] ++#CHECK: vmoh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xa7] ++#CHECK: vmoh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xa7] ++ ++ vmoh %v0, %v0, %v0 ++ vmoh %v0, %v0, %v31 ++ vmoh %v0, %v31, %v0 ++ vmoh %v31, %v0, %v0 ++ vmoh %v18, %v3, %v20 ++ ++#CHECK: vmrhb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x61] ++#CHECK: vmrhb %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x61] ++#CHECK: 
vmrhb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x61]
++#CHECK: vmrhb %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x61]
++#CHECK: vmrhb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x61]
++#CHECK: vmrhb %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x61]
++#CHECK: vmrhb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x61]
++#CHECK: vmrhb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x61]
++
++ vmrhb %v0, %v0, %v0
++ vmrhb %v0, %v0, %v15
++ vmrhb %v0, %v0, %v31
++ vmrhb %v0, %v15, %v0
++ vmrhb %v0, %v31, %v0
++ vmrhb %v15, %v0, %v0
++ vmrhb %v31, %v0, %v0
++ vmrhb %v18, %v3, %v20
++
++#CHECK: vmrhf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x61]
++#CHECK: vmrhf %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x61]
++#CHECK: vmrhf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x61]
++#CHECK: vmrhf %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x61]
++#CHECK: vmrhf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x61]
++#CHECK: vmrhf %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x61]
++#CHECK: vmrhf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x61]
++#CHECK: vmrhf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x61]
++
++ vmrhf %v0, %v0, %v0
++ vmrhf %v0, %v0, %v15
++ vmrhf %v0, %v0, %v31
++ vmrhf %v0, %v15, %v0
++ vmrhf %v0, %v31, %v0
++ vmrhf %v15, %v0, %v0
++ vmrhf %v31, %v0, %v0
++ vmrhf %v18, %v3, %v20
++
++#CHECK: vmrhg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x61]
++#CHECK: vmrhg %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x61]
++#CHECK: vmrhg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x61]
++#CHECK: vmrhg %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x61]
++#CHECK: vmrhg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x61]
++#CHECK: vmrhg %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x61]
++#CHECK: vmrhg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x61]
++#CHECK: vmrhg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x61]
++
++ vmrhg %v0, %v0, %v0
++ vmrhg %v0, %v0, %v15
++ vmrhg %v0, %v0, %v31
++ vmrhg %v0, %v15, %v0
++ vmrhg %v0, %v31, %v0
++ vmrhg %v15, %v0, %v0
++ vmrhg %v31, %v0, %v0
++ vmrhg %v18, %v3, %v20
++
++#CHECK: vmrhh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x61]
++#CHECK: vmrhh %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x61]
++#CHECK: vmrhh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x61]
++#CHECK: vmrhh %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x61]
++#CHECK: vmrhh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x61]
++#CHECK: vmrhh %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x61]
++#CHECK: vmrhh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x61]
++#CHECK: vmrhh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x61]
++
++ vmrhh %v0, %v0, %v0
++ vmrhh %v0, %v0, %v15
++ vmrhh %v0, %v0, %v31
++ vmrhh %v0, %v15, %v0
++ vmrhh %v0, %v31, %v0
++ vmrhh %v15, %v0, %v0
++ vmrhh %v31, %v0, %v0
++ vmrhh %v18, %v3, %v20
++
++#CHECK: vmrlb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x60]
++#CHECK: vmrlb %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x60]
++#CHECK: vmrlb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x60]
++#CHECK: vmrlb %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x60]
++#CHECK: vmrlb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x60]
++#CHECK: vmrlb %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x60]
++#CHECK: vmrlb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x60]
++#CHECK: vmrlb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x60]
++
++ vmrlb %v0, %v0, %v0
++ vmrlb %v0, %v0, %v15
++ vmrlb %v0, %v0, %v31
++ vmrlb %v0, %v15, %v0
++ vmrlb %v0, %v31, %v0
++ vmrlb %v15, %v0, %v0
++ vmrlb %v31, %v0, %v0
++ vmrlb %v18, %v3, %v20
++
++#CHECK: vmrlf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x60]
++#CHECK: vmrlf %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x60]
++#CHECK: vmrlf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x60]
++#CHECK: vmrlf %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x60]
++#CHECK: vmrlf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x60]
++#CHECK: vmrlf %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x60]
++#CHECK: vmrlf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x60]
++#CHECK: vmrlf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x60]
++
++ vmrlf %v0, %v0, %v0
++ vmrlf %v0, %v0, %v15
++ vmrlf %v0, %v0, %v31
++ vmrlf %v0, %v15, %v0
++ vmrlf %v0, %v31, %v0
++ vmrlf %v15, %v0, %v0
++ vmrlf %v31, %v0, %v0
++ vmrlf %v18, %v3, %v20
++
++#CHECK: vmrlg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x60]
++#CHECK: vmrlg %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x60]
++#CHECK: vmrlg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x60]
++#CHECK: vmrlg %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x60]
++#CHECK: vmrlg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x60]
++#CHECK: vmrlg %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x60]
++#CHECK: vmrlg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x60]
++#CHECK: vmrlg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x60]
++
++ vmrlg %v0, %v0, %v0
++ vmrlg %v0, %v0, %v15
++ vmrlg %v0, %v0, %v31
++ vmrlg %v0, %v15, %v0
++ vmrlg %v0, %v31, %v0
++ vmrlg %v15, %v0, %v0
++ vmrlg %v31, %v0, %v0
++ vmrlg %v18, %v3, %v20
++
++#CHECK: vmrlh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x60]
++#CHECK: vmrlh %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x60]
++#CHECK: vmrlh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x60]
++#CHECK: vmrlh %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x60]
++#CHECK: vmrlh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x60]
++#CHECK: vmrlh %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x60]
++#CHECK: vmrlh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x60]
++#CHECK: vmrlh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x60]
++
++ vmrlh %v0, %v0, %v0
++ vmrlh %v0, %v0, %v15
++ vmrlh %v0, %v0, %v31
++ vmrlh %v0, %v15, %v0
++ vmrlh %v0, %v31, %v0
++ vmrlh %v15, %v0, %v0
++ vmrlh %v31, %v0, %v0
++ vmrlh %v18, %v3, %v20
++
++#CHECK: vmxb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xff]
++#CHECK: vmxb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xff]
++#CHECK: vmxb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xff]
++#CHECK: vmxb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xff]
++#CHECK: vmxb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xff]
++
++ vmxb %v0, %v0, %v0
++ vmxb %v0, %v0, %v31
++ vmxb %v0, %v31, %v0
++ vmxb %v31, %v0, %v0
++ vmxb %v18, %v3, %v20
++
++#CHECK: vmxf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xff]
++#CHECK: vmxf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xff]
++#CHECK: vmxf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xff]
++#CHECK: vmxf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xff]
++#CHECK: vmxf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xff]
++
++ vmxf %v0, %v0, %v0
++ vmxf %v0, %v0, %v31
++ vmxf %v0, %v31, %v0
++ vmxf %v31, %v0, %v0
++ vmxf %v18, %v3, %v20
++
++#CHECK: vmxg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xff]
++#CHECK: vmxg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xff]
++#CHECK: vmxg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xff]
++#CHECK: vmxg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xff]
++#CHECK: vmxg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xff]
++
++ vmxg %v0, %v0, %v0
++ vmxg %v0, %v0, %v31
++ vmxg %v0, %v31, %v0
++ vmxg %v31, %v0, %v0
++ vmxg %v18, %v3, %v20
++
++#CHECK: vmxh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xff]
++#CHECK: vmxh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xff]
++#CHECK: vmxh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xff]
++#CHECK: vmxh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xff]
++#CHECK: vmxh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xff]
++
++ vmxh %v0, %v0, %v0
++ vmxh %v0, %v0, %v31
++ vmxh %v0, %v31, %v0
++ vmxh %v31, %v0, %v0
++ vmxh %v18, %v3, %v20
++
++#CHECK: vmxlb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xfd]
++#CHECK: vmxlb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xfd]
++#CHECK: vmxlb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xfd]
++#CHECK: vmxlb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xfd]
++#CHECK: vmxlb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xfd]
++
++ vmxlb %v0, %v0, %v0
++ vmxlb %v0, %v0, %v31
++ vmxlb %v0, %v31, %v0
++ vmxlb %v31, %v0, %v0
++ vmxlb %v18, %v3, %v20
++
++#CHECK: vmxlf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xfd]
++#CHECK: vmxlf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xfd]
++#CHECK: vmxlf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xfd]
++#CHECK: vmxlf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xfd]
++#CHECK: vmxlf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xfd]
++
++ vmxlf %v0, %v0, %v0
++ vmxlf %v0, %v0, %v31
++ vmxlf %v0, %v31, %v0
++ vmxlf %v31, %v0, %v0
++ vmxlf %v18, %v3, %v20
++
++#CHECK: vmxlg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xfd]
++#CHECK: vmxlg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xfd]
++#CHECK: vmxlg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xfd]
++#CHECK: vmxlg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xfd]
++#CHECK: vmxlg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xfd]
++
++ vmxlg %v0, %v0, %v0
++ vmxlg %v0, %v0, %v31
++ vmxlg %v0, %v31, %v0
++ vmxlg %v31, %v0, %v0
++ vmxlg %v18, %v3, %v20
++
++#CHECK: vmxlh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xfd]
++#CHECK: vmxlh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xfd]
++#CHECK: vmxlh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xfd]
++#CHECK: vmxlh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xfd]
++#CHECK: vmxlh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xfd]
++
++ vmxlh %v0, %v0, %v0
++ vmxlh %v0, %v0, %v31
++ vmxlh %v0, %v31, %v0
++ vmxlh %v31, %v0, %v0
++ vmxlh %v18, %v3, %v20
++
++#CHECK: vn %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x68]
++#CHECK: vn %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x68]
++#CHECK: vn %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x68]
++#CHECK: vn %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x68]
++#CHECK: vn %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x68]
++
++ vn %v0, %v0, %v0
++ vn %v0, %v0, %v31
++ vn %v0, %v31, %v0
++ vn %v31, %v0, %v0
++ vn %v18, %v3, %v20
++
++#CHECK: vnc %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x69]
++#CHECK: vnc %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x69]
++#CHECK: vnc %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x69]
++#CHECK: vnc %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x69]
++#CHECK: vnc %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x69]
++
++ vnc %v0, %v0, %v0
++ vnc %v0, %v0, %v31
++ vnc %v0, %v31, %v0
++ vnc %v31, %v0, %v0
++ vnc %v18, %v3, %v20
++
++#CHECK: vno %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6b]
++#CHECK: vno %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6b]
++#CHECK: vno %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6b]
++#CHECK: vno %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6b]
++#CHECK: vno %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6b]
++
++ vno %v0, %v0, %v0
++ vno %v0, %v0, %v31
++ vno %v0, %v31, %v0
++ vno %v31, %v0, %v0
++ vno %v18, %v3, %v20
++
++#CHECK: vo %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6a]
++#CHECK: vo %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6a]
++#CHECK: vo %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6a]
++#CHECK: vo %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6a]
++#CHECK: vo %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6a]
++
++ vo %v0, %v0, %v0
++ vo %v0, %v0, %v31
++ vo %v0, %v31, %v0
++ vo %v31, %v0, %v0
++ vo %v18, %v3, %v20
++
++#CHECK: vone %v0 # encoding: [0xe7,0x00,0xff,0xff,0x00,0x44]
++#CHECK: vone %v15 # encoding: [0xe7,0xf0,0xff,0xff,0x00,0x44]
++#CHECK: vone %v22 # encoding: [0xe7,0x60,0xff,0xff,0x08,0x44]
++#CHECK: vone %v31 # encoding: [0xe7,0xf0,0xff,0xff,0x08,0x44]
++
++ vone %v0
++ vone %v15
++ vone %v22
++ vone %v31
++
++#CHECK: vpdi %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x84]
++#CHECK: vpdi %v0, %v0, %v0, 5 # encoding: [0xe7,0x00,0x00,0x00,0x50,0x84]
++#CHECK: vpdi %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x84]
++#CHECK: vpdi %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x84]
++#CHECK: vpdi %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x84]
++#CHECK: vpdi %v13, %v17, %v21, 4 # encoding: [0xe7,0xd1,0x50,0x00,0x46,0x84]
++
++ vpdi %v0, %v0, %v0, 0
++ vpdi %v0, %v0, %v0, 5
++ vpdi %v0, %v0, %v31, 0
++ vpdi %v0, %v31, %v0, 0
++ vpdi %v31, %v0, %v0, 0
++ vpdi %v13, %v17, %v21, 4
++
++#CHECK: vperm %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x8c]
++#CHECK: vperm %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0x8c]
++#CHECK: vperm %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x8c]
++#CHECK: vperm %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x8c]
++#CHECK: vperm %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x8c]
++#CHECK: vperm %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0x8c]
++
++ vperm %v0, %v0, %v0, %v0
++ vperm %v0, %v0, %v0, %v31
++ vperm %v0, %v0, %v31, %v0
++ vperm %v0, %v31, %v0, %v0
++ vperm %v31, %v0, %v0, %v0
++ vperm %v13, %v17, %v21, %v25
++
++#CHECK: vpkf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x94]
++#CHECK: vpkf %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x94]
++#CHECK: vpkf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x94]
++#CHECK: vpkf %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x94]
++#CHECK: vpkf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x94]
++#CHECK: vpkf %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x94]
++#CHECK: vpkf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x94]
++#CHECK: vpkf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x94]
++
++ vpkf %v0, %v0, %v0
++ vpkf %v0, %v0, %v15
++ vpkf %v0, %v0, %v31
++ vpkf %v0, %v15, %v0
++ vpkf %v0, %v31, %v0
++ vpkf %v15, %v0, %v0
++ vpkf %v31, %v0, %v0
++ vpkf %v18, %v3, %v20
++
++#CHECK: vpkg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x94]
++#CHECK: vpkg %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x94]
++#CHECK: vpkg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x94]
++#CHECK: vpkg %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x94]
++#CHECK: vpkg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x94]
++#CHECK: vpkg %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x94]
++#CHECK: vpkg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x94]
++#CHECK: vpkg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x94]
++
++ vpkg %v0, %v0, %v0
++ vpkg %v0, %v0, %v15
++ vpkg %v0, %v0, %v31
++ vpkg %v0, %v15, %v0
++ vpkg %v0, %v31, %v0
++ vpkg %v15, %v0, %v0
++ vpkg %v31, %v0, %v0
++ vpkg %v18, %v3, %v20
++
++#CHECK: vpkh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x94]
++#CHECK: vpkh %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x94]
++#CHECK: vpkh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x94]
++#CHECK: vpkh %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x94]
++#CHECK: vpkh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x94]
++#CHECK: vpkh %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x94]
++#CHECK: vpkh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x94]
++#CHECK: vpkh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x94]
++
++ vpkh %v0, %v0, %v0
++ vpkh %v0, %v0, %v15
++ vpkh %v0, %v0, %v31
++ vpkh %v0, %v15, %v0
++ vpkh %v0, %v31, %v0
++ vpkh %v15, %v0, %v0
++ vpkh %v31, %v0, %v0
++ vpkh %v18, %v3, %v20
++
++#CHECK: vpklsf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x95]
++#CHECK: vpklsf %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x95]
++#CHECK: vpklsf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x95]
++#CHECK: vpklsf %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x95]
++#CHECK: vpklsf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x95]
++#CHECK: vpklsf %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x95]
++#CHECK: vpklsf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x95]
++#CHECK: vpklsf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x95]
++#CHECK: vpklsfs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x24,0x95]
++
++ vpklsf %v0, %v0, %v0
++ vpklsf %v0, %v0, %v15
++ vpklsf %v0, %v0, %v31
++ vpklsf %v0, %v15, %v0
++ vpklsf %v0, %v31, %v0
++ vpklsf %v15, %v0, %v0
++ vpklsf %v31, %v0, %v0
++ vpklsf %v18, %v3, %v20
++ vpklsfs %v5, %v22, %v7
++
++#CHECK: vpklsg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x95]
++#CHECK: vpklsg %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x95]
++#CHECK: vpklsg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x95]
++#CHECK: vpklsg %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x95]
++#CHECK: vpklsg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x95]
++#CHECK: vpklsg %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x95]
++#CHECK: vpklsg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x95]
++#CHECK: vpklsg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x95]
++#CHECK: vpklsgs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x34,0x95]
++
++ vpklsg %v0, %v0, %v0
++ vpklsg %v0, %v0, %v15
++ vpklsg %v0, %v0, %v31
++ vpklsg %v0, %v15, %v0
++ vpklsg %v0, %v31, %v0
++ vpklsg %v15, %v0, %v0
++ vpklsg %v31, %v0, %v0
++ vpklsg %v18, %v3, %v20
++ vpklsgs %v5, %v22, %v7
++
++#CHECK: vpklsh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x95]
++#CHECK: vpklsh %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x95]
++#CHECK: vpklsh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x95]
++#CHECK: vpklsh %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x95]
++#CHECK: vpklsh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x95]
++#CHECK: vpklsh %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x95]
++#CHECK: vpklsh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x95]
++#CHECK: vpklsh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x95]
++#CHECK: vpklshs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x14,0x95]
++
++ vpklsh %v0, %v0, %v0
++ vpklsh %v0, %v0, %v15
++ vpklsh %v0, %v0, %v31
++ vpklsh %v0, %v15, %v0
++ vpklsh %v0, %v31, %v0
++ vpklsh %v15, %v0, %v0
++ vpklsh %v31, %v0, %v0
++ vpklsh %v18, %v3, %v20
++ vpklshs %v5, %v22, %v7
++
++#CHECK: vpksf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x97]
++#CHECK: vpksf %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x20,0x97]
++#CHECK: vpksf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x97]
++#CHECK: vpksf %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x97]
++#CHECK: vpksf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x97]
++#CHECK: vpksf %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x97]
++#CHECK: vpksf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x97]
++#CHECK: vpksf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x97]
++#CHECK: vpksfs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x24,0x97]
++
++ vpksf %v0, %v0, %v0
++ vpksf %v0, %v0, %v15
++ vpksf %v0, %v0, %v31
++ vpksf %v0, %v15, %v0
++ vpksf %v0, %v31, %v0
++ vpksf %v15, %v0, %v0
++ vpksf %v31, %v0, %v0
++ vpksf %v18, %v3, %v20
++ vpksfs %v5, %v22, %v7
++
++#CHECK: vpksg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x97]
++#CHECK: vpksg %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x30,0x97]
++#CHECK: vpksg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x97]
++#CHECK: vpksg %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x97]
++#CHECK: vpksg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x97]
++#CHECK: vpksg %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x97]
++#CHECK: vpksg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x97]
++#CHECK: vpksg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x97]
++#CHECK: vpksgs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x34,0x97]
++
++ vpksg %v0, %v0, %v0
++ vpksg %v0, %v0, %v15
++ vpksg %v0, %v0, %v31
++ vpksg %v0, %v15, %v0
++ vpksg %v0, %v31, %v0
++ vpksg %v15, %v0, %v0
++ vpksg %v31, %v0, %v0
++ vpksg %v18, %v3, %v20
++ vpksgs %v5, %v22, %v7
++
++#CHECK: vpksh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x97]
++#CHECK: vpksh %v0, %v0, %v15 # encoding: [0xe7,0x00,0xf0,0x00,0x10,0x97]
++#CHECK: vpksh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x97]
++#CHECK: vpksh %v0, %v15, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x97]
++#CHECK: vpksh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x97]
++#CHECK: vpksh %v15, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x97]
++#CHECK: vpksh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x97]
++#CHECK: vpksh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x97]
++#CHECK: vpkshs %v5, %v22, %v7 # encoding: [0xe7,0x56,0x70,0x10,0x14,0x97]
++
++ vpksh %v0, %v0, %v0
++ vpksh %v0, %v0, %v15
++ vpksh %v0, %v0, %v31
++ vpksh %v0, %v15, %v0
++ vpksh %v0, %v31, %v0
++ vpksh %v15, %v0, %v0
++ vpksh %v31, %v0, %v0
++ vpksh %v18, %v3, %v20
++ vpkshs %v5, %v22, %v7
++
++#CHECK: vpopct %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x50]
++#CHECK: vpopct %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x50]
++#CHECK: vpopct %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x50]
++#CHECK: vpopct %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x50]
++#CHECK: vpopct %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x50]
++#CHECK: vpopct %v14, %v17, 0 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0x50]
++
++ vpopct %v0, %v0, 0
++ vpopct %v0, %v15, 0
++ vpopct %v0, %v31, 0
++ vpopct %v15, %v0, 0
++ vpopct %v31, %v0, 0
++ vpopct %v14, %v17, 0
++
++#CHECK: vrepb %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x4d]
++#CHECK: vrepb %v0, %v0, 65535 # encoding: [0xe7,0x00,0xff,0xff,0x00,0x4d]
++#CHECK: vrepb %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x4d]
++#CHECK: vrepb %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x4d]
++#CHECK: vrepb %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x4d]
++#CHECK: vrepb %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x4d]
++#CHECK: vrepb %v4, %v21, 26505 # encoding: [0xe7,0x45,0x67,0x89,0x04,0x4d]
++
++ vrepb %v0, %v0, 0
++ vrepb %v0, %v0, 65535
++ vrepb %v0, %v15, 0
++ vrepb %v0, %v31, 0
++ vrepb %v15, %v0, 0
++ vrepb %v31, %v0, 0
++ vrepb %v4, %v21, 0x6789
++
++#CHECK: vrepf %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x4d]
++#CHECK: vrepf %v0, %v0, 65535 # encoding: [0xe7,0x00,0xff,0xff,0x20,0x4d]
++#CHECK: vrepf %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x4d]
++#CHECK: vrepf %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x4d]
++#CHECK: vrepf %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x4d]
++#CHECK: vrepf %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x4d]
++#CHECK: vrepf %v4, %v21, 26505 # encoding: [0xe7,0x45,0x67,0x89,0x24,0x4d]
++
++ vrepf %v0, %v0, 0
++ vrepf %v0, %v0, 65535
++ vrepf %v0, %v15, 0
++ vrepf %v0, %v31, 0
++ vrepf %v15, %v0, 0
++ vrepf %v31, %v0, 0
++ vrepf %v4, %v21, 0x6789
++
++#CHECK: vrepg %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x4d]
++#CHECK: vrepg %v0, %v0, 65535 # encoding: [0xe7,0x00,0xff,0xff,0x30,0x4d]
++#CHECK: vrepg %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0x4d]
++#CHECK: vrepg %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x4d]
++#CHECK: vrepg %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x4d]
++#CHECK: vrepg %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x4d]
++#CHECK: vrepg %v4, %v21, 26505 # encoding: [0xe7,0x45,0x67,0x89,0x34,0x4d]
++
++ vrepg %v0, %v0, 0
++ vrepg %v0, %v0, 65535
++ vrepg %v0, %v15, 0
++ vrepg %v0, %v31, 0
++ vrepg %v15, %v0, 0
++ vrepg %v31, %v0, 0
++ vrepg %v4, %v21, 0x6789
++
++#CHECK: vreph %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x4d]
++#CHECK: vreph %v0, %v0, 65535 # encoding: [0xe7,0x00,0xff,0xff,0x10,0x4d]
++#CHECK: vreph %v0, %v15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0x4d]
++#CHECK: vreph %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x4d]
++#CHECK: vreph %v15, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x4d]
++#CHECK: vreph %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x4d]
++#CHECK: vreph %v4, %v21, 26505 # encoding: [0xe7,0x45,0x67,0x89,0x14,0x4d]
++
++ vreph %v0, %v0, 0
++ vreph %v0, %v0, 65535
++ vreph %v0, %v15, 0
++ vreph %v0, %v31, 0
++ vreph %v15, %v0, 0
++ vreph %v31, %v0, 0
++ vreph %v4, %v21, 0x6789
++
++#CHECK: vrepib %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x45]
++#CHECK: vrepib %v0, -32768 # encoding: [0xe7,0x00,0x80,0x00,0x00,0x45]
++#CHECK: vrepib %v0, 32767 # encoding: [0xe7,0x00,0x7f,0xff,0x00,0x45]
++#CHECK: vrepib %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x45]
++#CHECK: vrepib %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x45]
++#CHECK: vrepib %v18, 13398 # encoding: [0xe7,0x20,0x34,0x56,0x08,0x45]
++
++ vrepib %v0, 0
++ vrepib %v0, -32768
++ vrepib %v0, 32767
++ vrepib %v15, 0
++ vrepib %v31, 0
++ vrepib %v18, 0x3456
++
++#CHECK: vrepif %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x45]
++#CHECK: vrepif %v0, -32768 # encoding: [0xe7,0x00,0x80,0x00,0x20,0x45]
++#CHECK: vrepif %v0, 32767 # encoding: [0xe7,0x00,0x7f,0xff,0x20,0x45]
++#CHECK: vrepif %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x45]
++#CHECK: vrepif %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x45]
++#CHECK: vrepif %v18, 13398 # encoding: [0xe7,0x20,0x34,0x56,0x28,0x45]
++
++ vrepif %v0, 0
++ vrepif %v0, -32768
++ vrepif %v0, 32767
++ vrepif %v15, 0
++ vrepif %v31, 0
++ vrepif %v18, 0x3456
++
++#CHECK: vrepig %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x45]
++#CHECK: vrepig %v0, -32768 # encoding: [0xe7,0x00,0x80,0x00,0x30,0x45]
++#CHECK: vrepig %v0, 32767 # encoding: [0xe7,0x00,0x7f,0xff,0x30,0x45]
++#CHECK: vrepig %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0x45]
++#CHECK: vrepig %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x45]
++#CHECK: vrepig %v18, 13398 # encoding: [0xe7,0x20,0x34,0x56,0x38,0x45]
++
++ vrepig %v0, 0
++ vrepig %v0, -32768
++ vrepig %v0, 32767
++ vrepig %v15, 0
++ vrepig %v31, 0
++ vrepig %v18, 0x3456
++
++#CHECK: vrepih %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x45]
++#CHECK: vrepih %v0, -32768 # encoding: [0xe7,0x00,0x80,0x00,0x10,0x45]
++#CHECK: vrepih %v0, 32767 # encoding: [0xe7,0x00,0x7f,0xff,0x10,0x45]
++#CHECK: vrepih %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x45]
++#CHECK: vrepih %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x45]
++#CHECK: vrepih %v18, 13398 # encoding: [0xe7,0x20,0x34,0x56,0x18,0x45]
++
++ vrepih %v0, 0
++ vrepih %v0, -32768
++ vrepih %v0, 32767
++ vrepih %v15, 0
++ vrepih %v31, 0
++ vrepih %v18, 0x3456
++
++#CHECK: vsb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xf7]
++#CHECK: vsb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xf7]
++#CHECK: vsb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xf7]
++#CHECK: vsb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xf7]
++#CHECK: vsb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xf7]
++
++ vsb %v0, %v0, %v0
++ vsb %v0, %v0, %v31
++ vsb %v0, %v31, %v0
++ vsb %v31, %v0, %v0
++ vsb %v18, %v3, %v20
++
++#CHECK: vsbcbiq %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x00,0x00,0xbd]
++#CHECK: vsbcbiq %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x00,0xf1,0xbd]
++#CHECK: vsbcbiq %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x00,0x02,0xbd]
++#CHECK: vsbcbiq %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x00,0x04,0xbd]
++#CHECK: vsbcbiq %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x00,0x08,0xbd]
++#CHECK: vsbcbiq %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x00,0x97,0xbd]
++
++ vsbcbiq %v0, %v0, %v0, %v0
++ vsbcbiq %v0, %v0, %v0, %v31
++ vsbcbiq %v0, %v0, %v31, %v0
++ vsbcbiq %v0, %v31, %v0, %v0
++ vsbcbiq %v31, %v0, %v0, %v0
++ vsbcbiq %v13, %v17, %v21, %v25
++
++#CHECK: vsbiq %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x04,0x00,0x00,0xbf]
++#CHECK: vsbiq %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x04,0x00,0xf1,0xbf]
++#CHECK: vsbiq %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf4,0x00,0x02,0xbf] ++#CHECK: vsbiq %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x04,0x00,0x04,0xbf] ++#CHECK: vsbiq %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x04,0x00,0x08,0xbf] ++#CHECK: vsbiq %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x54,0x00,0x97,0xbf] ++ ++ vsbiq %v0, %v0, %v0, %v0 ++ vsbiq %v0, %v0, %v0, %v31 ++ vsbiq %v0, %v0, %v31, %v0 ++ vsbiq %v0, %v31, %v0, %v0 ++ vsbiq %v31, %v0, %v0, %v0 ++ vsbiq %v13, %v17, %v21, %v25 ++ ++#CHECK: vscbib %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xf5] ++#CHECK: vscbib %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0xf5] ++#CHECK: vscbib %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xf5] ++#CHECK: vscbib %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xf5] ++#CHECK: vscbib %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0xf5] ++ ++ vscbib %v0, %v0, %v0 ++ vscbib %v0, %v0, %v31 ++ vscbib %v0, %v31, %v0 ++ vscbib %v31, %v0, %v0 ++ vscbib %v18, %v3, %v20 ++ ++#CHECK: vscbif %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xf5] ++#CHECK: vscbif %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xf5] ++#CHECK: vscbif %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xf5] ++#CHECK: vscbif %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xf5] ++#CHECK: vscbif %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xf5] ++ ++ vscbif %v0, %v0, %v0 ++ vscbif %v0, %v0, %v31 ++ vscbif %v0, %v31, %v0 ++ vscbif %v31, %v0, %v0 ++ vscbif %v18, %v3, %v20 ++ ++#CHECK: vscbig %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xf5] ++#CHECK: vscbig %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xf5] ++#CHECK: vscbig %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xf5] ++#CHECK: vscbig %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xf5] ++#CHECK: vscbig %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xf5] ++ ++ vscbig %v0, %v0, %v0 ++ vscbig %v0, %v0, %v31 ++ vscbig %v0, %v31, %v0 ++ vscbig %v31, %v0, %v0 ++ vscbig %v18, %v3, %v20 ++ ++#CHECK: vscbih %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xf5] ++#CHECK: vscbih %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xf5] ++#CHECK: vscbih %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xf5] ++#CHECK: vscbih %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xf5] ++#CHECK: vscbih %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xf5] ++ ++ vscbih %v0, %v0, %v0 ++ vscbih %v0, %v0, %v31 ++ vscbih %v0, %v31, %v0 ++ vscbih %v31, %v0, %v0 ++ vscbih %v18, %v3, %v20 ++ ++#CHECK: vscbiq %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x40,0xf5] ++#CHECK: vscbiq %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x42,0xf5] ++#CHECK: vscbiq %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x44,0xf5] ++#CHECK: vscbiq %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x48,0xf5] ++#CHECK: vscbiq %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x4a,0xf5] ++ ++ vscbiq %v0, %v0, %v0 ++ vscbiq %v0, %v0, %v31 ++ vscbiq %v0, %v31, %v0 ++ vscbiq %v31, %v0, %v0 ++ vscbiq %v18, %v3, %v20 ++ ++#CHECK: vscef %v0, 0(%v0), 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x1b] ++#CHECK: vscef %v0, 0(%v0,%r1), 0 # encoding: [0xe7,0x00,0x10,0x00,0x00,0x1b] ++#CHECK: vscef %v0, 0(%v0,%r1), 3 # encoding: [0xe7,0x00,0x10,0x00,0x30,0x1b] ++#CHECK: vscef %v0, 0(%v0,%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x1b] ++#CHECK: vscef %v0, 0(%v15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x1b] ++#CHECK: vscef %v0, 0(%v31,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x04,0x1b] ++#CHECK: vscef %v0, 
4095(%v0,%r1), 0 # encoding: [0xe7,0x00,0x1f,0xff,0x00,0x1b] ++#CHECK: vscef %v15, 0(%v0,%r1), 0 # encoding: [0xe7,0xf0,0x10,0x00,0x00,0x1b] ++#CHECK: vscef %v31, 0(%v0,%r1), 0 # encoding: [0xe7,0xf0,0x10,0x00,0x08,0x1b] ++#CHECK: vscef %v10, 1000(%v19,%r7), 1 # encoding: [0xe7,0xa3,0x73,0xe8,0x14,0x1b] ++ ++ vscef %v0, 0(%v0), 0 ++ vscef %v0, 0(%v0,%r1), 0 ++ vscef %v0, 0(%v0,%r1), 3 ++ vscef %v0, 0(%v0,%r15), 0 ++ vscef %v0, 0(%v15,%r1), 0 ++ vscef %v0, 0(%v31,%r1), 0 ++ vscef %v0, 4095(%v0, %r1), 0 ++ vscef %v15, 0(%v0,%r1), 0 ++ vscef %v31, 0(%v0,%r1), 0 ++ vscef %v10, 1000(%v19,%r7), 1 ++ ++#CHECK: vsceg %v0, 0(%v0), 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x1a] ++#CHECK: vsceg %v0, 0(%v0,%r1), 0 # encoding: [0xe7,0x00,0x10,0x00,0x00,0x1a] ++#CHECK: vsceg %v0, 0(%v0,%r1), 1 # encoding: [0xe7,0x00,0x10,0x00,0x10,0x1a] ++#CHECK: vsceg %v0, 0(%v0,%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x1a] ++#CHECK: vsceg %v0, 0(%v15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x1a] ++#CHECK: vsceg %v0, 0(%v31,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x04,0x1a] ++#CHECK: vsceg %v0, 4095(%v0,%r1), 0 # encoding: [0xe7,0x00,0x1f,0xff,0x00,0x1a] ++#CHECK: vsceg %v15, 0(%v0,%r1), 0 # encoding: [0xe7,0xf0,0x10,0x00,0x00,0x1a] ++#CHECK: vsceg %v31, 0(%v0,%r1), 0 # encoding: [0xe7,0xf0,0x10,0x00,0x08,0x1a] ++#CHECK: vsceg %v10, 1000(%v19,%r7), 1 # encoding: [0xe7,0xa3,0x73,0xe8,0x14,0x1a] ++ ++ vsceg %v0, 0(%v0), 0 ++ vsceg %v0, 0(%v0,%r1), 0 ++ vsceg %v0, 0(%v0,%r1), 1 ++ vsceg %v0, 0(%v0,%r15), 0 ++ vsceg %v0, 0(%v15,%r1), 0 ++ vsceg %v0, 0(%v31,%r1), 0 ++ vsceg %v0, 4095(%v0,%r1), 0 ++ vsceg %v15, 0(%v0,%r1), 0 ++ vsceg %v31, 0(%v0,%r1), 0 ++ vsceg %v10, 1000(%v19,%r7), 1 ++ ++#CHECK: vsel %v0, %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x8d] ++#CHECK: vsel %v0, %v0, %v0, %v31 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0x8d] ++#CHECK: vsel %v0, %v0, %v31, %v0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x8d] ++#CHECK: vsel %v0, %v31, %v0, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x8d] ++#CHECK: vsel %v31, %v0, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x8d] ++#CHECK: vsel %v13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x50,0x00,0x97,0x8d] ++ ++ vsel %v0, %v0, %v0, %v0 ++ vsel %v0, %v0, %v0, %v31 ++ vsel %v0, %v0, %v31, %v0 ++ vsel %v0, %v31, %v0, %v0 ++ vsel %v31, %v0, %v0, %v0 ++ vsel %v13, %v17, %v21, %v25 ++ ++#CHECK: vsegb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x5f] ++#CHECK: vsegb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x5f] ++#CHECK: vsegb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x5f] ++#CHECK: vsegb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x5f] ++#CHECK: vsegb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x5f] ++#CHECK: vsegb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0x5f] ++ ++ vsegb %v0, %v0 ++ vsegb %v0, %v15 ++ vsegb %v0, %v31 ++ vsegb %v15, %v0 ++ vsegb %v31, %v0 ++ vsegb %v14, %v17 ++ ++#CHECK: vsegf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x5f] ++#CHECK: vsegf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0x5f] ++#CHECK: vsegf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x5f] ++#CHECK: vsegf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0x5f] ++#CHECK: vsegf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x5f] ++#CHECK: vsegf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0x5f] ++ ++ vsegf %v0, %v0 ++ vsegf %v0, %v15 ++ vsegf %v0, %v31 ++ vsegf %v15, %v0 ++ vsegf %v31, %v0 ++ vsegf %v14, %v17 ++ ++#CHECK: vsegh %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x5f] ++#CHECK: vsegh %v0, %v15 # encoding: 
[0xe7,0x0f,0x00,0x00,0x10,0x5f] ++#CHECK: vsegh %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x5f] ++#CHECK: vsegh %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0x5f] ++#CHECK: vsegh %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x5f] ++#CHECK: vsegh %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0x5f] ++ ++ vsegh %v0, %v0 ++ vsegh %v0, %v15 ++ vsegh %v0, %v31 ++ vsegh %v15, %v0 ++ vsegh %v31, %v0 ++ vsegh %v14, %v17 ++ ++#CHECK: vsf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xf7] ++#CHECK: vsf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0xf7] ++#CHECK: vsf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xf7] ++#CHECK: vsf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xf7] ++#CHECK: vsf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0xf7] ++ ++ vsf %v0, %v0, %v0 ++ vsf %v0, %v0, %v31 ++ vsf %v0, %v31, %v0 ++ vsf %v31, %v0, %v0 ++ vsf %v18, %v3, %v20 ++ ++#CHECK: vsg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xf7] ++#CHECK: vsg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0xf7] ++#CHECK: vsg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xf7] ++#CHECK: vsg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xf7] ++#CHECK: vsg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0xf7] ++ ++ vsg %v0, %v0, %v0 ++ vsg %v0, %v0, %v31 ++ vsg %v0, %v31, %v0 ++ vsg %v31, %v0, %v0 ++ vsg %v18, %v3, %v20 ++ ++#CHECK: vsh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xf7] ++#CHECK: vsh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0xf7] ++#CHECK: vsh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xf7] ++#CHECK: vsh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xf7] ++#CHECK: vsh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0xf7] ++ ++ vsh %v0, %v0, %v0 ++ vsh %v0, %v0, %v31 ++ vsh %v0, %v31, %v0 ++ vsh %v31, %v0, %v0 ++ vsh %v18, %v3, %v20 ++ ++#CHECK: vsl %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x74] ++#CHECK: vsl %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x74] ++#CHECK: vsl %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x74] ++#CHECK: vsl %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x74] ++#CHECK: vsl %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x74] ++ ++ vsl %v0, %v0, %v0 ++ vsl %v0, %v0, %v31 ++ vsl %v0, %v31, %v0 ++ vsl %v31, %v0, %v0 ++ vsl %v18, %v3, %v20 ++ ++#CHECK: vslb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x75] ++#CHECK: vslb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x75] ++#CHECK: vslb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x75] ++#CHECK: vslb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x75] ++#CHECK: vslb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x75] ++ ++ vslb %v0, %v0, %v0 ++ vslb %v0, %v0, %v31 ++ vslb %v0, %v31, %v0 ++ vslb %v31, %v0, %v0 ++ vslb %v18, %v3, %v20 ++ ++#CHECK: vsldb %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x77] ++#CHECK: vsldb %v0, %v0, %v0, 255 # encoding: [0xe7,0x00,0x00,0xff,0x00,0x77] ++#CHECK: vsldb %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x77] ++#CHECK: vsldb %v0, %v31, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x77] ++#CHECK: vsldb %v31, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x77] ++#CHECK: vsldb %v13, %v17, %v21, 121 # encoding: [0xe7,0xd1,0x50,0x79,0x06,0x77] ++ ++ vsldb %v0, %v0, %v0, 0 ++ vsldb %v0, %v0, %v0, 255 ++ vsldb %v0, %v0, %v31, 0 ++ vsldb %v0, %v31, %v0, 0 ++ vsldb %v31, %v0, %v0, 0 ++ vsldb %v13, %v17, %v21, 0x79 ++ ++#CHECK: vsq %v0, %v0, %v0 # encoding: 
[0xe7,0x00,0x00,0x00,0x40,0xf7] ++#CHECK: vsq %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x42,0xf7] ++#CHECK: vsq %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x44,0xf7] ++#CHECK: vsq %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x48,0xf7] ++#CHECK: vsq %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x4a,0xf7] ++ ++ vsq %v0, %v0, %v0 ++ vsq %v0, %v0, %v31 ++ vsq %v0, %v31, %v0 ++ vsq %v31, %v0, %v0 ++ vsq %v18, %v3, %v20 ++ ++#CHECK: vsra %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x7e] ++#CHECK: vsra %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x7e] ++#CHECK: vsra %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x7e] ++#CHECK: vsra %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x7e] ++#CHECK: vsra %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x7e] ++ ++ vsra %v0, %v0, %v0 ++ vsra %v0, %v0, %v31 ++ vsra %v0, %v31, %v0 ++ vsra %v31, %v0, %v0 ++ vsra %v18, %v3, %v20 ++ ++#CHECK: vsrab %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x7f] ++#CHECK: vsrab %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x7f] ++#CHECK: vsrab %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x7f] ++#CHECK: vsrab %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x7f] ++#CHECK: vsrab %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x7f] ++ ++ vsrab %v0, %v0, %v0 ++ vsrab %v0, %v0, %v31 ++ vsrab %v0, %v31, %v0 ++ vsrab %v31, %v0, %v0 ++ vsrab %v18, %v3, %v20 ++ ++#CHECK: vsrl %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x7c] ++#CHECK: vsrl %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x7c] ++#CHECK: vsrl %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x7c] ++#CHECK: vsrl %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x7c] ++#CHECK: vsrl %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x7c] ++ ++ vsrl %v0, %v0, %v0 ++ vsrl %v0, %v0, %v31 ++ vsrl %v0, %v31, %v0 ++ vsrl %v31, %v0, %v0 ++ vsrl %v18, %v3, %v20 ++ ++#CHECK: vsrlb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x7d] ++#CHECK: vsrlb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x7d] ++#CHECK: vsrlb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x7d] ++#CHECK: vsrlb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x7d] ++#CHECK: vsrlb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x7d] ++ ++ vsrlb %v0, %v0, %v0 ++ vsrlb %v0, %v0, %v31 ++ vsrlb %v0, %v31, %v0 ++ vsrlb %v31, %v0, %v0 ++ vsrlb %v18, %v3, %v20 ++ ++#CHECK: vst %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x0e] ++#CHECK: vst %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x0e] ++#CHECK: vst %v0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x0e] ++#CHECK: vst %v0, 0(%r15,%r1) # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x0e] ++#CHECK: vst %v15, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x0e] ++#CHECK: vst %v31, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x0e] ++#CHECK: vst %v18, 1383(%r3,%r4) # encoding: [0xe7,0x23,0x45,0x67,0x08,0x0e] ++ ++ vst %v0, 0 ++ vst %v0, 4095 ++ vst %v0, 0(%r15) ++ vst %v0, 0(%r15,%r1) ++ vst %v15, 0 ++ vst %v31, 0 ++ vst %v18, 0x567(%r3,%r4) ++ ++#CHECK: vsteb %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x08] ++#CHECK: vsteb %v0, 0, 15 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x08] ++#CHECK: vsteb %v0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x08] ++#CHECK: vsteb %v0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x08] ++#CHECK: vsteb %v0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x08] ++#CHECK: vsteb %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x08] ++#CHECK: vsteb %v31, 0, 0 # encoding: 
[0xe7,0xf0,0x00,0x00,0x08,0x08] ++#CHECK: vsteb %v18, 1383(%r3,%r4), 8 # encoding: [0xe7,0x23,0x45,0x67,0x88,0x08] ++ ++ vsteb %v0, 0, 0 ++ vsteb %v0, 0, 15 ++ vsteb %v0, 4095, 0 ++ vsteb %v0, 0(%r15), 0 ++ vsteb %v0, 0(%r15,%r1), 0 ++ vsteb %v15, 0, 0 ++ vsteb %v31, 0, 0 ++ vsteb %v18, 1383(%r3,%r4), 8 ++ ++#CHECK: vstef %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x0b] ++#CHECK: vstef %v0, 0, 3 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x0b] ++#CHECK: vstef %v0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x0b] ++#CHECK: vstef %v0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x0b] ++#CHECK: vstef %v0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x0b] ++#CHECK: vstef %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x0b] ++#CHECK: vstef %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x0b] ++#CHECK: vstef %v18, 1383(%r3,%r4), 2 # encoding: [0xe7,0x23,0x45,0x67,0x28,0x0b] ++ ++ vstef %v0, 0, 0 ++ vstef %v0, 0, 3 ++ vstef %v0, 4095, 0 ++ vstef %v0, 0(%r15), 0 ++ vstef %v0, 0(%r15,%r1), 0 ++ vstef %v15, 0, 0 ++ vstef %v31, 0, 0 ++ vstef %v18, 1383(%r3,%r4), 2 ++ ++#CHECK: vsteg %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x0a] ++#CHECK: vsteg %v0, 0, 1 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x0a] ++#CHECK: vsteg %v0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x0a] ++#CHECK: vsteg %v0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x0a] ++#CHECK: vsteg %v0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x0a] ++#CHECK: vsteg %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x0a] ++#CHECK: vsteg %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x0a] ++#CHECK: vsteg %v18, 1383(%r3,%r4), 1 # encoding: [0xe7,0x23,0x45,0x67,0x18,0x0a] ++ ++ vsteg %v0, 0, 0 ++ vsteg %v0, 0, 1 ++ vsteg %v0, 4095, 0 ++ vsteg %v0, 0(%r15), 0 ++ vsteg %v0, 0(%r15,%r1), 0 ++ vsteg %v15, 0, 0 ++ vsteg %v31, 0, 0 ++ vsteg %v18, 1383(%r3,%r4), 1 ++ ++#CHECK: vsteh %v0, 0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x09] ++#CHECK: vsteh %v0, 0, 7 # encoding: [0xe7,0x00,0x00,0x00,0x70,0x09] ++#CHECK: vsteh %v0, 4095, 0 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x09] ++#CHECK: vsteh %v0, 0(%r15), 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x09] ++#CHECK: vsteh %v0, 0(%r15,%r1), 0 # encoding: [0xe7,0x0f,0x10,0x00,0x00,0x09] ++#CHECK: vsteh %v15, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x09] ++#CHECK: vsteh %v31, 0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x09] ++#CHECK: vsteh %v18, 1383(%r3,%r4), 4 # encoding: [0xe7,0x23,0x45,0x67,0x48,0x09] ++ ++ vsteh %v0, 0, 0 ++ vsteh %v0, 0, 7 ++ vsteh %v0, 4095, 0 ++ vsteh %v0, 0(%r15), 0 ++ vsteh %v0, 0(%r15,%r1), 0 ++ vsteh %v15, 0, 0 ++ vsteh %v31, 0, 0 ++ vsteh %v18, 1383(%r3,%r4), 4 ++ ++#CHECK: vstl %v0, %r0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x3f] ++#CHECK: vstl %v0, %r0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x3f] ++#CHECK: vstl %v0, %r0, 0(%r15) # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x3f] ++#CHECK: vstl %v0, %r15, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x3f] ++#CHECK: vstl %v15, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x3f] ++#CHECK: vstl %v31, %r0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x3f] ++#CHECK: vstl %v18, %r3, 1383(%r4) # encoding: [0xe7,0x23,0x45,0x67,0x08,0x3f] ++ ++ vstl %v0, %r0, 0 ++ vstl %v0, %r0, 4095 ++ vstl %v0, %r0, 0(%r15) ++ vstl %v0, %r15, 0 ++ vstl %v15, %r0, 0 ++ vstl %v31, %r0, 0 ++ vstl %v18, %r3, 1383(%r4) ++ ++#CHECK: vstm %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x3e] ++#CHECK: vstm %v0, %v0, 4095 # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x3e] ++#CHECK: vstm %v0, %v0, 0(%r15) # encoding: 
[0xe7,0x00,0xf0,0x00,0x00,0x3e] ++#CHECK: vstm %v0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x3e] ++#CHECK: vstm %v31, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x3e] ++#CHECK: vstm %v14, %v17, 1074(%r5) # encoding: [0xe7,0xe1,0x54,0x32,0x04,0x3e] ++ ++ vstm %v0, %v0, 0 ++ vstm %v0, %v0, 4095 ++ vstm %v0, %v0, 0(%r15) ++ vstm %v0, %v31, 0 ++ vstm %v31, %v0, 0 ++ vstm %v14, %v17, 1074(%r5) ++ ++#CHECK: vstrcb %v0, %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x8a] ++#CHECK: vstrcb %v0, %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x8a] ++#CHECK: vstrcb %v0, %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x00,0xc0,0x00,0x8a] ++#CHECK: vstrcb %v0, %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x8a] ++#CHECK: vstrcb %v0, %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0x00,0x00,0xf1,0x8a] ++#CHECK: vstrcb %v0, %v0, %v15, %v0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x00,0x8a] ++#CHECK: vstrcb %v0, %v0, %v31, %v0, 0 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x8a] ++#CHECK: vstrcb %v0, %v15, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0x8a] ++#CHECK: vstrcb %v0, %v31, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x8a] ++#CHECK: vstrcb %v15, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x8a] ++#CHECK: vstrcb %v31, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x8a] ++#CHECK: vstrcb %v18, %v3, %v20, %v5, 4 # encoding: [0xe7,0x23,0x40,0x40,0x5a,0x8a] ++#CHECK: vstrcb %v18, %v3, %v20, %v5, 15 # encoding: [0xe7,0x23,0x40,0xf0,0x5a,0x8a] ++#CHECK: vstrcbs %v18, %v3, %v20, %v5, 8 # encoding: [0xe7,0x23,0x40,0x90,0x5a,0x8a] ++#CHECK: vstrczb %v18, %v3, %v20, %v5, 4 # encoding: [0xe7,0x23,0x40,0x60,0x5a,0x8a] ++#CHECK: vstrczbs %v18, %v3, %v20, %v5, 8 # encoding: [0xe7,0x23,0x40,0xb0,0x5a,0x8a] ++#CHECK: vstrczbs %v18, %v3, %v20, %v5, 15 # encoding: [0xe7,0x23,0x40,0xf0,0x5a,0x8a] ++ ++ vstrcb %v0, %v0, %v0, %v0 ++ vstrcb %v0, %v0, %v0, %v0, 0 ++ vstrcb %v0, %v0, %v0, %v0, 12 ++ vstrcb %v0, %v0, %v0, %v15 ++ vstrcb %v0, %v0, %v0, %v31 ++ vstrcb %v0, %v0, %v15, %v0 ++ vstrcb %v0, %v0, %v31, %v0 ++ vstrcb %v0, %v15, %v0, %v0 ++ vstrcb %v0, %v31, %v0, %v0 ++ vstrcb %v15, %v0, %v0, %v0 ++ vstrcb %v31, %v0, %v0, %v0 ++ vstrcb %v18, %v3, %v20, %v5, 4 ++ vstrcb %v18, %v3, %v20, %v5, 15 ++ vstrcbs %v18, %v3, %v20, %v5, 8 ++ vstrczb %v18, %v3, %v20, %v5, 4 ++ vstrczbs %v18, %v3, %v20, %v5, 8 ++ vstrczbs %v18, %v3, %v20, %v5, 15 ++ ++#CHECK: vstrcf %v0, %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x8a] ++#CHECK: vstrcf %v0, %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x02,0x00,0x00,0x8a] ++#CHECK: vstrcf %v0, %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x02,0xc0,0x00,0x8a] ++#CHECK: vstrcf %v0, %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0x02,0x00,0xf0,0x8a] ++#CHECK: vstrcf %v0, %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0x02,0x00,0xf1,0x8a] ++#CHECK: vstrcf %v0, %v0, %v15, %v0, 0 # encoding: [0xe7,0x00,0xf2,0x00,0x00,0x8a] ++#CHECK: vstrcf %v0, %v0, %v31, %v0, 0 # encoding: [0xe7,0x00,0xf2,0x00,0x02,0x8a] ++#CHECK: vstrcf %v0, %v15, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x02,0x00,0x00,0x8a] ++#CHECK: vstrcf %v0, %v31, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x02,0x00,0x04,0x8a] ++#CHECK: vstrcf %v15, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x02,0x00,0x00,0x8a] ++#CHECK: vstrcf %v31, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x02,0x00,0x08,0x8a] ++#CHECK: vstrcf %v18, %v3, %v20, %v5, 4 # encoding: [0xe7,0x23,0x42,0x40,0x5a,0x8a] ++#CHECK: vstrcf %v18, %v3, %v20, %v5, 15 # encoding: [0xe7,0x23,0x42,0xf0,0x5a,0x8a] ++#CHECK: vstrcfs %v18, %v3, %v20, %v5, 8 # encoding: 
[0xe7,0x23,0x42,0x90,0x5a,0x8a] ++#CHECK: vstrczf %v18, %v3, %v20, %v5, 4 # encoding: [0xe7,0x23,0x42,0x60,0x5a,0x8a] ++#CHECK: vstrczfs %v18, %v3, %v20, %v5, 8 # encoding: [0xe7,0x23,0x42,0xb0,0x5a,0x8a] ++#CHECK: vstrczfs %v18, %v3, %v20, %v5, 15 # encoding: [0xe7,0x23,0x42,0xf0,0x5a,0x8a] ++ ++ vstrcf %v0, %v0, %v0, %v0 ++ vstrcf %v0, %v0, %v0, %v0, 0 ++ vstrcf %v0, %v0, %v0, %v0, 12 ++ vstrcf %v0, %v0, %v0, %v15 ++ vstrcf %v0, %v0, %v0, %v31 ++ vstrcf %v0, %v0, %v15, %v0 ++ vstrcf %v0, %v0, %v31, %v0 ++ vstrcf %v0, %v15, %v0, %v0 ++ vstrcf %v0, %v31, %v0, %v0 ++ vstrcf %v15, %v0, %v0, %v0 ++ vstrcf %v31, %v0, %v0, %v0 ++ vstrcf %v18, %v3, %v20, %v5, 4 ++ vstrcf %v18, %v3, %v20, %v5, 15 ++ vstrcfs %v18, %v3, %v20, %v5, 8 ++ vstrczf %v18, %v3, %v20, %v5, 4 ++ vstrczfs %v18, %v3, %v20, %v5, 8 ++ vstrczfs %v18, %v3, %v20, %v5, 15 ++ ++#CHECK: vstrch %v0, %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x01,0x00,0x00,0x8a] ++#CHECK: vstrch %v0, %v0, %v0, %v0, 0 # encoding: [0xe7,0x00,0x01,0x00,0x00,0x8a] ++#CHECK: vstrch %v0, %v0, %v0, %v0, 12 # encoding: [0xe7,0x00,0x01,0xc0,0x00,0x8a] ++#CHECK: vstrch %v0, %v0, %v0, %v15, 0 # encoding: [0xe7,0x00,0x01,0x00,0xf0,0x8a] ++#CHECK: vstrch %v0, %v0, %v0, %v31, 0 # encoding: [0xe7,0x00,0x01,0x00,0xf1,0x8a] ++#CHECK: vstrch %v0, %v0, %v15, %v0, 0 # encoding: [0xe7,0x00,0xf1,0x00,0x00,0x8a] ++#CHECK: vstrch %v0, %v0, %v31, %v0, 0 # encoding: [0xe7,0x00,0xf1,0x00,0x02,0x8a] ++#CHECK: vstrch %v0, %v15, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x01,0x00,0x00,0x8a] ++#CHECK: vstrch %v0, %v31, %v0, %v0, 0 # encoding: [0xe7,0x0f,0x01,0x00,0x04,0x8a] ++#CHECK: vstrch %v15, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x01,0x00,0x00,0x8a] ++#CHECK: vstrch %v31, %v0, %v0, %v0, 0 # encoding: [0xe7,0xf0,0x01,0x00,0x08,0x8a] ++#CHECK: vstrch %v18, %v3, %v20, %v5, 4 # encoding: [0xe7,0x23,0x41,0x40,0x5a,0x8a] ++#CHECK: vstrch %v18, %v3, %v20, %v5, 15 # encoding: [0xe7,0x23,0x41,0xf0,0x5a,0x8a] ++#CHECK: vstrchs %v18, %v3, %v20, %v5, 8 # encoding: [0xe7,0x23,0x41,0x90,0x5a,0x8a] ++#CHECK: vstrczh %v18, %v3, %v20, %v5, 4 # encoding: [0xe7,0x23,0x41,0x60,0x5a,0x8a] ++#CHECK: vstrczhs %v18, %v3, %v20, %v5, 8 # encoding: [0xe7,0x23,0x41,0xb0,0x5a,0x8a] ++#CHECK: vstrczhs %v18, %v3, %v20, %v5, 15 # encoding: [0xe7,0x23,0x41,0xf0,0x5a,0x8a] ++ ++ vstrch %v0, %v0, %v0, %v0 ++ vstrch %v0, %v0, %v0, %v0, 0 ++ vstrch %v0, %v0, %v0, %v0, 12 ++ vstrch %v0, %v0, %v0, %v15 ++ vstrch %v0, %v0, %v0, %v31 ++ vstrch %v0, %v0, %v15, %v0 ++ vstrch %v0, %v0, %v31, %v0 ++ vstrch %v0, %v15, %v0, %v0 ++ vstrch %v0, %v31, %v0, %v0 ++ vstrch %v15, %v0, %v0, %v0 ++ vstrch %v31, %v0, %v0, %v0 ++ vstrch %v18, %v3, %v20, %v5, 4 ++ vstrch %v18, %v3, %v20, %v5, 15 ++ vstrchs %v18, %v3, %v20, %v5, 8 ++ vstrczh %v18, %v3, %v20, %v5, 4 ++ vstrczhs %v18, %v3, %v20, %v5, 8 ++ vstrczhs %v18, %v3, %v20, %v5, 15 ++ ++#CHECK: vsumgh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x65] ++#CHECK: vsumgh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x65] ++#CHECK: vsumgh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x65] ++#CHECK: vsumgh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x65] ++#CHECK: vsumgh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x65] ++ ++ vsumgh %v0, %v0, %v0 ++ vsumgh %v0, %v0, %v31 ++ vsumgh %v0, %v31, %v0 ++ vsumgh %v31, %v0, %v0 ++ vsumgh %v18, %v3, %v20 ++ ++#CHECK: vsumgf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x65] ++#CHECK: vsumgf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x65] ++#CHECK: vsumgf %v0, %v31, %v0 # encoding: 
[0xe7,0x0f,0x00,0x00,0x24,0x65] ++#CHECK: vsumgf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x65] ++#CHECK: vsumgf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x65] ++ ++ vsumgf %v0, %v0, %v0 ++ vsumgf %v0, %v0, %v31 ++ vsumgf %v0, %v31, %v0 ++ vsumgf %v31, %v0, %v0 ++ vsumgf %v18, %v3, %v20 ++ ++#CHECK: vsumqf %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0x67] ++#CHECK: vsumqf %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x22,0x67] ++#CHECK: vsumqf %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0x67] ++#CHECK: vsumqf %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0x67] ++#CHECK: vsumqf %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x2a,0x67] ++ ++ vsumqf %v0, %v0, %v0 ++ vsumqf %v0, %v0, %v31 ++ vsumqf %v0, %v31, %v0 ++ vsumqf %v31, %v0, %v0 ++ vsumqf %v18, %v3, %v20 ++ ++#CHECK: vsumqg %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0x67] ++#CHECK: vsumqg %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x32,0x67] ++#CHECK: vsumqg %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0x67] ++#CHECK: vsumqg %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0x67] ++#CHECK: vsumqg %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x3a,0x67] ++ ++ vsumqg %v0, %v0, %v0 ++ vsumqg %v0, %v0, %v31 ++ vsumqg %v0, %v31, %v0 ++ vsumqg %v31, %v0, %v0 ++ vsumqg %v18, %v3, %v20 ++ ++#CHECK: vsumb %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x64] ++#CHECK: vsumb %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x64] ++#CHECK: vsumb %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x64] ++#CHECK: vsumb %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x64] ++#CHECK: vsumb %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x64] ++ ++ vsumb %v0, %v0, %v0 ++ vsumb %v0, %v0, %v31 ++ vsumb %v0, %v31, %v0 ++ vsumb %v31, %v0, %v0 ++ vsumb %v18, %v3, %v20 ++ ++#CHECK: vsumh %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0x64] ++#CHECK: vsumh %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x12,0x64] ++#CHECK: vsumh %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0x64] ++#CHECK: vsumh %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0x64] ++#CHECK: vsumh %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x1a,0x64] ++ ++ vsumh %v0, %v0, %v0 ++ vsumh %v0, %v0, %v31 ++ vsumh %v0, %v31, %v0 ++ vsumh %v31, %v0, %v0 ++ vsumh %v18, %v3, %v20 ++ ++#CHECK: vtm %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd8] ++#CHECK: vtm %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd8] ++#CHECK: vtm %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xd8] ++#CHECK: vtm %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xd8] ++#CHECK: vtm %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xd8] ++#CHECK: vtm %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0xd8] ++ ++ vtm %v0, %v0 ++ vtm %v0, %v15 ++ vtm %v0, %v31 ++ vtm %v15, %v0 ++ vtm %v31, %v0 ++ vtm %v14, %v17 ++ ++#CHECK: vuphb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd7] ++#CHECK: vuphb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd7] ++#CHECK: vuphb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xd7] ++#CHECK: vuphb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xd7] ++#CHECK: vuphb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xd7] ++#CHECK: vuphb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0xd7] ++ ++ vuphb %v0, %v0 ++ vuphb %v0, %v15 ++ vuphb %v0, %v31 ++ vuphb %v15, %v0 ++ vuphb %v31, %v0 ++ vuphb %v14, %v17 ++ ++#CHECK: vuphf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xd7] ++#CHECK: vuphf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xd7] ++#CHECK: vuphf 
%v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xd7] ++#CHECK: vuphf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xd7] ++#CHECK: vuphf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xd7] ++#CHECK: vuphf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xd7] ++ ++ vuphf %v0, %v0 ++ vuphf %v0, %v15 ++ vuphf %v0, %v31 ++ vuphf %v15, %v0 ++ vuphf %v31, %v0 ++ vuphf %v14, %v17 ++ ++#CHECK: vuphh %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xd7] ++#CHECK: vuphh %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0xd7] ++#CHECK: vuphh %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xd7] ++#CHECK: vuphh %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0xd7] ++#CHECK: vuphh %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xd7] ++#CHECK: vuphh %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0xd7] ++ ++ vuphh %v0, %v0 ++ vuphh %v0, %v15 ++ vuphh %v0, %v31 ++ vuphh %v15, %v0 ++ vuphh %v31, %v0 ++ vuphh %v14, %v17 ++ ++#CHECK: vuplhb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd5] ++#CHECK: vuplhb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd5] ++#CHECK: vuplhb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xd5] ++#CHECK: vuplhb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xd5] ++#CHECK: vuplhb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xd5] ++#CHECK: vuplhb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0xd5] ++ ++ vuplhb %v0, %v0 ++ vuplhb %v0, %v15 ++ vuplhb %v0, %v31 ++ vuplhb %v15, %v0 ++ vuplhb %v31, %v0 ++ vuplhb %v14, %v17 ++ ++#CHECK: vuplhf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xd5] ++#CHECK: vuplhf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xd5] ++#CHECK: vuplhf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xd5] ++#CHECK: vuplhf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xd5] ++#CHECK: vuplhf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xd5] ++#CHECK: vuplhf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xd5] ++ ++ vuplhf %v0, %v0 ++ vuplhf %v0, %v15 ++ vuplhf %v0, %v31 ++ vuplhf %v15, %v0 ++ vuplhf %v31, %v0 ++ vuplhf %v14, %v17 ++ ++#CHECK: vuplhh %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xd5] ++#CHECK: vuplhh %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0xd5] ++#CHECK: vuplhh %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xd5] ++#CHECK: vuplhh %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0xd5] ++#CHECK: vuplhh %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xd5] ++#CHECK: vuplhh %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0xd5] ++ ++ vuplhh %v0, %v0 ++ vuplhh %v0, %v15 ++ vuplhh %v0, %v31 ++ vuplhh %v15, %v0 ++ vuplhh %v31, %v0 ++ vuplhh %v14, %v17 ++ ++#CHECK: vuplb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd6] ++#CHECK: vuplb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd6] ++#CHECK: vuplb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xd6] ++#CHECK: vuplb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xd6] ++#CHECK: vuplb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xd6] ++#CHECK: vuplb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0xd6] ++ ++ vuplb %v0, %v0 ++ vuplb %v0, %v15 ++ vuplb %v0, %v31 ++ vuplb %v15, %v0 ++ vuplb %v31, %v0 ++ vuplb %v14, %v17 ++ ++#CHECK: vuplf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xd6] ++#CHECK: vuplf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xd6] ++#CHECK: vuplf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xd6] ++#CHECK: vuplf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xd6] ++#CHECK: vuplf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xd6] ++#CHECK: vuplf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xd6] ++ ++ vuplf 
%v0, %v0 ++ vuplf %v0, %v15 ++ vuplf %v0, %v31 ++ vuplf %v15, %v0 ++ vuplf %v31, %v0 ++ vuplf %v14, %v17 ++ ++#CHECK: vuplhw %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xd6] ++#CHECK: vuplhw %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0xd6] ++#CHECK: vuplhw %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xd6] ++#CHECK: vuplhw %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0xd6] ++#CHECK: vuplhw %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xd6] ++#CHECK: vuplhw %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0xd6] ++ ++ vuplhw %v0, %v0 ++ vuplhw %v0, %v15 ++ vuplhw %v0, %v31 ++ vuplhw %v15, %v0 ++ vuplhw %v31, %v0 ++ vuplhw %v14, %v17 ++ ++#CHECK: vupllb %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0xd4] ++#CHECK: vupllb %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x00,0xd4] ++#CHECK: vupllb %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0xd4] ++#CHECK: vupllb %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0xd4] ++#CHECK: vupllb %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0xd4] ++#CHECK: vupllb %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x04,0xd4] ++ ++ vupllb %v0, %v0 ++ vupllb %v0, %v15 ++ vupllb %v0, %v31 ++ vupllb %v15, %v0 ++ vupllb %v31, %v0 ++ vupllb %v14, %v17 ++ ++#CHECK: vupllf %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x20,0xd4] ++#CHECK: vupllf %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x20,0xd4] ++#CHECK: vupllf %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x24,0xd4] ++#CHECK: vupllf %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x20,0xd4] ++#CHECK: vupllf %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x28,0xd4] ++#CHECK: vupllf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xd4] ++ ++ vupllf %v0, %v0 ++ vupllf %v0, %v15 ++ vupllf %v0, %v31 ++ vupllf %v15, %v0 ++ vupllf %v31, %v0 ++ vupllf %v14, %v17 ++ ++#CHECK: vupllh %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x10,0xd4] ++#CHECK: vupllh %v0, %v15 # encoding: [0xe7,0x0f,0x00,0x00,0x10,0xd4] ++#CHECK: vupllh %v0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x14,0xd4] ++#CHECK: vupllh %v15, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x10,0xd4] ++#CHECK: vupllh %v31, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x18,0xd4] ++#CHECK: vupllh %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x14,0xd4] ++ ++ vupllh %v0, %v0 ++ vupllh %v0, %v15 ++ vupllh %v0, %v31 ++ vupllh %v15, %v0 ++ vupllh %v31, %v0 ++ vupllh %v14, %v17 ++ ++#CHECK: vx %v0, %v0, %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x6d] ++#CHECK: vx %v0, %v0, %v31 # encoding: [0xe7,0x00,0xf0,0x00,0x02,0x6d] ++#CHECK: vx %v0, %v31, %v0 # encoding: [0xe7,0x0f,0x00,0x00,0x04,0x6d] ++#CHECK: vx %v31, %v0, %v0 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x6d] ++#CHECK: vx %v18, %v3, %v20 # encoding: [0xe7,0x23,0x40,0x00,0x0a,0x6d] ++ ++ vx %v0, %v0, %v0 ++ vx %v0, %v0, %v31 ++ vx %v0, %v31, %v0 ++ vx %v31, %v0, %v0 ++ vx %v18, %v3, %v20 ++ ++#CHECK: vzero %v0 # encoding: [0xe7,0x00,0x00,0x00,0x00,0x44] ++#CHECK: vzero %v11 # encoding: [0xe7,0xb0,0x00,0x00,0x00,0x44] ++#CHECK: vzero %v15 # encoding: [0xe7,0xf0,0x00,0x00,0x00,0x44] ++#CHECK: vzero %v31 # encoding: [0xe7,0xf0,0x00,0x00,0x08,0x44] ++ ++ vzero %v0 ++ vzero %v11 ++ vzero %v15 ++ vzero %v31 ++ ++#CHECK: wcdgb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc3] ++#CHECK: wcdgb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc3] ++#CHECK: wcdgb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc3] ++#CHECK: wcdgb %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc3] ++#CHECK: wcdgb %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc3] ++#CHECK: wcdgb %v31, %f0, 0, 0 # encoding: 
[0xe7,0xf0,0x00,0x08,0x38,0xc3] ++#CHECK: wcdgb %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x34,0xc3] ++ ++ wcdgb %v0, %v0, 0, 0 ++ wcdgb %v0, %v0, 0, 15 ++ wcdgb %v0, %v0, 4, 0 ++ wcdgb %v0, %v0, 12, 0 ++ wcdgb %v0, %v31, 0, 0 ++ wcdgb %v31, %v0, 0, 0 ++ wcdgb %v14, %v17, 4, 10 ++ ++#CHECK: wcdlgb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc1] ++#CHECK: wcdlgb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc1] ++#CHECK: wcdlgb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc1] ++#CHECK: wcdlgb %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc1] ++#CHECK: wcdlgb %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc1] ++#CHECK: wcdlgb %v31, %f0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xc1] ++#CHECK: wcdlgb %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x34,0xc1] ++ ++ wcdlgb %v0, %v0, 0, 0 ++ wcdlgb %v0, %v0, 0, 15 ++ wcdlgb %v0, %v0, 4, 0 ++ wcdlgb %v0, %v0, 12, 0 ++ wcdlgb %v0, %v31, 0, 0 ++ wcdlgb %v31, %v0, 0, 0 ++ wcdlgb %v14, %v17, 4, 10 ++ ++#CHECK: wcgdb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc2] ++#CHECK: wcgdb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc2] ++#CHECK: wcgdb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc2] ++#CHECK: wcgdb %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc2] ++#CHECK: wcgdb %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc2] ++#CHECK: wcgdb %v31, %f0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xc2] ++#CHECK: wcgdb %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x34,0xc2] ++ ++ wcgdb %v0, %v0, 0, 0 ++ wcgdb %v0, %v0, 0, 15 ++ wcgdb %v0, %v0, 4, 0 ++ wcgdb %v0, %v0, 12, 0 ++ wcgdb %v0, %v31, 0, 0 ++ wcgdb %v31, %v0, 0, 0 ++ wcgdb %v14, %v17, 4, 10 ++ ++#CHECK: wclgdb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc0] ++#CHECK: wclgdb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc0] ++#CHECK: wclgdb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc0] ++#CHECK: wclgdb %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc0] ++#CHECK: wclgdb %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc0] ++#CHECK: wclgdb %v31, %f0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xc0] ++#CHECK: wclgdb %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x34,0xc0] ++ ++ wclgdb %v0, %v0, 0, 0 ++ wclgdb %v0, %v0, 0, 15 ++ wclgdb %v0, %v0, 4, 0 ++ wclgdb %v0, %v0, 12, 0 ++ wclgdb %v0, %v31, 0, 0 ++ wclgdb %v31, %v0, 0, 0 ++ wclgdb %v14, %v17, 4, 10 ++ ++#CHECK: wfadb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xe3] ++#CHECK: wfadb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xe3] ++#CHECK: wfadb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xe3] ++#CHECK: wfadb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xe3] ++#CHECK: wfadb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x3a,0xe3] ++ ++ wfadb %v0, %v0, %v0 ++ wfadb %v0, %v0, %v31 ++ wfadb %v0, %v31, %v0 ++ wfadb %v31, %v0, %v0 ++ wfadb %v18, %v3, %v20 ++ ++#CHECK: wfcdb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xcb] ++#CHECK: wfcdb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0xcb] ++#CHECK: wfcdb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xcb] ++#CHECK: wfcdb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0xcb] ++#CHECK: wfcdb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xcb] ++#CHECK: wfcdb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0xcb] ++ ++ wfcdb %v0, %v0 ++ wfcdb %v0, %v15 ++ wfcdb %v0, %v31 ++ wfcdb %v15, %v0 ++ wfcdb %v31, %v0 ++ wfcdb %v14, %v17 ++ ++#CHECK: wfcedb %f0, %f0, %f0 # encoding: 
++#CHECK: wfcedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xe8]
++#CHECK: wfcedb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xe8]
++#CHECK: wfcedb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xe8]
++#CHECK: wfcedb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xe8]
++#CHECK: wfcedb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x3a,0xe8]
++
++	wfcedb %v0, %v0, %v0
++	wfcedb %v0, %v0, %v31
++	wfcedb %v0, %v31, %v0
++	wfcedb %v31, %v0, %v0
++	wfcedb %v18, %v3, %v20
++
++#CHECK: wfcedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x30,0xe8]
++#CHECK: wfcedbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x32,0xe8]
++#CHECK: wfcedbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x34,0xe8]
++#CHECK: wfcedbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x38,0xe8]
++#CHECK: wfcedbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x3a,0xe8]
++
++	wfcedbs %v0, %v0, %v0
++	wfcedbs %v0, %v0, %v31
++	wfcedbs %v0, %v31, %v0
++	wfcedbs %v31, %v0, %v0
++	wfcedbs %v18, %v3, %v20
++
++#CHECK: wfchdb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xeb]
++#CHECK: wfchdb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xeb]
++#CHECK: wfchdb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xeb]
++#CHECK: wfchdb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xeb]
++#CHECK: wfchdb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x3a,0xeb]
++
++	wfchdb %v0, %v0, %v0
++	wfchdb %v0, %v0, %v31
++	wfchdb %v0, %v31, %v0
++	wfchdb %v31, %v0, %v0
++	wfchdb %v18, %v3, %v20
++
++#CHECK: wfchdbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x30,0xeb]
++#CHECK: wfchdbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x32,0xeb]
++#CHECK: wfchdbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x34,0xeb]
++#CHECK: wfchdbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x38,0xeb]
++#CHECK: wfchdbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x3a,0xeb]
++
++	wfchdbs %v0, %v0, %v0
++	wfchdbs %v0, %v0, %v31
++	wfchdbs %v0, %v31, %v0
++	wfchdbs %v31, %v0, %v0
++	wfchdbs %v18, %v3, %v20
++
++#CHECK: wfchedb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xea]
++#CHECK: wfchedb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xea]
++#CHECK: wfchedb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xea]
++#CHECK: wfchedb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xea]
++#CHECK: wfchedb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x3a,0xea]
++
++	wfchedb %v0, %v0, %v0
++	wfchedb %v0, %v0, %v31
++	wfchedb %v0, %v31, %v0
++	wfchedb %v31, %v0, %v0
++	wfchedb %v18, %v3, %v20
++
++#CHECK: wfchedbs %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x30,0xea]
++#CHECK: wfchedbs %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x18,0x32,0xea]
++#CHECK: wfchedbs %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x18,0x34,0xea]
++#CHECK: wfchedbs %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x38,0xea]
++#CHECK: wfchedbs %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x18,0x3a,0xea]
++
++	wfchedbs %v0, %v0, %v0
++	wfchedbs %v0, %v0, %v31
++	wfchedbs %v0, %v31, %v0
++	wfchedbs %v31, %v0, %v0
++	wfchedbs %v18, %v3, %v20
++
++#CHECK: wfddb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xe5]
++#CHECK: wfddb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xe5]
++#CHECK: wfddb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xe5]
++#CHECK: wfddb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xe5]
++#CHECK: wfddb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x3a,0xe5]
++
++	wfddb %v0, %v0, %v0
++	wfddb %v0, %v0, %v31
++	wfddb %v0, %v31, %v0
++	wfddb %v31, %v0, %v0
++	wfddb %v18, %v3, %v20
++
++#CHECK: wfidb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc7]
++#CHECK: wfidb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc7]
++#CHECK: wfidb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc7]
++#CHECK: wfidb %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc7]
++#CHECK: wfidb %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc7]
++#CHECK: wfidb %v31, %f0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xc7]
++#CHECK: wfidb %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x34,0xc7]
++
++	wfidb %v0, %v0, 0, 0
++	wfidb %v0, %v0, 0, 15
++	wfidb %v0, %v0, 4, 0
++	wfidb %v0, %v0, 12, 0
++	wfidb %v0, %v31, 0, 0
++	wfidb %v31, %v0, 0, 0
++	wfidb %v14, %v17, 4, 10
++
++#CHECK: wfkdb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x00,0x30,0xca]
++#CHECK: wfkdb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x00,0x30,0xca]
++#CHECK: wfkdb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x00,0x34,0xca]
++#CHECK: wfkdb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x30,0xca]
++#CHECK: wfkdb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x00,0x38,0xca]
++#CHECK: wfkdb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x34,0xca]
++
++	wfkdb %v0, %v0
++	wfkdb %v0, %v15
++	wfkdb %v0, %v31
++	wfkdb %v15, %v0
++	wfkdb %v31, %v0
++	wfkdb %v14, %v17
++
++#CHECK: wflcdb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xcc]
++#CHECK: wflcdb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x30,0xcc]
++#CHECK: wflcdb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xcc]
++#CHECK: wflcdb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x30,0xcc]
++#CHECK: wflcdb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xcc]
++#CHECK: wflcdb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x34,0xcc]
++
++	wflcdb %v0, %v0
++	wflcdb %v0, %v15
++	wflcdb %v0, %v31
++	wflcdb %v15, %v0
++	wflcdb %v31, %v0
++	wflcdb %v14, %v17
++
++#CHECK: wflndb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x18,0x30,0xcc]
++#CHECK: wflndb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x18,0x30,0xcc]
++#CHECK: wflndb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x18,0x34,0xcc]
++#CHECK: wflndb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x30,0xcc]
++#CHECK: wflndb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x18,0x38,0xcc]
++#CHECK: wflndb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x18,0x34,0xcc]
++
++	wflndb %v0, %v0
++	wflndb %v0, %v15
++	wflndb %v0, %v31
++	wflndb %v15, %v0
++	wflndb %v31, %v0
++	wflndb %v14, %v17
++
++#CHECK: wflpdb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x28,0x30,0xcc]
++#CHECK: wflpdb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x28,0x30,0xcc]
++#CHECK: wflpdb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x28,0x34,0xcc]
++#CHECK: wflpdb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x28,0x30,0xcc]
++#CHECK: wflpdb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x28,0x38,0xcc]
++#CHECK: wflpdb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x28,0x34,0xcc]
++
++	wflpdb %v0, %v0
++	wflpdb %v0, %v15
++	wflpdb %v0, %v31
++	wflpdb %v15, %v0
++	wflpdb %v31, %v0
++	wflpdb %v14, %v17
++
++#CHECK: wfmadb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x8f]
++#CHECK: wfmadb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x03,0x08,0xf1,0x8f]
++#CHECK: wfmadb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf3,0x08,0x02,0x8f]
++#CHECK: wfmadb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x03,0x08,0x04,0x8f]
++#CHECK: wfmadb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x03,0x08,0x08,0x8f]
++#CHECK: wfmadb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x08,0x97,0x8f]
++
++	wfmadb %v0, %v0, %v0, %v0
++	wfmadb %v0, %v0, %v0, %v31
++	wfmadb %v0, %v0, %v31, %v0
++	wfmadb %v0, %v31, %v0, %v0
++	wfmadb %v31, %v0, %v0, %v0
++	wfmadb %v13, %v17, %v21, %v25
++
++#CHECK: wfmdb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xe7]
++#CHECK: wfmdb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xe7]
++#CHECK: wfmdb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xe7]
++#CHECK: wfmdb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xe7]
++#CHECK: wfmdb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x3a,0xe7]
++
++	wfmdb %v0, %v0, %v0
++	wfmdb %v0, %v0, %v31
++	wfmdb %v0, %v31, %v0
++	wfmdb %v31, %v0, %v0
++	wfmdb %v18, %v3, %v20
++
++#CHECK: wfmsdb %f0, %f0, %f0, %f0 # encoding: [0xe7,0x00,0x03,0x08,0x00,0x8e]
++#CHECK: wfmsdb %f0, %f0, %f0, %v31 # encoding: [0xe7,0x00,0x03,0x08,0xf1,0x8e]
++#CHECK: wfmsdb %f0, %f0, %v31, %f0 # encoding: [0xe7,0x00,0xf3,0x08,0x02,0x8e]
++#CHECK: wfmsdb %f0, %v31, %f0, %f0 # encoding: [0xe7,0x0f,0x03,0x08,0x04,0x8e]
++#CHECK: wfmsdb %v31, %f0, %f0, %f0 # encoding: [0xe7,0xf0,0x03,0x08,0x08,0x8e]
++#CHECK: wfmsdb %f13, %v17, %v21, %v25 # encoding: [0xe7,0xd1,0x53,0x08,0x97,0x8e]
++
++	wfmsdb %v0, %v0, %v0, %v0
++	wfmsdb %v0, %v0, %v0, %v31
++	wfmsdb %v0, %v0, %v31, %v0
++	wfmsdb %v0, %v31, %v0, %v0
++	wfmsdb %v31, %v0, %v0, %v0
++	wfmsdb %v13, %v17, %v21, %v25
++
++#CHECK: wfsdb %f0, %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xe2]
++#CHECK: wfsdb %f0, %f0, %v31 # encoding: [0xe7,0x00,0xf0,0x08,0x32,0xe2]
++#CHECK: wfsdb %f0, %v31, %f0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xe2]
++#CHECK: wfsdb %v31, %f0, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xe2]
++#CHECK: wfsdb %v18, %f3, %v20 # encoding: [0xe7,0x23,0x40,0x08,0x3a,0xe2]
++
++	wfsdb %v0, %v0, %v0
++	wfsdb %v0, %v0, %v31
++	wfsdb %v0, %v31, %v0
++	wfsdb %v31, %v0, %v0
++	wfsdb %v18, %v3, %v20
++
++#CHECK: wfsqdb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xce]
++#CHECK: wfsqdb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x30,0xce]
++#CHECK: wfsqdb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xce]
++#CHECK: wfsqdb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x30,0xce]
++#CHECK: wfsqdb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xce]
++#CHECK: wfsqdb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x34,0xce]
++
++	wfsqdb %v0, %v0
++	wfsqdb %v0, %v15
++	wfsqdb %v0, %v31
++	wfsqdb %v15, %v0
++	wfsqdb %v31, %v0
++	wfsqdb %v14, %v17
++
++#CHECK: wftcidb %f0, %f0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0x4a]
++#CHECK: wftcidb %f0, %f0, 4095 # encoding: [0xe7,0x00,0xff,0xf8,0x30,0x4a]
++#CHECK: wftcidb %f0, %f15, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x30,0x4a]
++#CHECK: wftcidb %f0, %v31, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0x4a]
++#CHECK: wftcidb %f15, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x30,0x4a]
++#CHECK: wftcidb %v31, %f0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0x4a]
++#CHECK: wftcidb %f4, %v21, 1656 # encoding: [0xe7,0x45,0x67,0x88,0x34,0x4a]
++
++	wftcidb %v0, %v0, 0
++	wftcidb %v0, %v0, 4095
++	wftcidb %v0, %v15, 0
++	wftcidb %v0, %v31, 0
++	wftcidb %v15, %v0, 0
++	wftcidb %v31, %v0, 0
++	wftcidb %v4, %v21, 0x678
++
++#CHECK: wldeb %f0, %f0 # encoding: [0xe7,0x00,0x00,0x08,0x20,0xc4]
++#CHECK: wldeb %f0, %f15 # encoding: [0xe7,0x0f,0x00,0x08,0x20,0xc4]
++#CHECK: wldeb %f0, %v31 # encoding: [0xe7,0x0f,0x00,0x08,0x24,0xc4]
++#CHECK: wldeb %f15, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x20,0xc4]
++#CHECK: wldeb %v31, %f0 # encoding: [0xe7,0xf0,0x00,0x08,0x28,0xc4]
++#CHECK: wldeb %f14, %v17 # encoding: [0xe7,0xe1,0x00,0x08,0x24,0xc4]
++
++	wldeb %v0, %v0
++	wldeb %v0, %v15
++	wldeb %v0, %v31
++	wldeb %v15, %v0
++	wldeb %v31, %v0
++	wldeb %v14, %v17
++
++#CHECK: wledb %f0, %f0, 0, 0 # encoding: [0xe7,0x00,0x00,0x08,0x30,0xc5]
++#CHECK: wledb %f0, %f0, 0, 15 # encoding: [0xe7,0x00,0x00,0xf8,0x30,0xc5]
++#CHECK: wledb %f0, %f0, 4, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc5]
++#CHECK: wledb %f0, %f0, 12, 0 # encoding: [0xe7,0x00,0x00,0x0c,0x30,0xc5]
++#CHECK: wledb %f0, %v31, 0, 0 # encoding: [0xe7,0x0f,0x00,0x08,0x34,0xc5]
++#CHECK: wledb %v31, %f0, 0, 0 # encoding: [0xe7,0xf0,0x00,0x08,0x38,0xc5]
++#CHECK: wledb %f14, %v17, 4, 10 # encoding: [0xe7,0xe1,0x00,0xac,0x34,0xc5]
++
++	wledb %v0, %v0, 0, 0
++	wledb %v0, %v0, 0, 15
++	wledb %v0, %v0, 4, 0
++	wledb %v0, %v0, 12, 0
++	wledb %v0, %v31, 0, 0
++	wledb %v31, %v0, 0, 0
++	wledb %v14, %v17, 4, 10
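++# In the encodings above, byte 1 carries the low four bits of each vector
++# register number and the low nibble of byte 4 is the RXB field supplying
++# the fifth bit: a register in %v16-%v31 contributes 0x08 as the first
++# operand, 0x04 as the second, 0x02 as the third, and 0x01 as the fourth
++# (field names per the z/Architecture Principles of Operation).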
+Index: llvm-36/test/MC/SystemZ/insn-good-z196.s
+===================================================================
+--- llvm-36.orig/test/MC/SystemZ/insn-good-z196.s
++++ llvm-36/test/MC/SystemZ/insn-good-z196.s
+@@ -1021,6 +1021,16 @@
+ 	ork %r15,%r0,%r0
+ 	ork %r7,%r8,%r9
+
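++# popcnt places, in each byte of the first operand, the count of one bits
++# in the corresponding byte of the second operand.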
++#CHECK: popcnt %r0, %r0 # encoding: [0xb9,0xe1,0x00,0x00]
++#CHECK: popcnt %r0, %r15 # encoding: [0xb9,0xe1,0x00,0x0f]
++#CHECK: popcnt %r15, %r0 # encoding: [0xb9,0xe1,0x00,0xf0]
++#CHECK: popcnt %r7, %r8 # encoding: [0xb9,0xe1,0x00,0x78]
++
++	popcnt %r0,%r0
++	popcnt %r0,%r15
++	popcnt %r15,%r0
++	popcnt %r7,%r8
++
+ #CHECK: risbhg %r0, %r0, 0, 0, 0 # encoding: [0xec,0x00,0x00,0x00,0x00,0x5d]
+ #CHECK: risbhg %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x5d]
+ #CHECK: risbhg %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x5d]
+Index: llvm-36/test/MC/SystemZ/insn-good-zEC12.s
+===================================================================
+--- /dev/null
++++ llvm-36/test/MC/SystemZ/insn-good-zEC12.s
+@@ -0,0 +1,126 @@
++# For zEC12 and above.
++# RUN: llvm-mc -triple s390x-linux-gnu -mcpu=zEC12 -show-encoding %s | FileCheck %s
++
++#CHECK: etnd %r0 # encoding: [0xb2,0xec,0x00,0x00]
++#CHECK: etnd %r15 # encoding: [0xb2,0xec,0x00,0xf0]
++#CHECK: etnd %r7 # encoding: [0xb2,0xec,0x00,0x70]
++
++	etnd %r0
++	etnd %r15
++	etnd %r7
++
++#CHECK: ntstg %r0, -524288 # encoding: [0xe3,0x00,0x00,0x00,0x80,0x25]
++#CHECK: ntstg %r0, -1 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x25]
++#CHECK: ntstg %r0, 0 # encoding: [0xe3,0x00,0x00,0x00,0x00,0x25]
++#CHECK: ntstg %r0, 1 # encoding: [0xe3,0x00,0x00,0x01,0x00,0x25]
++#CHECK: ntstg %r0, 524287 # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x25]
++#CHECK: ntstg %r0, 0(%r1) # encoding: [0xe3,0x00,0x10,0x00,0x00,0x25]
++#CHECK: ntstg %r0, 0(%r15) # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x25]
++#CHECK: ntstg %r0, 524287(%r1,%r15) # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x25]
++#CHECK: ntstg %r0, 524287(%r15,%r1) # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x25]
++#CHECK: ntstg %r15, 0 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x25]
++
++	ntstg %r0, -524288
++	ntstg %r0, -1
++	ntstg %r0, 0
++	ntstg %r0, 1
++	ntstg %r0, 524287
++	ntstg %r0, 0(%r1)
++	ntstg %r0, 0(%r15)
++	ntstg %r0, 524287(%r1,%r15)
++	ntstg %r0, 524287(%r15,%r1)
++	ntstg %r15, 0
++
++#CHECK: ppa %r0, %r0, 0 # encoding: [0xb2,0xe8,0x00,0x00]
++#CHECK: ppa %r0, %r0, 15 # encoding: [0xb2,0xe8,0xf0,0x00]
++#CHECK: ppa %r0, %r15, 0 # encoding: [0xb2,0xe8,0x00,0x0f]
++#CHECK: ppa %r4, %r6, 7 # encoding: [0xb2,0xe8,0x70,0x46]
++#CHECK: ppa %r15, %r0, 0 # encoding: [0xb2,0xe8,0x00,0xf0]
++
++	ppa %r0, %r0, 0
++	ppa %r0, %r0, 15
++	ppa %r0, %r15, 0
++	ppa %r4, %r6, 7
++	ppa %r15, %r0, 0
++
++#CHECK: risbgn %r0, %r0, 0, 0, 0 # encoding: [0xec,0x00,0x00,0x00,0x00,0x59]
++#CHECK: risbgn %r0, %r0, 0, 0, 63 # encoding: [0xec,0x00,0x00,0x00,0x3f,0x59]
++#CHECK: risbgn %r0, %r0, 0, 255, 0 # encoding: [0xec,0x00,0x00,0xff,0x00,0x59]
++#CHECK: risbgn %r0, %r0, 255, 0, 0 # encoding: [0xec,0x00,0xff,0x00,0x00,0x59]
++#CHECK: risbgn %r0, %r15, 0, 0, 0 # encoding: [0xec,0x0f,0x00,0x00,0x00,0x59]
++#CHECK: risbgn %r15, %r0, 0, 0, 0 # encoding: [0xec,0xf0,0x00,0x00,0x00,0x59]
++#CHECK: risbgn %r4, %r5, 6, 7, 8 # encoding: [0xec,0x45,0x06,0x07,0x08,0x59]
++
++	risbgn %r0,%r0,0,0,0
++	risbgn %r0,%r0,0,0,63
++	risbgn %r0,%r0,0,255,0
++	risbgn %r0,%r0,255,0,0
++	risbgn %r0,%r15,0,0,0
++	risbgn %r15,%r0,0,0,0
++	risbgn %r4,%r5,6,7,8
++
++#CHECK: tabort 0 # encoding: [0xb2,0xfc,0x00,0x00]
++#CHECK: tabort 0(%r1) # encoding: [0xb2,0xfc,0x10,0x00]
++#CHECK: tabort 0(%r15) # encoding: [0xb2,0xfc,0xf0,0x00]
++#CHECK: tabort 4095 # encoding: [0xb2,0xfc,0x0f,0xff]
++#CHECK: tabort 4095(%r1) # encoding: [0xb2,0xfc,0x1f,0xff]
++#CHECK: tabort 4095(%r15) # encoding: [0xb2,0xfc,0xff,0xff]
++
++	tabort 0
++	tabort 0(%r1)
++	tabort 0(%r15)
++	tabort 4095
++	tabort 4095(%r1)
++	tabort 4095(%r15)
++
++#CHECK: tbegin 0, 0 # encoding: [0xe5,0x60,0x00,0x00,0x00,0x00]
++#CHECK: tbegin 4095, 0 # encoding: [0xe5,0x60,0x0f,0xff,0x00,0x00]
++#CHECK: tbegin 0, 0 # encoding: [0xe5,0x60,0x00,0x00,0x00,0x00]
++#CHECK: tbegin 0, 1 # encoding: [0xe5,0x60,0x00,0x00,0x00,0x01]
++#CHECK: tbegin 0, 32767 # encoding: [0xe5,0x60,0x00,0x00,0x7f,0xff]
++#CHECK: tbegin 0, 32768 # encoding: [0xe5,0x60,0x00,0x00,0x80,0x00]
++#CHECK: tbegin 0, 65535 # encoding: [0xe5,0x60,0x00,0x00,0xff,0xff]
++#CHECK: tbegin 0(%r1), 42 # encoding: [0xe5,0x60,0x10,0x00,0x00,0x2a]
++#CHECK: tbegin 0(%r15), 42 # encoding: [0xe5,0x60,0xf0,0x00,0x00,0x2a]
++#CHECK: tbegin 4095(%r1), 42 # encoding: [0xe5,0x60,0x1f,0xff,0x00,0x2a]
++#CHECK: tbegin 4095(%r15), 42 # encoding: [0xe5,0x60,0xff,0xff,0x00,0x2a]
++
++	tbegin 0, 0
++	tbegin 4095, 0
++	tbegin 0, 0
++	tbegin 0, 1
++	tbegin 0, 32767
++	tbegin 0, 32768
++	tbegin 0, 65535
++	tbegin 0(%r1), 42
++	tbegin 0(%r15), 42
++	tbegin 4095(%r1), 42
++	tbegin 4095(%r15), 42
++
++#CHECK: tbeginc 0, 0 # encoding: [0xe5,0x61,0x00,0x00,0x00,0x00]
++#CHECK: tbeginc 4095, 0 # encoding: [0xe5,0x61,0x0f,0xff,0x00,0x00]
++#CHECK: tbeginc 0, 0 # encoding: [0xe5,0x61,0x00,0x00,0x00,0x00]
++#CHECK: tbeginc 0, 1 # encoding: [0xe5,0x61,0x00,0x00,0x00,0x01]
++#CHECK: tbeginc 0, 32767 # encoding: [0xe5,0x61,0x00,0x00,0x7f,0xff]
++#CHECK: tbeginc 0, 32768 # encoding: [0xe5,0x61,0x00,0x00,0x80,0x00]
++#CHECK: tbeginc 0, 65535 # encoding: [0xe5,0x61,0x00,0x00,0xff,0xff]
++#CHECK: tbeginc 0(%r1), 42 # encoding: [0xe5,0x61,0x10,0x00,0x00,0x2a]
++#CHECK: tbeginc 0(%r15), 42 # encoding: [0xe5,0x61,0xf0,0x00,0x00,0x2a]
++#CHECK: tbeginc 4095(%r1), 42 # encoding: [0xe5,0x61,0x1f,0xff,0x00,0x2a]
++#CHECK: tbeginc 4095(%r15), 42 # encoding: [0xe5,0x61,0xff,0xff,0x00,0x2a]
++
++	tbeginc 0, 0
++	tbeginc 4095, 0
++	tbeginc 0, 0
++	tbeginc 0, 1
++	tbeginc 0, 32767
++	tbeginc 0, 32768
++	tbeginc 0, 65535
++	tbeginc 0(%r1), 42
++	tbeginc 0(%r15), 42
++	tbeginc 4095(%r1), 42
++	tbeginc 4095(%r15), 42
++
++#CHECK: tend # encoding: [0xb2,0xf8,0x00,0x00]
++
++	tend
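++# The instructions above come from the zEC12 transactional-execution and
++# processor-assist facilities: tbegin starts a nonconstrained transaction,
++# tbeginc a constrained one, tend ends the current transaction, tabort
++# forces an abort, etnd extracts the current nesting depth, ntstg stores a
++# doubleword nontransactionally, and ppa requests a processor assist.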
+Index: llvm-36/test/MC/SystemZ/tokens.s
+===================================================================
+--- llvm-36.orig/test/MC/SystemZ/tokens.s
++++ llvm-36/test/MC/SystemZ/tokens.s
+@@ -13,10 +13,16 @@
+ #CHECK: foo 100(200,%r0), 300
+ #CHECK: error: invalid instruction
+ #CHECK: foo 100(200,%r1), 300
+-#CHECK: error: invalid operand
++#CHECK: error: invalid address register
+ #CHECK: foo 100(%a0), 200
+ #CHECK: error: %r0 used in an address
+ #CHECK: foo 100(%r0), 200
++#CHECK: error: %r0 used in an address
++#CHECK: foo 100(%v1,%r0), 200
++#CHECK: error: invalid instruction
++#CHECK: foo 100(%v0,%r1), 200
++#CHECK: error: invalid instruction
++#CHECK: foo 100(%v31), 200
+ #CHECK: error: invalid operand
+ #CHECK: foo 100(%r1,%a0), 200
+ #CHECK: error: %r0 used in an address
+@@ -45,6 +51,12 @@
+ #CHECK: foo %a15, 200
+ #CHECK: error: invalid register
+ #CHECK: foo %a16, 200
++#CHECK: error: invalid instruction
++#CHECK: foo %v0, 200
++#CHECK: error: invalid instruction
++#CHECK: foo %v31, 200
++#CHECK: error: invalid register
++#CHECK: foo %v32, 200
+ #CHECK: error: invalid register
+ #CHECK: foo %c, 200
+ #CHECK: error: invalid register
+@@ -60,6 +72,9 @@
+ 	foo 100(200,%r1), 300
+ 	foo 100(%a0), 200
+ 	foo 100(%r0), 200
++	foo 100(%v1,%r0), 200
++	foo 100(%v0,%r1), 200
++	foo 100(%v31), 200
+ 	foo 100(%r1,%a0), 200
+ 	foo 100(%r1,%r0), 200
+ 	foo 100(%r1,%r2, 200
+@@ -74,6 +89,9 @@
+ 	foo %a0, 200
+ 	foo %a15, 200
+ 	foo %a16, 200
++	foo %v0, 200
++	foo %v31, 200
++	foo %v32, 200
+ 	foo %c, 200
+ 	foo %, 200
+ 	foo {, 200
diff --git a/SPECS/llvm.spec b/SPECS/llvm.spec
index 5fce613..450f2b8 100644
--- a/SPECS/llvm.spec
+++ b/SPECS/llvm.spec
@@ -6,18 +6,18 @@
 # consequently we build swrast on them instead of llvmpipe.
 ExcludeArch: ppc s390 %{?rhel6:s390x}
 
-#%global svndate 20131023
-#global prerel rc3
+#global svndate 20131023
+#global prerel rc4
 
 Name: mesa-private-llvm
-Version: 3.5.0
-Release: 1%{?dist}
+Version: 3.6.2
+Release: 2%{?prerel:.%prerel}%{?dist}
 Summary: llvm engine for Mesa
 
 Group: System Environment/Libraries
 License: NCSA
 URL: http://llvm.org/
-Source0: http://llvm.org/pre-releases/3.5/llvm-3.5.0.src.tar.xz
+Source0: http://llvm.org/releases/%{version}/%{?prerel}/llvm-%{version}%{?prerel}.src.tar.xz
 #Source0: llvm-%{svndate}.tar.xz
 Source1: make-llvm-snapshot.sh
 # multilib fixes
@@ -26,7 +26,15 @@ Source3: llvm-Config-llvm-config.h
 
 # Data files should be installed with timestamps preserved
 Patch0: llvm-2.6-timestamp.patch
-Patch1: llvm-3.5.0-build-fix.patch
+
+# llvm Z13 backports (#1182150)
+Patch1: llvm-z13-backports.patch
+Patch2: llvm-3.6-large-struct-return.patch
+
+# llvm aarch64 bug fix (#1254386)
+Patch10: 0001-AArch64-Fix-invalid-use-of-references-to-BuildMI.patch
+# add model detection for skylake and broadwell
+Patch11: llvm-3.6.2-nerf-skylake.patch
 
 BuildRequires: bison
 BuildRequires: chrpath
@@ -59,7 +67,10 @@ rm -r -f tools/clang
 
 # llvm patches
 %patch0 -p1 -b .timestamp
-%patch1 -p1 -b .build
+%patch1 -p1 -b .z13
+%patch2 -p1 -b .large-struct
+%patch10 -p1 -b .aarch64-fix
+%patch11 -p1 -b .skl-fix
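+# (each %patch -b above keeps pristine copies of the files it touches under
+# the given backup suffix, e.g. foo.cpp.z13)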
 
 # fix ld search path
 sed -i 's|/lib /usr/lib $lt_ld_extra|%{_libdir} $lt_ld_extra|' \
@@ -94,14 +105,6 @@ export CXX=g++
 --disable-libffi \
 --disable-terminfo \
 --disable-timestamps \
-%ifarch armv7hl armv7l
- --with-cpu=cortex-a8 \
- --with-tune=cortex-a8 \
- --with-arch=armv7-a \
- --with-float=hard \
- --with-fpu=vfpv3-d16 \
- --with-abi=aapcs-linux \
-%endif
 %{nil}
 
 # FIXME file this
@@ -123,9 +126,6 @@ make install DESTDIR=%{buildroot}
 
 # rename the few binaries we're keeping
 mv %{buildroot}%{_bindir}/llvm-config %{buildroot}%{_bindir}/%{name}-config-%{__isa_bits}
-# silly
-rm -f %{buildroot}%{_libdir}/llvm-3.5.0.so
-
 pushd %{buildroot}%{_includedir}/mesa-private/llvm/Config
 mv config.h config-%{__isa_bits}.h
 cp -p %{SOURCE2} config.h
@@ -156,7 +156,7 @@ rm -rf %{buildroot}%{_mandir}/man1
 
 # RHEL: Strip out some headers Mesa doesn't need
 rm -rf %{buildroot}%{_includedir}/mesa-private/llvm/{Analysis,Assembly}
-rm -rf %{buildroot}%{_includedir}/mesa-private/llvm/{DebugInfo,Object,Option}
+rm -rf %{buildroot}%{_includedir}/mesa-private/llvm/{DebugInfo,Option}
 rm -rf %{buildroot}%{_includedir}/mesa-private/llvm/TableGen
 
 # RHEL: Strip out cmake build foo
@@ -175,7 +175,7 @@ make check LIT_ARGS="-v -j4" | tee llvm-testlog-%{_arch}.txt
 %files
 %defattr(-,root,root,-)
 %doc LICENSE.TXT
-%{_libdir}/libLLVM-3.5-mesa.so
+%{_libdir}/libLLVM-3.6-mesa.so
 
 %files devel
 %defattr(-,root,root,-)
@@ -184,6 +184,30 @@ make check LIT_ARGS="-v -j4" | tee llvm-testlog-%{_arch}.txt
 %{_includedir}/mesa-private/llvm-c
 
 %changelog
+* Wed Oct 14 2015 Adam Jackson 3.6.2-2
+- Teach CPU detection about Skylake/Broadwell, treat them like Haswell
+
+* Mon Aug 24 2015 Dave Airlie 3.6.2-1
+- fix aarch64 bugs via 3.6.2 + patch
+
+* Tue Aug 18 2015 Adam Jackson 3.6.1-2
+- Fix large struct return on s390
+
+* Tue May 26 2015 Dave Airlie 3.6.1-1
+- rebase to llvm 3.6.1
+
+* Thu May 21 2015 Dave Airlie 3.6.0-3
+- backport llvm z13 support from IBM
+
+* Wed May 13 2015 Dave Airlie 3.6.0-2
+- mesa needs Object headers now.
+
+* Wed May 13 2015 Dave Airlie 3.6.0-1
+- llvm 3.6.0 final
+
+* Mon Feb 23 2015 Adam Jackson 3.6.0-0.1
+- llvm 3.6.0 rc4
+
+* Tue Sep 09 2014 Dave Airlie 3.5.0-1
+- llvm 3.5.0 final
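
For cross-reading the vector CHECK lines in the z13 test hunk above, here is a
minimal Python sketch of the 6-byte VRR-a layout those encodings imply. The
field names (RXB, M3) follow the z/Architecture Principles of Operation, not
anything in this patch, and the vrr_a helper is illustrative only, not part of
LLVM or this package:

    # Sketch of the VRR-a byte layout implied by the CHECK encodings above.
    def vrr_a(op_hi, op_lo, v1, v2, m3=0):
        rxb = ((v1 >> 4) << 3) | ((v2 >> 4) << 2)   # fifth bit of V1 and V2
        return bytes([op_hi,                         # opcode, high byte
                      ((v1 & 15) << 4) | (v2 & 15),  # low four bits of V1/V2
                      0, 0,                          # fields unused here
                      (m3 << 4) | rxb,               # element size + RXB
                      op_lo])                        # opcode, low byte

    # Reproduces "vupllf %v14, %v17 # encoding: [0xe7,0xe1,0x00,0x00,0x24,0xd4]"
    assert vrr_a(0xe7, 0xd4, 14, 17, m3=2) == bytes([0xe7,0xe1,0x00,0x00,0x24,0xd4])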