Tree - rpms/valgrind - CentOS Git server

rpms / valgrind

Blame SOURCES/valgrind-3.17.0-ppc64-isa-3.1.patch

Blob History Raw

		b1cb33	`commit 3cc0232c46a5905b4a6c2fbd302b58bf5f90b3d5`
		b1cb33	`Author: Carl Love <cel@us.ibm.com>`
		b1cb33	`Date: Mon Jan 11 16:00:57 2021 -0600`
		b1cb33
		b1cb33	`PPC64: ISA 3.1 VSX PCV Generate Operations`
		b1cb33
		b1cb33	`xxgenpcvbm VSX Vector Generate PCV from Byte Mask`
		b1cb33	`xxgenpcvdm VSX Vector Generate PCV from Doubleword Mask`
		b1cb33	`xxgenpcvhm VSX Vector Generate PCV from Halfword Mask`
		b1cb33	`xxgenpcvwm VSX Vector Generate PCV from Word Mask`
		b1cb33
		b1cb33	`diff --git a/VEX/priv/guest_ppc_defs.h b/VEX/priv/guest_ppc_defs.h`
		b1cb33	`index deda4dfce..54ce923a9 100644`
		b1cb33	`--- a/VEX/priv/guest_ppc_defs.h`
		b1cb33	`+++ b/VEX/priv/guest_ppc_defs.h`
		b1cb33	`@@ -169,6 +169,23 @@ void write_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc,`
		b1cb33	`void get_ACC_entry (VexGuestPPC64State* gst, UInt offset, UInt acc,`
		b1cb33	`UInt reg, UInt *result);`
		b1cb33
		b1cb33	`+extern void vector_gen_pvc_byte_mask_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`+ ULong src_hi,`
		b1cb33	`+ ULong src_lo,`
		b1cb33	`+ UInt rtn_val, UInt IMM );`
		b1cb33	`+extern void vector_gen_pvc_hword_mask_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`+ ULong src_hi,`
		b1cb33	`+ ULong src_lo,`
		b1cb33	`+ UInt rtn_val, UInt IMM );`
		b1cb33	`+extern void vector_gen_pvc_word_mask_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`+ ULong src_hi,`
		b1cb33	`+ ULong src_lo,`
		b1cb33	`+ UInt rtn_val, UInt IMM );`
		b1cb33	`+extern void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`+ ULong src_hi,`
		b1cb33	`+ ULong src_lo,`
		b1cb33	`+ UInt rtn_val, UInt IMM );`
		b1cb33	`+`
		b1cb33	`/* 8-bit XO value from instruction description */`
		b1cb33	`#define XVI4GER8 0b00100011`
		b1cb33	`#define XVI4GER8PP 0b00100010`
		b1cb33	`diff --git a/VEX/priv/guest_ppc_helpers.c b/VEX/priv/guest_ppc_helpers.c`
		b1cb33	`index c24191ef3..75497abb9 100644`
		b1cb33	`--- a/VEX/priv/guest_ppc_helpers.c`
		b1cb33	`+++ b/VEX/priv/guest_ppc_helpers.c`
		b1cb33	`@@ -701,6 +701,738 @@ ULong vector_evaluate64_helper( ULong srcA, ULong srcB, ULong srcC,`
		b1cb33	`#undef MAX_IMM_BITS`
		b1cb33	`}`
		b1cb33
		b1cb33	`+/*--------------------------------------------------*/`
		b1cb33	`+/*---- VSX Vector Generate PCV from Mask helpers ---*/`
		b1cb33	`+/*--------------------------------------------------*/`
		b1cb33	`+static void write_VSX_entry (VexGuestPPC64State* gst, UInt reg_offset,`
		b1cb33	`+ ULong *vsx_entry)`
		b1cb33	`+{`
		b1cb33	`+ U128* pU128_dst;`
		b1cb33	`+ pU128_dst = (U128*) (((UChar*) gst) + reg_offset);`
		b1cb33	`+`
		b1cb33	`+ /* The U128 type is defined as an array of unsigned integers. */`
		b1cb33	`+ /* Writing in LE order */`
		b1cb33	`+ (*pU128_dst)[0] = (UInt)(vsx_entry[1] & 0xFFFFFFFF);`
		b1cb33	`+ (*pU128_dst)[1] = (UInt)(vsx_entry[1] >> 32);`
		b1cb33	`+ (*pU128_dst)[2] = (UInt)(vsx_entry[0] & 0xFFFFFFFF);`
		b1cb33	`+ (*pU128_dst)[3] = (UInt)(vsx_entry[0] >> 32);`
		b1cb33	`+ return;`
		b1cb33	`+}`
		b1cb33	`+`
		b1cb33	`+/* CALLED FROM GENERATED CODE */`
		b1cb33	`+void vector_gen_pvc_byte_mask_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`+ ULong src_hi, ULong src_lo,`
		b1cb33	`+ UInt reg_offset, UInt imm ) {`
		b1cb33	`+ /* The function computes the 128-bit result then writes it directly`
		b1cb33	`+ into the guest state VSX register. */`
		b1cb33	`+`
		b1cb33	`+ UInt i, shift_by, sel_shift_by, half_sel;`
		b1cb33	`+ ULong index, src, result[2];`
		b1cb33	`+ ULong j;`
		b1cb33	`+`
		b1cb33	`+ result[0] = 0;`
		b1cb33	`+ result[1] = 0;`
		b1cb33	`+ j = 0;`
		b1cb33	`+`
		b1cb33	`+ /* The algorithm in the ISA is written with IBM numbering zero on left and`
		b1cb33	`+ N-1 on right. The loop index is converted to "i" to match the algorithm`
		b1cb33	`+ for clarity of matching the C code to the algorithm in the ISA. */`
		b1cb33	`+`
		b1cb33	`+ if (imm == 0b00) { // big endian expansion`
		b1cb33	`+ for( index = 0; index < 16; index++) {`
		b1cb33	`+ i = 15 - index;`
		b1cb33	`+`
		b1cb33	`+ shift_by = i*8;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 8) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 7;`
		b1cb33	`+`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ result[half_sel] \|= j << shift_by;`
		b1cb33	`+ j++;`
		b1cb33	`+ } else {`
		b1cb33	`+ result[half_sel] \|= (index + (unsigned long long)0x10) << shift_by;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+`
		b1cb33	`+ } else if (imm == 0b01) { // big endian compression`
		b1cb33	`+ /* If IMM=0b00001, let pcv be the permute control vector required to`
		b1cb33	`+ enable a left-indexed permute (vperm or xxperm) to implement a`
		b1cb33	`+ compression of the sparse byte elements in a source vector specified`
		b1cb33	`+ by the byte-element mask in VSR[VRB+32] into the leftmost byte`
		b1cb33	`+ elements of a result vector.`
		b1cb33	`+ */`
		b1cb33	`+ for( index = 0; index < 16; index++) {`
		b1cb33	`+ i = 15 - index;`
		b1cb33	`+ shift_by = i*8;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 8) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 7;`
		b1cb33	`+`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ if (j >= 8)`
		b1cb33	`+ result[1] \|= (index) << (15 - j)*8;`
		b1cb33	`+ else`
		b1cb33	`+ result[0] \|= (index) << (7 - j)*8;`
		b1cb33	`+ j++;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+ /* The algorithm says set to undefined, leave as 0`
		b1cb33	`+ for( index = 3 - j; index < 4; index++) {`
		b1cb33	`+ result \|= (0 << (index*8));`
		b1cb33	`+ }`
		b1cb33	`+ */`
		b1cb33	`+`
		b1cb33	`+ } else if (imm == 0b10) { //little-endian expansion`
		b1cb33	`+ /* If IMM=0b00010, let pcv be the permute control vector required to`
		b1cb33	`+ enable a right-indexed permute (vpermr or xxpermr) to implement an`
		b1cb33	`+ expansion of the rightmost byte elements of a source vector into the`
		b1cb33	`+ byte elements of a result vector specified by the byte-element mask`
		b1cb33	`+ in VSR[VRB+32]. */`
		b1cb33	`+ for( index = 0; index < 16; index++) {`
		b1cb33	`+ i = index;`
		b1cb33	`+`
		b1cb33	`+ shift_by = i*8;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 8) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 7;`
		b1cb33	`+`
		b1cb33	`+ /* mod shift amount by 8 since src is either the upper or lower`
		b1cb33	`+ 64-bits. */`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ result[half_sel] \|= j << shift_by;`
		b1cb33	`+ j++;`
		b1cb33	`+ } else {`
		b1cb33	`+ result[half_sel] \|= (index + (unsigned long long)0x10) << shift_by;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ } else if (imm == 0b11) { //little-endian compression`
		b1cb33	`+ /* If IMM=0b00011, let pcv be the permute control vector required to`
		b1cb33	`+ enable a right-indexed permute (vpermr or xxpermr) to implement a`
		b1cb33	`+ compression of the sparse byte elements in a source vector specified`
		b1cb33	`+ by the byte-element mask in VSR[VRB+32] into the rightmost byte`
		b1cb33	`+ elements of a result vector. */`
		b1cb33	`+`
		b1cb33	`+ for( index = 0; index < 16; index++) {`
		b1cb33	`+ i = index;`
		b1cb33	`+`
		b1cb33	`+ shift_by = i*8;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 8) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 7;`
		b1cb33	`+`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ if (j >= 8)`
		b1cb33	`+ result[0] \|= (index) << (j-8)*8;`
		b1cb33	`+ else`
		b1cb33	`+ result[1] \|= (index) << j*8;`
		b1cb33	`+ j++;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ /* The algorithm says set to undefined, leave as 0`
		b1cb33	`+ for( index = 3 - j; index < 4; index++) {`
		b1cb33	`+ result \|= (0 << (index*8));`
		b1cb33	`+ }`
		b1cb33	`+ */`
		b1cb33	`+`
		b1cb33	`+ } else {`
		b1cb33	`+ vex_printf("ERROR, vector_gen_pvc_byte_mask_dirty_helper, imm value %u not supported.\n",`
		b1cb33	`+ imm);`
		b1cb33	`+ vassert(0);`
		b1cb33	`+ }`
		b1cb33	`+ write_VSX_entry( gst, reg_offset, result);`
		b1cb33	`+}`
		b1cb33	`+`
		b1cb33	`+/* CALLED FROM GENERATED CODE */`
		b1cb33	`+void vector_gen_pvc_hword_mask_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`+ ULong src_hi, ULong src_lo,`
		b1cb33	`+ UInt reg_offset,`
		b1cb33	`+ UInt imm ) {`
		b1cb33	`+ /* The function computes the 128-bit result then writes it directly`
		b1cb33	`+ into the guest state VSX register. */`
		b1cb33	`+ UInt i, shift_by, sel_shift_by, half_sel;`
		b1cb33	`+ ULong index, src, result[2];`
		b1cb33	`+ ULong j;`
		b1cb33	`+`
		b1cb33	`+ result[0] = 0;`
		b1cb33	`+ result[1] = 0;`
		b1cb33	`+ j = 0;`
		b1cb33	`+`
		b1cb33	`+ /* The algorithm in the ISA is written with IBM numbering zero on left and`
		b1cb33	`+ N-1 on right. The loop index is converted to "i" to match the algorithm`
		b1cb33	`+ for clarity of matching the C code to the algorithm in the ISA. */`
		b1cb33	`+`
		b1cb33	`+ if (imm == 0b00) { // big endian expansion`
		b1cb33	`+ /* If IMM=0b00000, let pcv be the permute control vector required to`
		b1cb33	`+ enable a left-indexed permute (vperm or xxperm) to implement an`
		b1cb33	`+ expansion of the leftmost halfword elements of a source vector into`
		b1cb33	`+ the halfword elements of a result vector specified by the halfword-`
		b1cb33	`+ element mask in VSR[VRB+32].`
		b1cb33	`+ */`
		b1cb33	`+ for( index = 0; index < 8; index++) {`
		b1cb33	`+ i = 7 - index;`
		b1cb33	`+`
		b1cb33	`+ shift_by = i*16;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 4) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 15;`
		b1cb33	`+`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ // half-word i, byte 0`
		b1cb33	`+ result[half_sel] \|= (2*j + 0x0) << (shift_by+8);`
		b1cb33	`+ // half-word i, byte 1`
		b1cb33	`+ result[half_sel] \|= (2*j + 0x1) << shift_by;`
		b1cb33	`+ j++;`
		b1cb33	`+ } else {`
		b1cb33	`+ result[half_sel] \|= (2*index + 0x10) << (shift_by+8);`
		b1cb33	`+ result[half_sel] \|= (2*index + 0x11) << shift_by;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ } else if (imm == 0b01) { // big endian compression`
		b1cb33	`+ /* If IMM=0b00001,let pcv be the permute control vector required to`
		b1cb33	`+ enable a left-indexed permute (vperm or xxperm) to implement a`
		b1cb33	`+ compression of the sparse halfword elements in a source vector`
		b1cb33	`+ specified by the halfword-element mask in VSR[VRB+32] into the`
		b1cb33	`+ leftmost halfword elements of a result vector.`
		b1cb33	`+ */`
		b1cb33	`+ for( index = 0; index < 8; index++) {`
		b1cb33	`+ i = 7 - index;`
		b1cb33	`+`
		b1cb33	`+ shift_by = i*16;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 4) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 15;`
		b1cb33	`+`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ if (j >= 4) {`
		b1cb33	`+ // half-word i, byte 0`
		b1cb33	`+ result[1] \|= (2*index + 0x0) << ((7 - j)*16 + 8);`
		b1cb33	`+ // half-word i, byte 1`
		b1cb33	`+ result[1] \|= (2*index + 0x1) << ((7 - j)*16);`
		b1cb33	`+ } else {`
		b1cb33	`+ // half-word i, byte 0`
		b1cb33	`+ result[0] \|= (2*index + 0x0) << ((3 - j)*16 + 8);`
		b1cb33	`+ // half-word i, byte 1`
		b1cb33	`+ result[0] \|= (2*index + 0x1) << ((3 - j)*16);`
		b1cb33	`+ }`
		b1cb33	`+ j++;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ } else if (imm == 0b10) { //little-endian expansion`
		b1cb33	`+ /* If IMM=0b00010, let pcv be the permute control vector required to`
		b1cb33	`+ enable a right-indexed permute (vpermr or xxpermr) to implement an`
		b1cb33	`+ expansion of the rightmost halfword elements of a source vector into`
		b1cb33	`+ the halfword elements of a result vector specified by the halfword-`
		b1cb33	`+ element mask in VSR[VRB+32].`
		b1cb33	`+ */`
		b1cb33	`+ for( index = 0; index < 8; index++) {`
		b1cb33	`+ i = index;`
		b1cb33	`+ shift_by = i*16;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 4) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 15;`
		b1cb33	`+`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ // half-word i, byte 0`
		b1cb33	`+ result[half_sel] \|= (2*j + 0x00) << shift_by;`
		b1cb33	`+ // half-word i, byte 1`
		b1cb33	`+ result[half_sel] \|= (2*j + 0x01) << (shift_by+8);`
		b1cb33	`+ j++;`
		b1cb33	`+`
		b1cb33	`+ } else {`
		b1cb33	`+ // half-word i, byte 0`
		b1cb33	`+ result[half_sel] \|= (2*index + 0x10) << shift_by;`
		b1cb33	`+ // half-word i, byte 1`
		b1cb33	`+ result[half_sel] \|= (2*index + 0x11) << (shift_by+8);`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ } else if (imm == 0b11) { //little-endian compression`
		b1cb33	`+ /* If IMM=0b00011, let pcv be the permute control vector required to`
		b1cb33	`+ enable a right-indexed permute (vpermr or xxpermr) to implement a`
		b1cb33	`+ compression of the sparse halfword elements in a source vector`
		b1cb33	`+ specified by the halfword-element mask in VSR[VRB+32] into the`
		b1cb33	`+ rightmost halfword elements of a result vector. */`
		b1cb33	`+ for( index = 0; index < 8; index++) {`
		b1cb33	`+ i = index;`
		b1cb33	`+ shift_by = i*16;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 4) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 15;`
		b1cb33	`+`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ if (j >= 4) {`
		b1cb33	`+ // half-word j, byte 0`
		b1cb33	`+ result[0] \|= (2*index + 0x0) << ((j-4)*16);`
		b1cb33	`+ // half-word j, byte 1`
		b1cb33	`+ result[0] \|= (2*index + 0x1) << ((j-4)*16+8);`
		b1cb33	`+ } else {`
		b1cb33	`+ // half-word j, byte 0`
		b1cb33	`+ result[1] \|= (2*index + 0x0) << (j*16);`
		b1cb33	`+ // half-word j, byte 1`
		b1cb33	`+ result[1] \|= (2*index + 0x1) << ((j*16)+8);`
		b1cb33	`+ }`
		b1cb33	`+ j++;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ } else {`
		b1cb33	`+ vex_printf("ERROR, vector_gen_pvc_hword_dirty_mask_helper, imm value %u not supported.\n",`
		b1cb33	`+ imm);`
		b1cb33	`+ vassert(0);`
		b1cb33	`+ }`
		b1cb33	`+ write_VSX_entry( gst, reg_offset, result);`
		b1cb33	`+}`
		b1cb33	`+`
		b1cb33	`+/* CALLED FROM GENERATED CODE */`
		b1cb33	`+void vector_gen_pvc_word_mask_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`+ ULong src_hi, ULong src_lo,`
		b1cb33	`+ UInt reg_offset, UInt imm ) {`
		b1cb33	`+ /* The function computes the 128-bit result then writes it directly`
		b1cb33	`+ into the guest state VSX register. */`
		b1cb33	`+ UInt i, shift_by, sel_shift_by, half_sel;`
		b1cb33	`+ ULong index, src, result[2];`
		b1cb33	`+ ULong j;`
		b1cb33	`+`
		b1cb33	`+ result[0] = 0;`
		b1cb33	`+ result[1] = 0;`
		b1cb33	`+ j = 0;`
		b1cb33	`+`
		b1cb33	`+ /* The algorithm in the ISA is written with IBM numbering zero on left and`
		b1cb33	`+ N-1 on right. The loop index is converted to "i" to match the algorithm`
		b1cb33	`+ for clarity of matching the C code to the algorithm in the ISA. */`
		b1cb33	`+`
		b1cb33	`+ if (imm == 0b00) { // big endian expansion`
		b1cb33	`+ /* If IMM=0b00000, let pcv be the permute control vector required to`
		b1cb33	`+ enable a left-indexed permute (vperm or xxperm) to implement an`
		b1cb33	`+ expansion of the leftmost word elements of a source vector into the`
		b1cb33	`+ word elements of a result vector specified by the word-element mask`
		b1cb33	`+ in VSR[VRB+32].`
		b1cb33	`+ */`
		b1cb33	`+ for( index = 0; index < 4; index++) {`
		b1cb33	`+ i = 3 - index;`
		b1cb33	`+`
		b1cb33	`+ shift_by = i*32;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 2) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 31;`
		b1cb33	`+`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ result[half_sel] \|= (4*j+0) << (shift_by+24); // word i, byte 0`
		b1cb33	`+ result[half_sel] \|= (4*j+1) << (shift_by+16); // word i, byte 1`
		b1cb33	`+ result[half_sel] \|= (4*j+2) << (shift_by+8); // word i, byte 2`
		b1cb33	`+ result[half_sel] \|= (4*j+3) << shift_by; // word i, byte 3`
		b1cb33	`+ j++;`
		b1cb33	`+ } else {`
		b1cb33	`+ result[half_sel] \|= (4*index + 0x10) << (shift_by+24);`
		b1cb33	`+ result[half_sel] \|= (4*index + 0x11) << (shift_by+16);`
		b1cb33	`+ result[half_sel] \|= (4*index + 0x12) << (shift_by+8);`
		b1cb33	`+ result[half_sel] \|= (4*index + 0x13) << shift_by;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ } else if (imm == 0b01) { // big endian compression`
		b1cb33	`+ /* If IMM=0b00001, let pcv be the permute control vector required to`
		b1cb33	`+ enable a left-indexed permute (vperm or xxperm) to implement a`
		b1cb33	`+ compression of the sparse word elements in a source vector specified`
		b1cb33	`+ by the word-element mask in VSR[VRB+32] into the leftmost word`
		b1cb33	`+ elements of a result vector.`
		b1cb33	`+ */`
		b1cb33	`+ for( index = 0; index < 4; index++) {`
		b1cb33	`+ i = 3 - index;`
		b1cb33	`+`
		b1cb33	`+ shift_by = i*32;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 2) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 31;`
		b1cb33	`+`
		b1cb33	`+ if (((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ if (j >= 2) {`
		b1cb33	`+ // word j, byte 0`
		b1cb33	`+ result[1] \|= (4*index+0) << ((3 - j)*32 + 24);`
		b1cb33	`+ // word j, byte 1`
		b1cb33	`+ result[1] \|= (4*index+1) << ((3 - j)*32 + 16);`
		b1cb33	`+ // word j, byte 2`
		b1cb33	`+ result[1] \|= (4*index+2) << ((3 - j)*32 + 8);`
		b1cb33	`+ // word j, byte 3`
		b1cb33	`+ result[1] \|= (4*index+3) << ((3 - j)*32 + 0);`
		b1cb33	`+ } else {`
		b1cb33	`+ result[0] \|= (4*index+0) << ((1 - j)*32 + 24);`
		b1cb33	`+ result[0] \|= (4*index+1) << ((1 - j)*32 + 16);`
		b1cb33	`+ result[0] \|= (4*index+2) << ((1 - j)*32 + 8);`
		b1cb33	`+ result[0] \|= (4*index+3) << ((1 - j)*32 + 0);`
		b1cb33	`+ }`
		b1cb33	`+ j++;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ } else if (imm == 0b10) { //little-endian expansion`
		b1cb33	`+ /* If IMM=0b00010, let pcv be the permute control vector required to`
		b1cb33	`+ enable a right-indexed permute (vpermr or xxpermr) to implement an`
		b1cb33	`+ expansion of the rightmost word elements of a source vector into the`
		b1cb33	`+ word elements of a result vector specified by the word-element mask`
		b1cb33	`+ in VSR[VRB+32].`
		b1cb33	`+ */`
		b1cb33	`+ for( index = 0; index < 4; index++) {`
		b1cb33	`+ i = index;`
		b1cb33	`+`
		b1cb33	`+ shift_by = i*32;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 2) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 31;`
		b1cb33	`+`
		b1cb33	`+ if (((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ result[half_sel] \|= (4*j+0) << (shift_by + 0); // word j, byte 0`
		b1cb33	`+ result[half_sel] \|= (4*j+1) << (shift_by + 8); // word j, byte 1`
		b1cb33	`+ result[half_sel] \|= (4*j+2) << (shift_by + 16); // word j, byte 2`
		b1cb33	`+ result[half_sel] \|= (4*j+3) << (shift_by + 24); // word j, byte 3`
		b1cb33	`+ j++;`
		b1cb33	`+ } else {`
		b1cb33	`+ result[half_sel] \|= (4*index + 0x10) << (shift_by + 0);`
		b1cb33	`+ result[half_sel] \|= (4*index + 0x11) << (shift_by + 8);`
		b1cb33	`+ result[half_sel] \|= (4*index + 0x12) << (shift_by + 16);`
		b1cb33	`+ result[half_sel] \|= (4*index + 0x13) << (shift_by + 24);`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ } else if (imm == 0b11) { //little-endian compression`
		b1cb33	`+ /* If IMM=0b00011, let pcv be the permute control vector required to`
		b1cb33	`+ enable a right-indexed permute (vpermr or xxpermr) to implement a`
		b1cb33	`+ compression of the sparse word elements in a source vector specified`
		b1cb33	`+ by the word-element mask in VSR[VRB+32] into the rightmost word`
		b1cb33	`+ elements of a result vector. */`
		b1cb33	`+ for( index = 0; index < 4; index++) {`
		b1cb33	`+ i =index;`
		b1cb33	`+`
		b1cb33	`+ shift_by = i*32;`
		b1cb33	`+`
		b1cb33	`+ if ( i >= 2) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ shift_by = shift_by - 64;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = shift_by + 31;`
		b1cb33	`+`
		b1cb33	`+ if (((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ if (j >= 2){`
		b1cb33	`+ // word j, byte 0`
		b1cb33	`+ result[0] \|= (4*index + 0x0) << ((j-2)*32+0);`
		b1cb33	`+ // word j, byte 1`
		b1cb33	`+ result[0] \|= (4*index + 0x1) << ((j-2)*32+8);`
		b1cb33	`+ // word j, byte 2`
		b1cb33	`+ result[0] \|= (4*index + 0x2) << ((j-2)*32+16);`
		b1cb33	`+ // word j, byte 3`
		b1cb33	`+ result[0] \|= (4*index + 0x3) << ((j-2)*32+24);`
		b1cb33	`+ } else {`
		b1cb33	`+ result[1] \|= (4*index + 0x0) << (j*32+0);`
		b1cb33	`+ result[1] \|= (4*index + 0x1) << (j*32+8);`
		b1cb33	`+ result[1] \|= (4*index + 0x2) << (j*32+16);`
		b1cb33	`+ result[1] \|= (4*index + 0x3) << (j*32+24);`
		b1cb33	`+ }`
		b1cb33	`+ j++;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+ } else {`
		b1cb33	`+ vex_printf("ERROR, vector_gen_pvc_word_mask_dirty_helper, imm value %u not supported.\n",`
		b1cb33	`+ imm);`
		b1cb33	`+ vassert(0);`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ write_VSX_entry( gst, reg_offset, result);`
		b1cb33	`+}`
		b1cb33	`+`
		b1cb33	`+/* CALLED FROM GENERATED CODE */`
		b1cb33	`+void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`+ ULong src_hi, ULong src_lo,`
		b1cb33	`+ UInt reg_offset, UInt imm ) {`
		b1cb33	`+ /* The function computes the 128-bit result then writes it directly`
		b1cb33	`+ into the guest state VSX register. */`
		b1cb33	`+ UInt sel_shift_by, half_sel;`
		b1cb33	`+ ULong index, src, result[2];`
		b1cb33	`+ ULong j, i;`
		b1cb33	`+`
		b1cb33	`+ result[0] = 0;`
		b1cb33	`+ result[1] = 0;`
		b1cb33	`+ j = 0;`
		b1cb33	`+`
		b1cb33	`+ /* The algorithm in the ISA is written with IBM numbering zero on left and`
		b1cb33	`+ N-1 on right. The loop index is converted to "i" to match the algorithm`
		b1cb33	`+ for clarity of matching the C code to the algorithm in the ISA. */`
		b1cb33	`+`
		b1cb33	`+ if (imm == 0b00) { // big endian expansion`
		b1cb33	`+ /* If IMM=0b00000, let pcv be the permute control vector required to`
		b1cb33	`+ enable a left-indexed permute (vperm or xxperm) to implement an`
		b1cb33	`+ expansion of the leftmost doubleword elements of a source vector into`
		b1cb33	`+ the doubleword elements of a result vector specified by the`
		b1cb33	`+ doubleword-element mask in VSR[VRB+32].`
		b1cb33	`+ */`
		b1cb33	`+ for( index = 0; index < 2; index++) {`
		b1cb33	`+ i = 1 - index;`
		b1cb33	`+`
		b1cb33	`+ if ( i == 1) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = 63;`
		b1cb33	`+`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ result[half_sel] \|= (8*j + 0x0) << 56; // dword i, byte 0`
		b1cb33	`+ result[half_sel] \|= (8*j + 0x1) << 48; // dword i, byte 1`
		b1cb33	`+ result[half_sel] \|= (8*j + 0x2) << 40; // dword i, byte 2`
		b1cb33	`+ result[half_sel] \|= (8*j + 0x3) << 32; // dword i, byte 3`
		b1cb33	`+ result[half_sel] \|= (8*j + 0x4) << 24; // dword i, byte 4`
		b1cb33	`+ result[half_sel] \|= (8*j + 0x5) << 16; // dword i, byte 5`
		b1cb33	`+ result[half_sel] \|= (8*j + 0x6) << 8; // dword i, byte 6`
		b1cb33	`+ result[half_sel] \|= (8*j + 0x7) << 0; // dword i, byte 7`
		b1cb33	`+ j++;`
		b1cb33	`+ } else {`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x10) << 56;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x11) << 48;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x12) << 40;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x13) << 32;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x14) << 24;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x15) << 16;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x16) << 8;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x17) << 0;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+ } else if (imm == 0b01) { // big endian compression`
		b1cb33	`+ /* If IMM=0b00001, let pcv be the permute control vector required to`
		b1cb33	`+ enable a left-indexed permute (vperm or xxperm) to implement a`
		b1cb33	`+ compression of the sparse doubleword elements in a source vector`
		b1cb33	`+ specified by the doubleword-element mask in VSR[VRB+32] into the`
		b1cb33	`+ leftmost doubleword elements of a result vector.`
		b1cb33	`+ */`
		b1cb33	`+ for( index = 0; index < 2; index++) {`
		b1cb33	`+ i = 1 - index;`
		b1cb33	`+`
		b1cb33	`+ if ( i == 1) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = 63;`
		b1cb33	`+`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ if (j == 1) {`
		b1cb33	`+ result[1] \|= (8*index + 0x0) << 56; // double-word j, byte 0`
		b1cb33	`+ result[1] \|= (8*index + 0x1) << 48; // double-word j, byte 1`
		b1cb33	`+ result[1] \|= (8*index + 0x2) << 40; // double-word j, byte 2`
		b1cb33	`+ result[1] \|= (8*index + 0x3) << 32; // double-word j, byte 3`
		b1cb33	`+ result[1] \|= (8*index + 0x4) << 24; // double-word j, byte 4`
		b1cb33	`+ result[1] \|= (8*index + 0x5) << 16; // double-word j, byte 5`
		b1cb33	`+ result[1] \|= (8*index + 0x6) << 8; // double-word j, byte 6`
		b1cb33	`+ result[1] \|= (8*index + 0x7) << 0; // double-word j, byte 7`
		b1cb33	`+ } else {`
		b1cb33	`+ result[0] \|= (8*index + 0x0) << 56; // double-word j, byte 0`
		b1cb33	`+ result[0] \|= (8*index + 0x1) << 48; // double-word j, byte 1`
		b1cb33	`+ result[0] \|= (8*index + 0x2) << 40; // double-word j, byte 2`
		b1cb33	`+ result[0] \|= (8*index + 0x3) << 32; // double-word j, byte 3`
		b1cb33	`+ result[0] \|= (8*index + 0x4) << 24; // double-word j, byte 4`
		b1cb33	`+ result[0] \|= (8*index + 0x5) << 16; // double-word j, byte 5`
		b1cb33	`+ result[0] \|= (8*index + 0x6) << 8; // double-word j, byte 6`
		b1cb33	`+ result[0] \|= (8*index + 0x7) << 0; // double-word j, byte 7`
		b1cb33	`+ }`
		b1cb33	`+ j++;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+ } else if (imm == 0b10) { //little-endian expansion`
		b1cb33	`+ /* If IMM=0b00010, let pcv be the permute control vector required to`
		b1cb33	`+ enable a right-indexed permute (vpermr or xxpermr) to implement an`
		b1cb33	`+ expansion of the rightmost doubleword elements of a source vector`
		b1cb33	`+ into the doubleword elements of a result vector specified by the`
		b1cb33	`+ doubleword-element mask in VSR[VRB+32].`
		b1cb33	`+ */`
		b1cb33	`+`
		b1cb33	`+ for( index = 0; index < 2; index++) {`
		b1cb33	`+ i = index;`
		b1cb33	`+`
		b1cb33	`+ if ( i == 1) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = 63;`
		b1cb33	`+`
		b1cb33	`+ if ( ((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ result[half_sel] \|= (8*j+0) << 0; // double-word i, byte 0`
		b1cb33	`+ result[half_sel] \|= (8*j+1) << 8; // double-word i, byte 1`
		b1cb33	`+ result[half_sel] \|= (8*j+2) << 16; // double-word i, byte 2`
		b1cb33	`+ result[half_sel] \|= (8*j+3) << 24; // double-word i, byte 3`
		b1cb33	`+ result[half_sel] \|= (8*j+4) << 32; // double-word i, byte 4`
		b1cb33	`+ result[half_sel] \|= (8*j+5) << 40; // double-word i, byte 5`
		b1cb33	`+ result[half_sel] \|= (8*j+6) << 48; // double-word i, byte 6`
		b1cb33	`+ result[half_sel] \|= (8*j+7) << 56; // double-word i, byte 7`
		b1cb33	`+ j++;`
		b1cb33	`+ } else {`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x10) << 0;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x11) << 8;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x12) << 16;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x13) << 24;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x14) << 32;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x15) << 40;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x16) << 48;`
		b1cb33	`+ result[half_sel] \|= (8*index + 0x17) << 56;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ } else if (imm == 0b11) { //little-endian compression`
		b1cb33	`+ /* If IMM=0b00011, let pcv be the permute control vector required to`
		b1cb33	`+ enable a right-indexed permute (vpermr or xxpermr) to implement a`
		b1cb33	`+ compression of the sparse doubleword elements in a source vector`
		b1cb33	`+ specified by the doubleword-element mask in VSR[VRB+32] into the`
		b1cb33	`+ rightmost doubleword elements of a result vector. */`
		b1cb33	`+ for( index = 0; index < 2; index++) {`
		b1cb33	`+ i = index;`
		b1cb33	`+`
		b1cb33	`+ if ( i == 1) {`
		b1cb33	`+ src = src_hi;`
		b1cb33	`+ half_sel = 0;`
		b1cb33	`+ } else {`
		b1cb33	`+ src = src_lo;`
		b1cb33	`+ half_sel = 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ sel_shift_by = 63;`
		b1cb33	`+`
		b1cb33	`+ if (((src >> sel_shift_by) & 0x1) == 1) {`
		b1cb33	`+ if (j == 1) {`
		b1cb33	`+ result[0] \|= (8*index + 0x0) << 0; // double-word j, byte 0`
		b1cb33	`+ result[0] \|= (8*index + 0x1) << 8; // double-word j, byte 1`
		b1cb33	`+ result[0] \|= (8*index + 0x2) << 16; // double-word j, byte 2`
		b1cb33	`+ result[0] \|= (8*index + 0x3) << 24; // double-word j, byte 3`
		b1cb33	`+ result[0] \|= (8*index + 0x4) << 32; // double-word j, byte 4`
		b1cb33	`+ result[0] \|= (8*index + 0x5) << 40; // double-word j, byte 5`
		b1cb33	`+ result[0] \|= (8*index + 0x6) << 48; // double-word j, byte 6`
		b1cb33	`+ result[0] \|= (8*index + 0x7) << 56; // double-word j, byte 7`
		b1cb33	`+ } else {`
		b1cb33	`+ result[1] \|= (8*index + 0x0) << 0;`
		b1cb33	`+ result[1] \|= (8*index + 0x1) << 8;`
		b1cb33	`+ result[1] \|= (8*index + 0x2) << 16;`
		b1cb33	`+ result[1] \|= (8*index + 0x3) << 24;`
		b1cb33	`+ result[1] \|= (8*index + 0x4) << 32;`
		b1cb33	`+ result[1] \|= (8*index + 0x5) << 40;`
		b1cb33	`+ result[1] \|= (8*index + 0x6) << 48;`
		b1cb33	`+ result[1] \|= (8*index + 0x7) << 56;`
		b1cb33	`+ }`
		b1cb33	`+ j++;`
		b1cb33	`+ }`
		b1cb33	`+ }`
		b1cb33	`+ } else {`
		b1cb33	`+ vex_printf("ERROR, vector_gen_pvc_dword_mask_helper, imm value %u not supported.\n",`
		b1cb33	`+ imm);`
		b1cb33	`+ vassert(0);`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ write_VSX_entry( gst, reg_offset, result);`
		b1cb33	`+}`
		b1cb33
		b1cb33	`/*------------------------------------------------*/`
		b1cb33	`/*---- VSX Matrix signed integer GER functions ---*/`
		b1cb33	`diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c`
		b1cb33	`index bcabf69dd..354be6b53 100644`
		b1cb33	`--- a/VEX/priv/guest_ppc_toIR.c`
		b1cb33	`+++ b/VEX/priv/guest_ppc_toIR.c`
		b1cb33	`@@ -3322,6 +3322,7 @@ static IRExpr * locate_vector_ele_eq ( IRTemp src, IRExpr *value,`
		b1cb33	`#define DFORM_IMMASK 0xffffffff`
		b1cb33	`#define DSFORM_IMMASK 0xfffffffc`
		b1cb33	`#define DQFORM_IMMASK 0xfffffff0`
		b1cb33	`+#define DA8LSFORM_IMMASK 0x3fffffff // Algebraic 8LS Dform`
		b1cb33
		b1cb33	`#define ISA_3_1_PREFIX_CHECK if (prefix) {if (!allow_isa_3_1) goto decode_noIsa3_1;}`
		b1cb33
		b1cb33	`@@ -6109,6 +6110,87 @@ static void vsx_matrix_64bit_float_ger ( const VexAbiInfo* vbi,`
		b1cb33	`stmt( IRStmt_Dirty(d) );`
		b1cb33	`}`
		b1cb33
		b1cb33	`+static void vector_gen_pvc_mask ( const VexAbiInfo* vbi,`
		b1cb33	`+ IRExpr *src, UInt IMM,`
		b1cb33	`+ UInt opc2, UInt VSX_addr ) {`
		b1cb33	`+ /* The function takes a 128-bit source and an immediate value. The function`
		b1cb33	`+ calls a helper to execute the xxgenpcvbm, xxgenpcvhm, xxgenpcvwm,`
		b1cb33	`+ xxgenpcvdm instruction. The instructions are not practical to do with`
		b1cb33	`+ Iops. The instruction is implemented with a dirty helper that`
		b1cb33	`+ calculates the 128-bit result and writes it directly into the guest`
		b1cb33	`+ state VSX register.`
		b1cb33	`+ */`
		b1cb33	`+ IRTemp src_hi = newTemp( Ity_I64);`
		b1cb33	`+ IRTemp src_lo = newTemp( Ity_I64);`
		b1cb33	`+`
		b1cb33	`+ IRDirty* d;`
		b1cb33	`+`
		b1cb33	`+ vassert( (VSX_addr >= 0) && (VSX_addr < 64) );`
		b1cb33	`+ UInt reg_offset = offsetofPPCGuestState( guest_VSR0 )`
		b1cb33	`+ + sizeof(U128) * VSX_addr;`
		b1cb33	`+`
		b1cb33	`+ assign( src_hi, unop( Iop_V128HIto64, src ) );`
		b1cb33	`+ assign( src_lo, unop( Iop_V128to64, src ) );`
		b1cb33	`+`
		b1cb33	`+ IRExpr** args = mkIRExprVec_5(`
		b1cb33	`+ IRExpr_GSPTR(),`
		b1cb33	`+ mkexpr( src_hi ),`
		b1cb33	`+ mkexpr( src_lo ),`
		b1cb33	`+ mkU32( reg_offset ),`
		b1cb33	`+ mkU64( IMM ) );`
		b1cb33	`+`
		b1cb33	`+ switch( opc2 ) {`
		b1cb33	`+ case 0x394: // xxgenpcvbm`
		b1cb33	`+ d = unsafeIRDirty_0_N (`
		b1cb33	`+ 0 /*regparms*/,`
		b1cb33	`+ "vector_gen_pvc_byte_mask_dirty_helper",`
		b1cb33	`+ fnptr_to_fnentry( vbi,`
		b1cb33	`+ &vector_gen_pvc_byte_mask_dirty_helper ),`
		b1cb33	`+ args);`
		b1cb33	`+ break;`
		b1cb33	`+`
		b1cb33	`+ case 0x395: // xxgenpcvhm`
		b1cb33	`+ d = unsafeIRDirty_0_N (`
		b1cb33	`+ 0 /*regparms*/,`
		b1cb33	`+ "vector_gen_pvc_hword_mask_dirty_helper",`
		b1cb33	`+ fnptr_to_fnentry( vbi,`
		b1cb33	`+ &vector_gen_pvc_hword_mask_dirty_helper ),`
		b1cb33	`+ args);`
		b1cb33	`+ break;`
		b1cb33	`+`
		b1cb33	`+ case 0x3B4: // xxgenpcvwm`
		b1cb33	`+ d = unsafeIRDirty_0_N (`
		b1cb33	`+ 0 /*regparms*/,`
		b1cb33	`+ "vector_gen_pvc_word_mask_dirty_helper",`
		b1cb33	`+ fnptr_to_fnentry( vbi,`
		b1cb33	`+ &vector_gen_pvc_word_mask_dirty_helper ),`
		b1cb33	`+ args);`
		b1cb33	`+ break;`
		b1cb33	`+`
		b1cb33	`+ case 0x3B5: // xxgenpcvdm`
		b1cb33	`+ d = unsafeIRDirty_0_N (`
		b1cb33	`+ 0 /*regparms*/,`
		b1cb33	`+ "vector_gen_pvc_dword_mask_dirty_helper",`
		b1cb33	`+ fnptr_to_fnentry( vbi,`
		b1cb33	`+ &vector_gen_pvc_dword_mask_dirty_helper ),`
		b1cb33	`+ args);`
		b1cb33	`+ break;`
		b1cb33	`+ default:`
		b1cb33	`+ vex_printf("ERROR: Unkown instruction = %u in vector_gen_pvc_mask()\n",`
		b1cb33	`+ opc2);`
		b1cb33	`+ return;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ d->nFxState = 1;`
		b1cb33	`+ vex_bzero(&d->fxState, sizeof(d->fxState));`
		b1cb33	`+ d->fxState[0].fx = Ifx_Modify;`
		b1cb33	`+ d->fxState[0].size = sizeof(U128);`
		b1cb33	`+ d->fxState[0].offset = reg_offset;`
		b1cb33	`+`
		b1cb33	`+ /* execute the dirty call, side-effecting guest state */`
		b1cb33	`+ stmt( IRStmt_Dirty(d) );`
		b1cb33	`+}`
		b1cb33	`+`
		b1cb33	`static IRExpr * UNSIGNED_CMP_GT_V128 ( IRExpr *vA, IRExpr *vB ) {`
		b1cb33	`/* This function does an unsigned compare of two V128 values. The`
		b1cb33	`* function is for use in 32-bit mode only as it is expensive. The`
		b1cb33	`@@ -35227,6 +35309,54 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,`
		b1cb33	`return True;`
		b1cb33	`}`
		b1cb33
		b1cb33	`+static Bool dis_vector_generate_pvc_from_mask ( UInt prefix,`
		b1cb33	`+ UInt theInstr,`
		b1cb33	`+ const VexAbiInfo* vbi )`
		b1cb33	`+{`
		b1cb33	`+ UChar XT_addr = ifieldRegXT(theInstr);`
		b1cb33	`+ UChar vB_addr = ifieldRegB(theInstr);`
		b1cb33	`+ IRTemp vB = newTemp( Ity_V128 );`
		b1cb33	`+ UInt opc2 = ifieldOPClo10(theInstr);`
		b1cb33	`+ UInt IMM = IFIELD(theInstr, (31-15), 5); // bits[11:15]`
		b1cb33	`+`
		b1cb33	`+ assign( vB, getVReg( vB_addr ) );`
		b1cb33	`+`
		b1cb33	`+ switch( opc2 ) {`
		b1cb33	`+ case 0x394:`
		b1cb33	`+ DIP("xxgenpcvbm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);`
		b1cb33	`+ /* vector_gen_pvc_mask uses a dirty helper to calculate the result and`
		b1cb33	`+ write it to the VSX result register. */`
		b1cb33	`+ vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );`
		b1cb33	`+ break;`
		b1cb33	`+`
		b1cb33	`+ case 0x395:`
		b1cb33	`+ DIP("xxgenpcvhm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);`
		b1cb33	`+ /* vector_gen_pvc_mask uses a dirty helper to calculate the result and`
		b1cb33	`+ write it to the VSX result register. */`
		b1cb33	`+ vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );`
		b1cb33	`+ break;`
		b1cb33	`+`
		b1cb33	`+ case 0x3B4:`
		b1cb33	`+ DIP("xxgenpcvwm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);`
		b1cb33	`+ /* vector_gen_pvc_mask uses a dirty helper to calculate the result and`
		b1cb33	`+ write it to the VSX result register. */`
		b1cb33	`+ vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );`
		b1cb33	`+ break;`
		b1cb33	`+`
		b1cb33	`+ case 0x3B5:`
		b1cb33	`+ DIP("xxgenpcvdm v%u,v%u,%u\n", XT_addr, vB_addr, IMM);`
		b1cb33	`+ /* vector_gen_pvc_mask uses a dirty helper to calculate the result and`
		b1cb33	`+ write it to the VSX result register. */`
		b1cb33	`+ vector_gen_pvc_mask( vbi, mkexpr( vB ), IMM, opc2, XT_addr );`
		b1cb33	`+ break;`
		b1cb33	`+`
		b1cb33	`+ default:`
		b1cb33	`+ return False;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ return True;`
		b1cb33	`+}`
		b1cb33	`+`
		b1cb33	`static Int dis_nop_prefix ( UInt prefix, UInt theInstr )`
		b1cb33	`{`
		b1cb33	`Bool is_prefix = prefix_instruction( prefix );`
		b1cb33	`@@ -35748,14 +35878,9 @@ DisResult disInstr_PPC_WRK (`
		b1cb33	`}`
		b1cb33	`goto decode_failure;`
		b1cb33
		b1cb33	`- case 0x31: // lfsu, stxv`
		b1cb33	`+ case 0x31: // lfsu`
		b1cb33	`if (!allow_F) goto decode_noF;`
		b1cb33	`- if (prefix_instruction( prefix )) { // stxv`
		b1cb33	`- if ( !(allow_isa_3_1) ) goto decode_noIsa3_1;`
		b1cb33	`- if (dis_fp_pair_prefix( prefix, theInstr )) goto decode_success;`
		b1cb33	`- } else { // lfsu`
		b1cb33	`- if (dis_fp_load( prefix, theInstr )) goto decode_success;`
		b1cb33	`- }`
		b1cb33	`+ if (dis_fp_load( prefix, theInstr )) goto decode_success;`
		b1cb33	`goto decode_failure;`
		b1cb33
		b1cb33	`case 0x32:`
		b1cb33	`@@ -35842,7 +35967,6 @@ DisResult disInstr_PPC_WRK (`
		b1cb33	`case 0x39: // pld, lxsd, lxssp, lfdp`
		b1cb33	`{`
		b1cb33	`UInt opc2tmp = ifieldOPC0o2(theInstr);`
		b1cb33	`-`
		b1cb33	`if (!allow_F) goto decode_noF;`
		b1cb33	`if (prefix_instruction( prefix )) { // pld`
		b1cb33	`if ( !(allow_isa_3_1) ) goto decode_noIsa3_1;`
		b1cb33	`@@ -36125,12 +36249,6 @@ DisResult disInstr_PPC_WRK (`
		b1cb33	`goto decode_failure;`
		b1cb33	`}`
		b1cb33
		b1cb33	`- /* The vsxOpc2 returned is the "normalized" value, representing the`
		b1cb33	`- * instructions secondary opcode as taken from the standard secondary`
		b1cb33	`- * opcode field [21:30] (IBM notatition), even if the actual field`
		b1cb33	`- * is non-standard. These normalized values are given in the opcode`
		b1cb33	`- * appendices of the ISA 2.06 document.`
		b1cb33	`- */`
		b1cb33	`if ( ( opc2 == 0x168 ) && ( IFIELD( theInstr, 19, 2 ) == 0 ) )// xxspltib`
		b1cb33	`{`
		b1cb33	`/* This is a special case of the XX1 form where the RA, RB`
		b1cb33	`@@ -36153,6 +36271,23 @@ DisResult disInstr_PPC_WRK (`
		b1cb33	`goto decode_failure;`
		b1cb33	`}`
		b1cb33
		b1cb33	`+ if ( ( opc2 == 0x394 ) \|\| // xxgenpcvbm`
		b1cb33	`+ ( opc2 == 0x395 ) \|\| // xxgenpcvhm`
		b1cb33	`+ ( opc2 == 0x3B4 ) \|\| // xxgenpcvwm`
		b1cb33	`+ ( opc2 == 0x3B5 ) ) { // xxgenpcvdm`
		b1cb33	`+ if ( !(allow_isa_3_1) ) goto decode_noIsa3_1;`
		b1cb33	`+ if (dis_vector_generate_pvc_from_mask( prefix, theInstr,`
		b1cb33	`+ abiinfo ))`
		b1cb33	`+ goto decode_success;`
		b1cb33	`+ goto decode_failure;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ /* The vsxOpc2 returned is the "normalized" value, representing the`
		b1cb33	`+ * instructions secondary opcode as taken from the standard secondary`
		b1cb33	`+ * opcode field [21:30] (IBM notation), even if the actual field`
		b1cb33	`+ * is non-standard. These normalized values are given in the opcode`
		b1cb33	`+ * appendices of the ISA 2.06 document.`
		b1cb33	`+ */`
		b1cb33	`vsxOpc2 = get_VSX60_opc2(opc2, theInstr);`
		b1cb33
		b1cb33	`switch (vsxOpc2) {`
		b1cb33	`commit 078f89e99b6f62e043f6138c6a7ae238befc1f2a`
		b1cb33	`Author: Carl Love <cel@us.ibm.com>`
		b1cb33	`Date: Fri Feb 26 15:46:55 2021 -0600`
		b1cb33
		b1cb33	`PPC64: Reduced-Precision - bfloat16 Outer Product & Format Conversion Operations`
		b1cb33
		b1cb33	`Add support for:`
		b1cb33
		b1cb33	`pmxvbf16ger2 Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update)`
		b1cb33	`pmxvbf16ger2pp Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Positive`
		b1cb33	`multiply, Positive accumulate`
		b1cb33	`pmxvbf16ger2pn Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Positive`
		b1cb33	`multiply, Negative accumulate`
		b1cb33	`pmxvbf16ger2np Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Negative`
		b1cb33	`multiply, Positive accumulate`
		b1cb33	`pmxvbf16ger2nn Prefixed Masked VSX Vector bfloat16 GER (Rank-2 Update) Negative`
		b1cb33	`multiply, Negative accumulate`
		b1cb33	`xvbf16ger2 VSX Vector bfloat16 GER (Rank-2 Update)`
		b1cb33	`xvbf16ger2pp VSX Vector bfloat16 GER (Rank-2 Update) Positive multiply, Positive`
		b1cb33	`accumulate`
		b1cb33	`xvbf16ger2pn VSX Vector bfloat16 GER (Rank-2 Update) Positive multiply, Negative`
		b1cb33	`accumulate`
		b1cb33	`xvbf16ger2np VSX Vector bfloat16 GER (Rank-2 Update) Negative multiply, Positive`
		b1cb33	`accumulate`
		b1cb33	`xvbf16ger2nn VSX Vector bfloat16 GER (Rank-2 Update) Negative multiply, Negative`
		b1cb33	`accumulate`
		b1cb33	`xvcvbf16sp VSX Vector Convert bfloat16 to Single-Precision format`
		b1cb33	`xvcvspbf16 VSX Vector Convert with round Single-Precision to bfloat16 format`
		b1cb33
		b1cb33	`diff --git a/VEX/priv/guest_ppc_defs.h b/VEX/priv/guest_ppc_defs.h`
		b1cb33	`index 54ce923a9..d36d6c07d 100644`
		b1cb33	`--- a/VEX/priv/guest_ppc_defs.h`
		b1cb33	`+++ b/VEX/priv/guest_ppc_defs.h`
		b1cb33	`@@ -150,6 +150,8 @@ extern ULong convert_to_zoned_helper( ULong src_hi, ULong src_low,`
		b1cb33	`ULong return_upper );`
		b1cb33	`extern ULong convert_to_national_helper( ULong src, ULong return_upper );`
		b1cb33	`extern ULong convert_from_zoned_helper( ULong src_hi, ULong src_low );`
		b1cb33	`+extern ULong convert_from_floattobf16_helper( ULong src );`
		b1cb33	`+extern ULong convert_from_bf16tofloat_helper( ULong src );`
		b1cb33	`extern ULong convert_from_national_helper( ULong src_hi, ULong src_low );`
		b1cb33	`extern ULong generate_C_FPCC_helper( ULong size, ULong src_hi, ULong src );`
		b1cb33	`extern ULong extract_bits_under_mask_helper( ULong src, ULong mask,`
		b1cb33	`@@ -201,6 +203,11 @@ extern void vector_gen_pvc_dword_mask_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`#define XVF16GER2PN 0b10010010`
		b1cb33	`#define XVF16GER2NP 0b01010010`
		b1cb33	`#define XVF16GER2NN 0b11010010`
		b1cb33	`+#define XVBF16GER2 0b00110011`
		b1cb33	`+#define XVBF16GER2PP 0b00110010`
		b1cb33	`+#define XVBF16GER2PN 0b10110010`
		b1cb33	`+#define XVBF16GER2NP 0b01110010`
		b1cb33	`+#define XVBF16GER2NN 0b11110010`
		b1cb33	`#define XVF32GER 0b00011011`
		b1cb33	`#define XVF32GERPP 0b00011010`
		b1cb33	`#define XVF32GERPN 0b10011010`
		b1cb33	`diff --git a/VEX/priv/guest_ppc_helpers.c b/VEX/priv/guest_ppc_helpers.c`
		b1cb33	`index 75497abb9..6bcee966d 100644`
		b1cb33	`--- a/VEX/priv/guest_ppc_helpers.c`
		b1cb33	`+++ b/VEX/priv/guest_ppc_helpers.c`
		b1cb33	`@@ -1905,6 +1905,125 @@ static Double conv_f16_to_double( ULong input )`
		b1cb33	`# endif`
		b1cb33	`}`
		b1cb33
		b1cb33	`+#define BF16_SIGN_MASK 0x8000`
		b1cb33	`+#define BF16_EXP_MASK 0x7F80`
		b1cb33	`+#define BF16_FRAC_MASK 0x007F`
		b1cb33	`+#define BF16_BIAS 127`
		b1cb33	`+#define BF16_MAX_UNBIASED_EXP 127`
		b1cb33	`+#define BF16_MIN_UNBIASED_EXP -126`
		b1cb33	`+#define FLOAT_SIGN_MASK 0x80000000`
		b1cb33	`+#define FLOAT_EXP_MASK 0x7F800000`
		b1cb33	`+#define FLOAT_FRAC_MASK 0x007FFFFF`
		b1cb33	`+#define FLOAT_FRAC_BIT8 0x00008000`
		b1cb33	`+#define FLOAT_BIAS 127`
		b1cb33	`+`
		b1cb33	`+static Float conv_bf16_to_float( UInt input )`
		b1cb33	`+{`
		b1cb33	`+ /* input is 16-bit bfloat.`
		b1cb33	`+ bias +127, exponent 8-bits, fraction 7-bits`
		b1cb33	`+`
		b1cb33	`+ output is 32-bit float.`
		b1cb33	`+ bias +127, exponent 8-bits, fraction 23-bits`
		b1cb33	`+ */`
		b1cb33	`+`
		b1cb33	`+ UInt input_exp, input_fraction, unbiased_exp;`
		b1cb33	`+ UInt output_exp, output_fraction;`
		b1cb33	`+ UInt sign;`
		b1cb33	`+ union convert_t conv;`
		b1cb33	`+`
		b1cb33	`+ sign = (UInt)(input & BF16_SIGN_MASK);`
		b1cb33	`+ input_exp = input & BF16_EXP_MASK;`
		b1cb33	`+ unbiased_exp = (input_exp >> 7) - (UInt)BF16_BIAS;`
		b1cb33	`+ input_fraction = input & BF16_FRAC_MASK;`
		b1cb33	`+`
		b1cb33	`+ if (((input_exp & BF16_EXP_MASK) == BF16_EXP_MASK) &&`
		b1cb33	`+ (input_fraction != 0)) {`
		b1cb33	`+ /* input is NaN or SNaN, exp all 1's, fraction != 0 */`
		b1cb33	`+ output_exp = FLOAT_EXP_MASK;`
		b1cb33	`+ output_fraction = input_fraction;`
		b1cb33	`+`
		b1cb33	`+ } else if(((input_exp & BF16_EXP_MASK) == BF16_EXP_MASK) &&`
		b1cb33	`+ ( input_fraction == 0)) {`
		b1cb33	`+ /* input is infinity, exp all 1's, fraction = 0 */`
		b1cb33	`+ output_exp = FLOAT_EXP_MASK;`
		b1cb33	`+ output_fraction = 0;`
		b1cb33	`+`
		b1cb33	`+ } else if((input_exp == 0) && (input_fraction == 0)) {`
		b1cb33	`+ /* input is zero */`
		b1cb33	`+ output_exp = 0;`
		b1cb33	`+ output_fraction = 0;`
		b1cb33	`+`
		b1cb33	`+ } else if((input_exp == 0) && (input_fraction != 0)) {`
		b1cb33	`+ /* input is denormal */`
		b1cb33	`+ output_fraction = input_fraction;`
		b1cb33	`+ output_exp = (-(Int)BF16_BIAS + (Int)FLOAT_BIAS ) << 23;`
		b1cb33	`+`
		b1cb33	`+ } else {`
		b1cb33	`+ /* result is normal */`
		b1cb33	`+ output_exp = (unbiased_exp + FLOAT_BIAS) << 23;`
		b1cb33	`+ output_fraction = input_fraction;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ conv.u32 = sign << (31 - 15) \| output_exp \| (output_fraction << (23-7));`
		b1cb33	`+ return conv.f;`
		b1cb33	`+}`
		b1cb33	`+`
		b1cb33	`+static UInt conv_float_to_bf16( UInt input )`
		b1cb33	`+{`
		b1cb33	`+ /* input is 32-bit float stored as unsigned 32-bit.`
		b1cb33	`+ bias +127, exponent 8-bits, fraction 23-bits`
		b1cb33	`+`
		b1cb33	`+ output is 16-bit bfloat.`
		b1cb33	`+ bias +127, exponent 8-bits, fraction 7-bits`
		b1cb33	`+`
		b1cb33	`+ If the unbiased exponent of the input is greater than the max floating`
		b1cb33	`+ point unbiased exponent value, the result of the floating point 16-bit`
		b1cb33	`+ value is infinity.`
		b1cb33	`+ */`
		b1cb33	`+`
		b1cb33	`+ UInt input_exp, input_fraction;`
		b1cb33	`+ UInt output_exp, output_fraction;`
		b1cb33	`+ UInt result, sign;`
		b1cb33	`+`
		b1cb33	`+ sign = input & FLOAT_SIGN_MASK;`
		b1cb33	`+ input_exp = input & FLOAT_EXP_MASK;`
		b1cb33	`+ input_fraction = input & FLOAT_FRAC_MASK;`
		b1cb33	`+`
		b1cb33	`+ if (((input_exp & FLOAT_EXP_MASK) == FLOAT_EXP_MASK) &&`
		b1cb33	`+ (input_fraction != 0)) {`
		b1cb33	`+ /* input is NaN or SNaN, exp all 1's, fraction != 0 */`
		b1cb33	`+ output_exp = BF16_EXP_MASK;`
		b1cb33	`+ output_fraction = (ULong)input_fraction >> (23 - 7);`
		b1cb33	`+ } else if (((input_exp & FLOAT_EXP_MASK) == FLOAT_EXP_MASK) &&`
		b1cb33	`+ ( input_fraction == 0)) {`
		b1cb33	`+ /* input is infinity, exp all 1's, fraction = 0 */`
		b1cb33	`+ output_exp = BF16_EXP_MASK;`
		b1cb33	`+ output_fraction = 0;`
		b1cb33	`+ } else if ((input_exp == 0) && (input_fraction == 0)) {`
		b1cb33	`+ /* input is zero */`
		b1cb33	`+ output_exp = 0;`
		b1cb33	`+ output_fraction = 0;`
		b1cb33	`+ } else if ((input_exp == 0) && (input_fraction != 0)) {`
		b1cb33	`+ /* input is denormal */`
		b1cb33	`+ output_exp = 0;`
		b1cb33	`+ output_fraction = (ULong)input_fraction >> (23 - 7);`
		b1cb33	`+ } else {`
		b1cb33	`+ /* result is normal */`
		b1cb33	`+ output_exp = (input_exp - BF16_BIAS + FLOAT_BIAS) >> (23 - 7);`
		b1cb33	`+ output_fraction = (ULong)input_fraction >> (23 - 7);`
		b1cb33	`+`
		b1cb33	`+ /* Round result. Look at the 8th bit position of the 32-bit floating`
		b1cb33	`+ point fraction. The F16 fraction is only 7 bits wide so if the 8th`
		b1cb33	`+ bit of the F32 is a 1 we need to round up by adding 1 to the output`
		b1cb33	`+ fraction. */`
		b1cb33	`+ if ((input_fraction & FLOAT_FRAC_BIT8) == FLOAT_FRAC_BIT8)`
		b1cb33	`+ /* Round the F16 fraction up by 1 */`
		b1cb33	`+ output_fraction = output_fraction + 1;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`+ result = sign >> (31 - 15) \| output_exp \| output_fraction;`
		b1cb33	`+ return result;`
		b1cb33	`+}`
		b1cb33
		b1cb33	`static Float conv_double_to_float( Double src )`
		b1cb33	`{`
		b1cb33	`@@ -1942,6 +2061,36 @@ static Float negate_float( Float input )`
		b1cb33	`return -input;`
		b1cb33	`}`
		b1cb33
		b1cb33	`+/* This C-helper takes a vector of two 32-bit floating point values`
		b1cb33	`+ * and returns a vector containing two 16-bit bfloats.`
		b1cb33	`+ input: word0 word1`
		b1cb33	`+ output 0x0 hword1 0x0 hword3`
		b1cb33	`+ Called from generated code.`
		b1cb33	`+ */`
		b1cb33	`+ULong convert_from_floattobf16_helper( ULong src ) {`
		b1cb33	`+ ULong resultHi, resultLo;`
		b1cb33	`+`
		b1cb33	`+ resultHi = (ULong)conv_float_to_bf16( (UInt)(src >> 32));`
		b1cb33	`+ resultLo = (ULong)conv_float_to_bf16( (UInt)(src & 0xFFFFFFFF));`
		b1cb33	`+ return (resultHi << 32) \| resultLo;`
		b1cb33	`+`
		b1cb33	`+}`
		b1cb33	`+`
		b1cb33	`+/* This C-helper takes a vector of two 16-bit bfloating point values`
		b1cb33	`+ * and returns a vector containing two 32-bit floats.`
		b1cb33	`+ input: 0x0 hword1 0x0 hword3`
		b1cb33	`+ output: word0 word1`
		b1cb33	`+ */`
		b1cb33	`+ULong convert_from_bf16tofloat_helper( ULong src ) {`
		b1cb33	`+ ULong result;`
		b1cb33	`+ union convert_t conv;`
		b1cb33	`+ conv.f = conv_bf16_to_float( (UInt)(src >> 32) );`
		b1cb33	`+ result = (ULong) conv.u32;`
		b1cb33	`+ conv.f = conv_bf16_to_float( (UInt)(src & 0xFFFFFFFF));`
		b1cb33	`+ result = (result << 32) \| (ULong) conv.u32;`
		b1cb33	`+ return result;`
		b1cb33	`+ }`
		b1cb33	`+`
		b1cb33	`void vsx_matrix_16bit_float_ger_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`UInt offset_ACC,`
		b1cb33	`ULong srcA_hi, ULong srcA_lo,`
		b1cb33	`@@ -2002,24 +2151,44 @@ void vsx_matrix_16bit_float_ger_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`srcB_word[0][j] = (UInt)((srcB_lo >> (16-16*j)) & mask);`
		b1cb33	`}`
		b1cb33
		b1cb33	`+ /* Note the isa is not consistent in the src naming. Will use the`
		b1cb33	`+ naming src10, src11, src20, src21 used with xvf16ger2 instructions.`
		b1cb33	`+ */`
		b1cb33	`for( j = 0; j < 4; j++) {`
		b1cb33	`if (((pmsk >> 1) & 0x1) == 0) {`
		b1cb33	`src10 = 0;`
		b1cb33	`src20 = 0;`
		b1cb33	`} else {`
		b1cb33	`- src10 = conv_f16_to_double((ULong)srcA_word[i][0]);`
		b1cb33	`- src20 = conv_f16_to_double((ULong)srcB_word[j][0]);`
		b1cb33	`+ if (( inst == XVF16GER2 ) \|\| ( inst == XVF16GER2PP )`
		b1cb33	`+ \|\| ( inst == XVF16GER2PN ) \|\| ( inst == XVF16GER2NP )`
		b1cb33	`+ \|\| ( inst == XVF16GER2NN )) {`
		b1cb33	`+ src10 = conv_f16_to_double((ULong)srcA_word[i][0]);`
		b1cb33	`+ src20 = conv_f16_to_double((ULong)srcB_word[j][0]);`
		b1cb33	`+ } else {`
		b1cb33	`+ /* Input is in bfloat format, result is stored in the`
		b1cb33	`+ "traditional" 64-bit float format. */`
		b1cb33	`+ src10 = (double)conv_bf16_to_float((ULong)srcA_word[i][0]);`
		b1cb33	`+ src20 = (double)conv_bf16_to_float((ULong)srcB_word[j][0]);`
		b1cb33	`+ }`
		b1cb33	`}`
		b1cb33
		b1cb33	`if ((pmsk & 0x1) == 0) {`
		b1cb33	`src11 = 0;`
		b1cb33	`src21 = 0;`
		b1cb33	`} else {`
		b1cb33	`- src11 = conv_f16_to_double((ULong)srcA_word[i][1]);`
		b1cb33	`- src21 = conv_f16_to_double((ULong)srcB_word[j][1]);`
		b1cb33	`+ if (( inst == XVF16GER2 ) \|\| ( inst == XVF16GER2PP )`
		b1cb33	`+ \|\| ( inst == XVF16GER2PN ) \|\| ( inst == XVF16GER2NP )`
		b1cb33	`+ \|\| ( inst == XVF16GER2NN )) {`
		b1cb33	`+ src11 = conv_f16_to_double((ULong)srcA_word[i][1]);`
		b1cb33	`+ src21 = conv_f16_to_double((ULong)srcB_word[j][1]);`
		b1cb33	`+ } else {`
		b1cb33	`+ /* Input is in bfloat format, result is stored in the`
		b1cb33	`+ "traditional" 64-bit float format. */`
		b1cb33	`+ src11 = (double)conv_bf16_to_float((ULong)srcA_word[i][1]);`
		b1cb33	`+ src21 = (double)conv_bf16_to_float((ULong)srcB_word[j][1]);`
		b1cb33	`+ }`
		b1cb33	`}`
		b1cb33
		b1cb33	`-`
		b1cb33	`prod = src10 * src20;`
		b1cb33	`msum = prod + src11 * src21;`
		b1cb33
		b1cb33	`@@ -2027,26 +2196,26 @@ void vsx_matrix_16bit_float_ger_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`/* Note, we do not track the exception handling bits`
		b1cb33	`ox, ux, xx, si, mz, vxsnan and vximz in the FPSCR. */`
		b1cb33
		b1cb33	`- if ( inst == XVF16GER2 )`
		b1cb33	`+ if (( inst == XVF16GER2 ) \|\| ( inst == XVBF16GER2 ) )`
		b1cb33	`result[j] = reinterpret_float_as_int(`
		b1cb33	`conv_double_to_float(msum) );`
		b1cb33
		b1cb33	`- else if ( inst == XVF16GER2PP )`
		b1cb33	`+ else if (( inst == XVF16GER2PP ) \|\| (inst == XVBF16GER2PP ))`
		b1cb33	`result[j] = reinterpret_float_as_int(`
		b1cb33	`conv_double_to_float(msum)`
		b1cb33	`+ acc_word[j] );`
		b1cb33
		b1cb33	`- else if ( inst == XVF16GER2PN )`
		b1cb33	`+ else if (( inst == XVF16GER2PN ) \|\| ( inst == XVBF16GER2PN ))`
		b1cb33	`result[j] = reinterpret_float_as_int(`
		b1cb33	`conv_double_to_float(msum)`
		b1cb33	`+ negate_float( acc_word[j] ) );`
		b1cb33
		b1cb33	`- else if ( inst == XVF16GER2NP )`
		b1cb33	`+ else if (( inst == XVF16GER2NP ) \|\| ( inst == XVBF16GER2NP ))`
		b1cb33	`result[j] = reinterpret_float_as_int(`
		b1cb33	`conv_double_to_float( negate_double( msum ) )`
		b1cb33	`+ acc_word[j] );`
		b1cb33
		b1cb33	`- else if ( inst == XVF16GER2NN )`
		b1cb33	`+ else if (( inst == XVF16GER2NN ) \|\| ( inst == XVBF16GER2NN ))`
		b1cb33	`result[j] = reinterpret_float_as_int(`
		b1cb33	`conv_double_to_float( negate_double( msum ) )`
		b1cb33	`+ negate_float( acc_word[j] ) );`
		b1cb33	`diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c`
		b1cb33	`index 354be6b53..20553a539 100644`
		b1cb33	`--- a/VEX/priv/guest_ppc_toIR.c`
		b1cb33	`+++ b/VEX/priv/guest_ppc_toIR.c`
		b1cb33	`@@ -5688,6 +5688,57 @@ static IRExpr * convert_from_national ( const VexAbiInfo* vbi, IRExpr *src ) {`
		b1cb33	`return mkexpr( result );`
		b1cb33	`}`
		b1cb33
		b1cb33	`+static IRExpr * vector_convert_floattobf16 ( const VexAbiInfo* vbi,`
		b1cb33	`+ IRExpr *src ) {`
		b1cb33	`+ /* The function takes 128-bit value containing four 32-bit floats and`
		b1cb33	`+ returns a 128-bit value containing four 16-bit bfloats in the lower`
		b1cb33	`+ halfwords. */`
		b1cb33	`+`
		b1cb33	`+ IRTemp resultHi = newTemp( Ity_I64);`
		b1cb33	`+ IRTemp resultLo = newTemp( Ity_I64);`
		b1cb33	`+`
		b1cb33	`+ assign( resultHi,`
		b1cb33	`+ mkIRExprCCall( Ity_I64, 0 /*regparms*/,`
		b1cb33	`+ "vector_convert_floattobf16_helper",`
		b1cb33	`+ fnptr_to_fnentry( vbi,`
		b1cb33	`+ &convert_from_floattobf16_helper ),`
		b1cb33	`+ mkIRExprVec_1( unop( Iop_V128HIto64, src ) ) ) );`
		b1cb33	`+`
		b1cb33	`+ assign( resultLo,`
		b1cb33	`+ mkIRExprCCall( Ity_I64, 0 /*regparms*/,`
		b1cb33	`+ "vector_convert_floattobf16_helper",`
		b1cb33	`+ fnptr_to_fnentry( vbi,`
		b1cb33	`+ &convert_from_floattobf16_helper ),`
		b1cb33	`+ mkIRExprVec_1( unop( Iop_V128to64, src ) ) ) );`
		b1cb33	`+`
		b1cb33	`+ return binop( Iop_64HLtoV128, mkexpr( resultHi ), mkexpr( resultLo ) );`
		b1cb33	`+}`
		b1cb33	`+`
		b1cb33	`+static IRExpr * vector_convert_bf16tofloat ( const VexAbiInfo* vbi,`
		b1cb33	`+ IRExpr *src ) {`
		b1cb33	`+ /* The function takes 128-bit value containing four 16-bit bfloats in`
		b1cb33	`+ the lower halfwords and returns a 128-bit value containing four`
		b1cb33	`+ 32-bit floats. */`
		b1cb33	`+ IRTemp resultHi = newTemp( Ity_I64);`
		b1cb33	`+ IRTemp resultLo = newTemp( Ity_I64);`
		b1cb33	`+`
		b1cb33	`+ assign( resultHi,`
		b1cb33	`+ mkIRExprCCall( Ity_I64, 0 /*regparms*/,`
		b1cb33	`+ "vector_convert_bf16tofloat_helper",`
		b1cb33	`+ fnptr_to_fnentry( vbi,`
		b1cb33	`+ &convert_from_bf16tofloat_helper ),`
		b1cb33	`+ mkIRExprVec_1( unop( Iop_V128HIto64, src ) ) ) );`
		b1cb33	`+`
		b1cb33	`+ assign( resultLo,`
		b1cb33	`+ mkIRExprCCall( Ity_I64, 0 /*regparms*/,`
		b1cb33	`+ "vector_convert_bf16tofloat_helper",`
		b1cb33	`+ fnptr_to_fnentry( vbi,`
		b1cb33	`+ &convert_from_bf16tofloat_helper ),`
		b1cb33	`+ mkIRExprVec_1( unop( Iop_V128to64, src ) ) ) );`
		b1cb33	`+`
		b1cb33	`+ return binop( Iop_64HLtoV128, mkexpr( resultHi ), mkexpr( resultLo ) );`
		b1cb33	`+}`
		b1cb33	`+`
		b1cb33	`static IRExpr * popcnt64 ( const VexAbiInfo* vbi,`
		b1cb33	`IRExpr *src ){`
		b1cb33	`/* The function takes a 64-bit source and counts the number of bits in the`
		b1cb33	`@@ -5936,6 +5987,7 @@ static void vsx_matrix_ger ( const VexAbiInfo* vbi,`
		b1cb33	`case XVI16GER2:`
		b1cb33	`case XVI16GER2S:`
		b1cb33	`case XVF16GER2:`
		b1cb33	`+ case XVBF16GER2:`
		b1cb33	`case XVF32GER:`
		b1cb33	`AT_fx = Ifx_Write;`
		b1cb33	`break;`
		b1cb33	`@@ -5943,6 +5995,10 @@ static void vsx_matrix_ger ( const VexAbiInfo* vbi,`
		b1cb33	`case XVI8GER4PP:`
		b1cb33	`case XVI16GER2PP:`
		b1cb33	`case XVI16GER2SPP:`
		b1cb33	`+ case XVBF16GER2PP:`
		b1cb33	`+ case XVBF16GER2PN:`
		b1cb33	`+ case XVBF16GER2NP:`
		b1cb33	`+ case XVBF16GER2NN:`
		b1cb33	`case XVF16GER2PP:`
		b1cb33	`case XVF16GER2PN:`
		b1cb33	`case XVF16GER2NP:`
		b1cb33	`@@ -23899,6 +23955,24 @@ dis_vxs_misc( UInt prefix, UInt theInstr, const VexAbiInfo* vbi, UInt opc2,`
		b1cb33	`mkexpr( sub_element1 ),`
		b1cb33	`mkexpr( sub_element0 ) ) ) );`
		b1cb33
		b1cb33	`+ } else if ((inst_select == 16) && !prefix) {`
		b1cb33	`+ IRTemp result = newTemp(Ity_V128);`
		b1cb33	`+ UChar xT_addr = ifieldRegXT ( theInstr );`
		b1cb33	`+ UChar xB_addr = ifieldRegXB ( theInstr );`
		b1cb33	`+ /* Convert 16-bit bfloat to 32-bit float, not a prefix inst */`
		b1cb33	`+ DIP("xvcvbf16sp v%u,v%u\n", xT_addr, xB_addr);`
		b1cb33	`+ assign( result, vector_convert_bf16tofloat( vbi, mkexpr( vB ) ) );`
		b1cb33	`+ putVSReg( XT, mkexpr( result) );`
		b1cb33	`+`
		b1cb33	`+ } else if ((inst_select == 17) && !prefix) {`
		b1cb33	`+ IRTemp result = newTemp(Ity_V128);`
		b1cb33	`+ UChar xT_addr = ifieldRegXT ( theInstr );`
		b1cb33	`+ UChar xB_addr = ifieldRegXB ( theInstr );`
		b1cb33	`+ /* Convert 32-bit float to 16-bit bfloat, not a prefix inst */`
		b1cb33	`+ DIP("xvcvspbf16 v%u,v%u\n", xT_addr, xB_addr);`
		b1cb33	`+ assign( result, vector_convert_floattobf16( vbi, mkexpr( vB ) ) );`
		b1cb33	`+ putVSReg( XT, mkexpr( result) );`
		b1cb33	`+`
		b1cb33	`} else if (inst_select == 23) {`
		b1cb33	`DIP("xxbrd v%u, v%u\n", (UInt)XT, (UInt)XB);`
		b1cb33
		b1cb33	`@@ -34956,6 +35030,41 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,`
		b1cb33	`getVSReg( rB_addr ), AT,`
		b1cb33	`( ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`break;`
		b1cb33	`+ case XVBF16GER2:`
		b1cb33	`+ DIP("xvbf16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,`
		b1cb33	`+ getVSReg( rA_addr ),`
		b1cb33	`+ getVSReg( rB_addr ), AT,`
		b1cb33	`+ ( ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+ case XVBF16GER2PP:`
		b1cb33	`+ DIP("xvbf16ger2pp %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,`
		b1cb33	`+ getVSReg( rA_addr ),`
		b1cb33	`+ getVSReg( rB_addr ), AT,`
		b1cb33	`+ ( ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+ case XVBF16GER2PN:`
		b1cb33	`+ DIP("xvbf16ger2pn %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,`
		b1cb33	`+ getVSReg( rA_addr ),`
		b1cb33	`+ getVSReg( rB_addr ), AT,`
		b1cb33	`+ ( ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+ case XVBF16GER2NP:`
		b1cb33	`+ DIP("xvbf16ger2np %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,`
		b1cb33	`+ getVSReg( rA_addr ),`
		b1cb33	`+ getVSReg( rB_addr ), AT,`
		b1cb33	`+ ( ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+ case XVBF16GER2NN:`
		b1cb33	`+ DIP("xvbf16ger2nn %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,`
		b1cb33	`+ getVSReg( rA_addr ),`
		b1cb33	`+ getVSReg( rB_addr ), AT,`
		b1cb33	`+ ( ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`case XVF32GER:`
		b1cb33	`DIP("xvf32ger %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`vsx_matrix_ger( vbi, MATRIX_32BIT_FLOAT_GER,`
		b1cb33	`@@ -35106,6 +35215,61 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,`
		b1cb33	`AT,`
		b1cb33	`( (MASKS << 9 ) \| ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`break;`
		b1cb33	`+ case XVBF16GER2:`
		b1cb33	`+ PMSK = IFIELD( prefix, 14, 2);`
		b1cb33	`+ XMSK = IFIELD( prefix, 4, 4);`
		b1cb33	`+ YMSK = IFIELD( prefix, 0, 4);`
		b1cb33	`+ DIP("pmxvbf16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,`
		b1cb33	`+ getVSReg( rA_addr ),`
		b1cb33	`+ getVSReg( rB_addr ),`
		b1cb33	`+ AT, ( (MASKS << 9 )`
		b1cb33	`+ \| ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+ case XVBF16GER2PP:`
		b1cb33	`+ PMSK = IFIELD( prefix, 14, 2);`
		b1cb33	`+ XMSK = IFIELD( prefix, 4, 4);`
		b1cb33	`+ YMSK = IFIELD( prefix, 0, 4);`
		b1cb33	`+ DIP("pmxvbf16ger2pp %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,`
		b1cb33	`+ getVSReg( rA_addr ),`
		b1cb33	`+ getVSReg( rB_addr ),`
		b1cb33	`+ AT, ( (MASKS << 9 )`
		b1cb33	`+ \| ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+ case XVBF16GER2PN:`
		b1cb33	`+ PMSK = IFIELD( prefix, 14, 2);`
		b1cb33	`+ XMSK = IFIELD( prefix, 4, 4);`
		b1cb33	`+ YMSK = IFIELD( prefix, 0, 4);`
		b1cb33	`+ DIP("pmxvbf16ger2pn %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,`
		b1cb33	`+ getVSReg( rA_addr ),`
		b1cb33	`+ getVSReg( rB_addr ),`
		b1cb33	`+ AT, ( (MASKS << 9 )`
		b1cb33	`+ \| ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+ case XVBF16GER2NP:`
		b1cb33	`+ PMSK = IFIELD( prefix, 14, 2);`
		b1cb33	`+ XMSK = IFIELD( prefix, 4, 4);`
		b1cb33	`+ YMSK = IFIELD( prefix, 0, 4);`
		b1cb33	`+ DIP("pmxvbf16ger2np %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,`
		b1cb33	`+ getVSReg( rA_addr ),`
		b1cb33	`+ getVSReg( rB_addr ),`
		b1cb33	`+ AT, ( (MASKS << 9 )`
		b1cb33	`+ \| ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+ case XVBF16GER2NN:`
		b1cb33	`+ PMSK = IFIELD( prefix, 14, 2);`
		b1cb33	`+ XMSK = IFIELD( prefix, 4, 4);`
		b1cb33	`+ YMSK = IFIELD( prefix, 0, 4);`
		b1cb33	`+ DIP("pmxvbf16ger2nn %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,`
		b1cb33	`+ getVSReg( rA_addr ),`
		b1cb33	`+ getVSReg( rB_addr ),`
		b1cb33	`+ AT, ( (MASKS << 9 )`
		b1cb33	`+ \| ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`case XVF16GER2:`
		b1cb33	`PMSK = IFIELD( prefix, 14, 2);`
		b1cb33	`XMSK = IFIELD( prefix, 4, 4);`
		b1cb33	`@@ -36181,6 +36345,11 @@ DisResult disInstr_PPC_WRK (`
		b1cb33	`(opc2 == XVI4GER8PP) \|\| // xvi4ger8pp`
		b1cb33	`(opc2 == XVI8GER4) \|\| // xvi8ger4`
		b1cb33	`(opc2 == XVI8GER4PP) \|\| // xvi8ger4pp`
		b1cb33	`+ (opc2 == XVBF16GER2) \|\| // xvbf16ger2`
		b1cb33	`+ (opc2 == XVBF16GER2PP) \|\| // xvbf16ger2pp`
		b1cb33	`+ (opc2 == XVBF16GER2PN) \|\| // xvbf16ger2pn`
		b1cb33	`+ (opc2 == XVBF16GER2NP) \|\| // xvbf16ger2np`
		b1cb33	`+ (opc2 == XVBF16GER2NN) \|\| // xvbf16ger2nn`
		b1cb33	`(opc2 == XVF16GER2) \|\| // xvf16ger2`
		b1cb33	`(opc2 == XVF16GER2PP) \|\| // xvf16ger2pp`
		b1cb33	`(opc2 == XVF16GER2PN) \|\| // xvf16ger2pn`
		b1cb33	`commit e09fdaf569b975717465ed8043820d0198d4d47d`
		b1cb33	`Author: Carl Love <cel@us.ibm.com>`
		b1cb33	`Date: Fri Feb 26 16:05:12 2021 -0600`
		b1cb33
		b1cb33	`PPC64: Reduced-Precision: Missing Integer-based Outer Product Operations`
		b1cb33
		b1cb33	`Add support for:`
		b1cb33
		b1cb33	`pmxvi16ger2 VSX Vector 16-bit Signed Integer GER (rank-2 update), Prefixed`
		b1cb33	`Masked`
		b1cb33	`pmxvi16ger2pp VSX Vector 16-bit Signed Integer GER (rank-2 update) (Positive`
		b1cb33	`multiply, Positive accumulate), Prefixed Masked`
		b1cb33	`pmxvi8ger4spp VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) with`
		b1cb33	`Saturation (Positive multiply, Positive accumulate), Prefixed Masked`
		b1cb33	`xvi16ger2 VSX Vector 16-bit Signed Integer GER (rank-2 update)`
		b1cb33	`xvi16ger2pp VSX Vector 16-bit Signed Integer GER (rank-2 update) (Positive`
		b1cb33	`multiply, Positive accumulate)`
		b1cb33	`xvi8ger4spp VSX Vector 8-bit Signed/Unsigned Integer GER (rank-4 update) with`
		b1cb33	`Saturation (Positive multiply, Positive accumulate)`
		b1cb33
		b1cb33	`diff --git a/VEX/priv/guest_ppc_helpers.c b/VEX/priv/guest_ppc_helpers.c`
		b1cb33	`index 6bcee966d..d8131eb60 100644`
		b1cb33	`--- a/VEX/priv/guest_ppc_helpers.c`
		b1cb33	`+++ b/VEX/priv/guest_ppc_helpers.c`
		b1cb33	`@@ -1446,16 +1446,16 @@ static UInt exts4( UInt src)`
		b1cb33	`return src & 0xF; /* make sure high order bits are zero */`
		b1cb33	`}`
		b1cb33
		b1cb33	`-static UInt exts8( UInt src)`
		b1cb33	`+static ULong exts8( UInt src)`
		b1cb33	`{`
		b1cb33	`- /* Input is an 8-bit value. Extend bit 7 to bits [31:8] */`
		b1cb33	`+ /* Input is an 8-bit value. Extend bit 7 to bits [63:8] */`
		b1cb33	`if (( src >> 7 ) & 0x1)`
		b1cb33	`- return src \| 0xFFFFFF00; /* sign bit is a 1, extend */`
		b1cb33	`+ return src \| 0xFFFFFFFFFFFFFF00ULL; /* sign bit is a 1, extend */`
		b1cb33	`else`
		b1cb33	`return src & 0xFF; /* make sure high order bits are zero */`
		b1cb33	`}`
		b1cb33
		b1cb33	`-static UInt extz8( UInt src)`
		b1cb33	`+static ULong extz8( UInt src)`
		b1cb33	`{`
		b1cb33	`/* Input is an 8-bit value. Extend src on the left with zeros. */`
		b1cb33	`return src & 0xFF; /* make sure high order bits are zero */`
		b1cb33	`@@ -1662,12 +1662,12 @@ void vsx_matrix_8bit_ger_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`ULong srcB_hi, ULong srcB_lo,`
		b1cb33	`UInt masks_inst )`
		b1cb33	`{`
		b1cb33	`- UInt i, j, mask, sum, inst, acc_entry, prefix_inst;`
		b1cb33	`+ UInt i, j, mask, inst, acc_entry, prefix_inst;`
		b1cb33
		b1cb33	`UInt srcA_bytes[4][4]; /* word, byte */`
		b1cb33	`UInt srcB_bytes[4][4]; /* word, byte */`
		b1cb33	`UInt acc_word[4];`
		b1cb33	`- UInt prod0, prod1, prod2, prod3;`
		b1cb33	`+ ULong prod0, prod1, prod2, prod3, sum;`
		b1cb33	`UInt result[4];`
		b1cb33	`UInt pmsk = 0;`
		b1cb33	`UInt xmsk = 0;`
		b1cb33	`@@ -1742,10 +1742,13 @@ void vsx_matrix_8bit_ger_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`sum = prod0 + prod1 + prod2 + prod3;`
		b1cb33
		b1cb33	`if ( inst == XVI8GER4 )`
		b1cb33	`- result[j] = sum;`
		b1cb33	`+ result[j] = chop64to32( sum );`
		b1cb33
		b1cb33	`else if ( inst == XVI8GER4PP )`
		b1cb33	`- result[j] = sum + acc_word[j];`
		b1cb33	`+ result[j] = chop64to32( sum + acc_word[j] );`
		b1cb33	`+`
		b1cb33	`+ else if ( inst == XVI8GER4SPP )`
		b1cb33	`+ result[j] = clampS64toS32(sum + acc_word[j]);`
		b1cb33
		b1cb33	`} else {`
		b1cb33	`result[j] = 0;`
		b1cb33	`@@ -1821,7 +1824,7 @@ void vsx_matrix_16bit_ger_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`else`
		b1cb33	`prod1 = exts16to64( srcA_word[i][1] )`
		b1cb33	`* exts16to64( srcB_word[j][1] );`
		b1cb33	`- /* sum is UInt so the result is choped to 32-bits */`
		b1cb33	`+`
		b1cb33	`sum = prod0 + prod1;`
		b1cb33
		b1cb33	`if ( inst == XVI16GER2 )`
		b1cb33	`@@ -1830,13 +1833,11 @@ void vsx_matrix_16bit_ger_dirty_helper( VexGuestPPC64State* gst,`
		b1cb33	`else if ( inst == XVI16GER2S )`
		b1cb33	`result[j] = clampS64toS32( sum );`
		b1cb33
		b1cb33	`- else if ( inst == XVI16GER2PP ) {`
		b1cb33	`+ else if ( inst == XVI16GER2PP )`
		b1cb33	`result[j] = chop64to32( sum + acc_word[j] );`
		b1cb33	`- }`
		b1cb33
		b1cb33	`- else if ( inst == XVI16GER2SPP ) {`
		b1cb33	`+ else if ( inst == XVI16GER2SPP )`
		b1cb33	`result[j] = clampS64toS32( sum + acc_word[j] );`
		b1cb33	`- }`
		b1cb33
		b1cb33	`} else {`
		b1cb33	`result[j] = 0;`
		b1cb33	`diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c`
		b1cb33	`index 20553a539..e54f0f389 100644`
		b1cb33	`--- a/VEX/priv/guest_ppc_toIR.c`
		b1cb33	`+++ b/VEX/priv/guest_ppc_toIR.c`
		b1cb33	`@@ -5993,6 +5993,7 @@ static void vsx_matrix_ger ( const VexAbiInfo* vbi,`
		b1cb33	`break;`
		b1cb33	`case XVI4GER8PP:`
		b1cb33	`case XVI8GER4PP:`
		b1cb33	`+ case XVI8GER4SPP:`
		b1cb33	`case XVI16GER2PP:`
		b1cb33	`case XVI16GER2SPP:`
		b1cb33	`case XVBF16GER2PP:`
		b1cb33	`@@ -34983,6 +34984,12 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,`
		b1cb33	`getVSReg( rA_addr ), getVSReg( rB_addr ),`
		b1cb33	`AT, ( ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`break;`
		b1cb33	`+ case XVI8GER4SPP:`
		b1cb33	`+ DIP("xvi8ger4spp %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_8BIT_INT_GER,`
		b1cb33	`+ getVSReg( rA_addr ), getVSReg( rB_addr ),`
		b1cb33	`+ AT, ( ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`case XVI16GER2S:`
		b1cb33	`DIP("xvi16ger2s %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,`
		b1cb33	`@@ -34995,6 +35002,19 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,`
		b1cb33	`getVSReg( rA_addr ), getVSReg( rB_addr ),`
		b1cb33	`AT, ( ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`break;`
		b1cb33	`+ case XVI16GER2:`
		b1cb33	`+ DIP("xvi16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,`
		b1cb33	`+ getVSReg( rA_addr ), getVSReg( rB_addr ),`
		b1cb33	`+ AT, ( ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+ case XVI16GER2PP:`
		b1cb33	`+ DIP("xvi16ger2pp %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,`
		b1cb33	`+ getVSReg( rA_addr ), getVSReg( rB_addr ),`
		b1cb33	`+ AT, ( ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+`
		b1cb33	`case XVF16GER2:`
		b1cb33	`DIP("xvf16ger2 %u,r%u, r%u\n", AT, rA_addr, rB_addr);`
		b1cb33	`vsx_matrix_ger( vbi, MATRIX_16BIT_FLOAT_GER,`
		b1cb33	`@@ -35193,6 +35213,39 @@ static Bool dis_vsx_accumulator_prefix ( UInt prefix, UInt theInstr,`
		b1cb33	`AT,`
		b1cb33	`( (MASKS << 9 ) \| ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`break;`
		b1cb33	`+ case XVI8GER4SPP:`
		b1cb33	`+ PMSK = IFIELD( prefix, 12, 4);`
		b1cb33	`+ XMSK = IFIELD( prefix, 4, 4);`
		b1cb33	`+ YMSK = IFIELD( prefix, 0, 4);`
		b1cb33	`+ DIP("pmxvi8ger4spp %u,r%u, r%u,%u,%u,%u\n",`
		b1cb33	`+ AT, rA_addr, rB_addr, XMSK, YMSK, PMSK);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_8BIT_INT_GER,`
		b1cb33	`+ getVSReg( rA_addr ), getVSReg( rB_addr ),`
		b1cb33	`+ AT,`
		b1cb33	`+ ( (MASKS << 9 ) \| ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+ case XVI16GER2:`
		b1cb33	`+ PMSK = IFIELD( prefix, 12, 4);`
		b1cb33	`+ XMSK = IFIELD( prefix, 4, 4);`
		b1cb33	`+ YMSK = IFIELD( prefix, 0, 4);`
		b1cb33	`+ DIP("pmxvi16ger2 %u,r%u, r%u,%u,%u,%u\n",`
		b1cb33	`+ AT, rA_addr, rB_addr, XMSK, YMSK, PMSK);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,`
		b1cb33	`+ getVSReg( rA_addr ), getVSReg( rB_addr ),`
		b1cb33	`+ AT,`
		b1cb33	`+ ( (MASKS << 9 ) \| ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`+ case XVI16GER2PP:`
		b1cb33	`+ PMSK = IFIELD( prefix, 12, 4);`
		b1cb33	`+ XMSK = IFIELD( prefix, 4, 4);`
		b1cb33	`+ YMSK = IFIELD( prefix, 0, 4);`
		b1cb33	`+ DIP("pmxvi16ger2pp %u,r%u, r%u,%u,%u,%u\n",`
		b1cb33	`+ AT, rA_addr, rB_addr, XMSK, YMSK, PMSK);`
		b1cb33	`+ vsx_matrix_ger( vbi, MATRIX_16BIT_INT_GER,`
		b1cb33	`+ getVSReg( rA_addr ), getVSReg( rB_addr ),`
		b1cb33	`+ AT,`
		b1cb33	`+ ( (MASKS << 9 ) \| ( inst_prefix << 8 ) \| XO ) );`
		b1cb33	`+ break;`
		b1cb33	`case XVI16GER2S:`
		b1cb33	`PMSK = IFIELD( prefix, 14, 2);`
		b1cb33	`XMSK = IFIELD( prefix, 4, 4);`
		b1cb33	`@@ -36345,6 +36398,9 @@ DisResult disInstr_PPC_WRK (`
		b1cb33	`(opc2 == XVI4GER8PP) \|\| // xvi4ger8pp`
		b1cb33	`(opc2 == XVI8GER4) \|\| // xvi8ger4`
		b1cb33	`(opc2 == XVI8GER4PP) \|\| // xvi8ger4pp`
		b1cb33	`+ (opc2 == XVI8GER4SPP) \|\| // xvi8ger4spp`
		b1cb33	`+ (opc2 == XVI16GER2) \|\| // xvi16ger2`
		b1cb33	`+ (opc2 == XVI16GER2PP) \|\| // xvi16ger2pp`
		b1cb33	`(opc2 == XVBF16GER2) \|\| // xvbf16ger2`
		b1cb33	`(opc2 == XVBF16GER2PP) \|\| // xvbf16ger2pp`
		b1cb33	`(opc2 == XVBF16GER2PN) \|\| // xvbf16ger2pn`

rpms / valgrind

Source Code

Blame SOURCES/valgrind-3.17.0-ppc64-isa-3.1.patch