From b3eda9b80b31bacf6ad1ef1f01a15efba10ff139 Mon Sep 17 00:00:00 2001 From: Mark Wielaard Date: Nov 23 2018 21:31:07 +0000 Subject: 3.14.0-4 gcc ppc64le inlined memcmp vs memcheck (#1652926) - Add valgrind-3.14.0-get_otrack_shadow_offset_wrk-ppc.patch, valgrind-3.14.0-new-strlen-IROps.patch, valgrind-3.14.0-ppc-instr-new-IROps.patch, valgrind-3.14.0-memcheck-new-IROps.patch, valgrind-3.14.0-ppc-frontend-new-IROps.patch, valgrind-3.14.0-transform-popcount64-ctznat64.patch and valgrind-3.14.0-enable-ppc-Iop_Sar_Shr8.patch (#1652926) --- diff --git a/valgrind-3.14.0-enable-ppc-Iop_Sar_Shr8.patch b/valgrind-3.14.0-enable-ppc-Iop_Sar_Shr8.patch new file mode 100644 index 0000000..8101f93 --- /dev/null +++ b/valgrind-3.14.0-enable-ppc-Iop_Sar_Shr8.patch @@ -0,0 +1,18 @@ +commit 27fe22378da38424102c5292b782cacdd9d7b9e4 +Author: Julian Seward +Date: Tue Nov 20 12:09:03 2018 +0100 + + Add support for Iop_{Sar,Shr}8 on ppc. --expensive-definedness-checks=yes needs them. + +diff --git a/VEX/priv/host_ppc_isel.c b/VEX/priv/host_ppc_isel.c +index 5242176..750cf8d 100644 +--- a/VEX/priv/host_ppc_isel.c ++++ b/VEX/priv/host_ppc_isel.c +@@ -1528,7 +1528,6 @@ static HReg iselWordExpr_R_wrk ( ISelEnv* env, const IRExpr* e, + True/*32bit shift*/, + tmp, tmp, amt)); + r_srcL = tmp; +- vassert(0); /* AWAITING TEST CASE */ + } + } + /* Only 64 expressions need 64bit shifts, diff --git a/valgrind-3.14.0-get_otrack_shadow_offset_wrk-ppc.patch b/valgrind-3.14.0-get_otrack_shadow_offset_wrk-ppc.patch new file mode 100644 index 0000000..d9df0d9 --- /dev/null +++ b/valgrind-3.14.0-get_otrack_shadow_offset_wrk-ppc.patch @@ -0,0 +1,81 @@ +commit 7f1dd9d5aec1f1fd4eb0ae3a311358a914f1d73f +Author: Julian Seward +Date: Tue Nov 20 10:18:29 2018 +0100 + + get_otrack_shadow_offset_wrk for ppc32 and ppc64: add missing cases for XER_OV32, XER_CA32 and C_FPCC. + + The missing cases were discovered whilst testing fixes for bug 386945, but are + otherwise unrelated to that bug. + +diff --git a/memcheck/mc_machine.c b/memcheck/mc_machine.c +index 5ed101f..4ce746e 100644 +--- a/memcheck/mc_machine.c ++++ b/memcheck/mc_machine.c +@@ -120,11 +120,11 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB ) + Int o = offset; + tl_assert(sz > 0); + +-#if defined(VGA_ppc64be) ++# if defined(VGA_ppc64be) + tl_assert(host_is_big_endian()); +-#elif defined(VGA_ppc64le) ++# elif defined(VGA_ppc64le) + tl_assert(host_is_little_endian()); +-#endif ++# endif + + if (sz == 8 || sz == 4) { + /* The point of this is to achieve +@@ -132,11 +132,11 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB ) + return GOF(GPRn); + by testing ox instead of o, and setting ox back 4 bytes when sz == 4. + */ +-#if defined(VGA_ppc64le) ++# if defined(VGA_ppc64le) + Int ox = o; +-#else ++# else + Int ox = sz == 8 ? o : (o - 4); +-#endif ++# endif + if (ox == GOF(GPR0)) return ox; + if (ox == GOF(GPR1)) return ox; + if (ox == GOF(GPR2)) return ox; +@@ -240,11 +240,13 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB ) + if (o == GOF(VSR31) && sz == 8) return o; + + /* For the various byte sized XER/CR pieces, use offset 8 +- in VSR0 .. VSR19. */ ++ in VSR0 .. VSR21. 
*/ + tl_assert(SZB(VSR0) == 16); + if (o == GOF(XER_SO) && sz == 1) return 8 +GOF(VSR0); + if (o == GOF(XER_OV) && sz == 1) return 8 +GOF(VSR1); ++ if (o == GOF(XER_OV32) && sz == 1) return 8 +GOF(VSR20); + if (o == GOF(XER_CA) && sz == 1) return 8 +GOF(VSR2); ++ if (o == GOF(XER_CA32) && sz == 1) return 8 +GOF(VSR21); + if (o == GOF(XER_BC) && sz == 1) return 8 +GOF(VSR3); + + if (o == GOF(CR0_321) && sz == 1) return 8 +GOF(VSR4); +@@ -388,6 +390,7 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB ) + if (o == GOF(IP_AT_SYSCALL) && sz == 4) return -1; /* slot unused */ + if (o == GOF(FPROUND) && sz == 1) return -1; + if (o == GOF(DFPROUND) && sz == 1) return -1; ++ if (o == GOF(C_FPCC) && sz == 1) return -1; + if (o == GOF(VRSAVE) && sz == 4) return -1; + if (o == GOF(EMNOTE) && sz == 4) return -1; + if (o == GOF(CMSTART) && sz == 4) return -1; +@@ -440,11 +443,13 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB ) + if (o == GOF(VSR31) && sz == 8) return o; + + /* For the various byte sized XER/CR pieces, use offset 8 +- in VSR0 .. VSR19. */ ++ in VSR0 .. VSR21. */ + tl_assert(SZB(VSR0) == 16); + if (o == GOF(XER_SO) && sz == 1) return 8 +GOF(VSR0); + if (o == GOF(XER_OV) && sz == 1) return 8 +GOF(VSR1); ++ if (o == GOF(XER_OV32) && sz == 1) return 8 +GOF(VSR20); + if (o == GOF(XER_CA) && sz == 1) return 8 +GOF(VSR2); ++ if (o == GOF(XER_CA32) && sz == 1) return 8 +GOF(VSR21); + if (o == GOF(XER_BC) && sz == 1) return 8 +GOF(VSR3); + + if (o == GOF(CR0_321) && sz == 1) return 8 +GOF(VSR4); diff --git a/valgrind-3.14.0-memcheck-new-IROps.patch b/valgrind-3.14.0-memcheck-new-IROps.patch new file mode 100644 index 0000000..79e7113 --- /dev/null +++ b/valgrind-3.14.0-memcheck-new-IROps.patch @@ -0,0 +1,453 @@ +commit e221eca26be6b2396e3fcbf4117e630fc22e79f6 +Author: Julian Seward +Date: Tue Nov 20 11:28:42 2018 +0100 + + Add Memcheck support for IROps added in 42719898. + + memcheck/mc_translate.c: + + Add mkRight{32,64} as right-travelling analogues to mkLeft{32,64}. + + doCmpORD: for the cases of a signed comparison against zero, compute + definedness of the 3 result bits (lt,gt,eq) separately, and, for the lt and eq + bits, do it exactly accurately. + + expensiveCountTrailingZeroes: no functional change. Re-analyse/verify and add + comments. + + expensiveCountLeadingZeroes: add. Very similar to + expensiveCountTrailingZeroes. + + Add some comments to mark unary ops which are self-shadowing. + + Route Iop_Ctz{,Nat}{32,64} through expensiveCountTrailingZeroes. + Route Iop_Clz{,Nat}{32,64} through expensiveCountLeadingZeroes. + + Add instrumentation for Iop_PopCount{32,64} and Iop_Reverse8sIn32_x1. + + memcheck/tests/vbit-test/irops.c + + Add dummy new entries for all new IROps, just enough to make it compile and + run. + +diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c +index 68a2ab3..c24db91 100644 +--- a/memcheck/mc_translate.c ++++ b/memcheck/mc_translate.c +@@ -737,6 +737,34 @@ static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) { + return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1)); + } + ++/* --------- The Right-family of operations. --------- */ ++ ++/* Unfortunately these are a lot more expensive then their Left ++ counterparts. Fortunately they are only very rarely used -- only for ++ count-leading-zeroes instrumentation. 
*/ ++ ++static IRAtom* mkRight32 ( MCEnv* mce, IRAtom* a1 ) ++{ ++ for (Int i = 1; i <= 16; i *= 2) { ++ // a1 |= (a1 >>u i) ++ IRAtom* tmp ++ = assignNew('V', mce, Ity_I32, binop(Iop_Shr32, a1, mkU8(i))); ++ a1 = assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, tmp)); ++ } ++ return a1; ++} ++ ++static IRAtom* mkRight64 ( MCEnv* mce, IRAtom* a1 ) ++{ ++ for (Int i = 1; i <= 32; i *= 2) { ++ // a1 |= (a1 >>u i) ++ IRAtom* tmp ++ = assignNew('V', mce, Ity_I64, binop(Iop_Shr64, a1, mkU8(i))); ++ a1 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, tmp)); ++ } ++ return a1; ++} ++ + /* --------- 'Improvement' functions for AND/OR. --------- */ + + /* ImproveAND(data, vbits) = data OR vbits. Defined (0) data 0s give +@@ -1280,20 +1308,18 @@ static IRAtom* doCmpORD ( MCEnv* mce, + IRAtom* xxhash, IRAtom* yyhash, + IRAtom* xx, IRAtom* yy ) + { +- Bool m64 = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U; +- Bool syned = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S; +- IROp opOR = m64 ? Iop_Or64 : Iop_Or32; +- IROp opAND = m64 ? Iop_And64 : Iop_And32; +- IROp opSHL = m64 ? Iop_Shl64 : Iop_Shl32; +- IROp opSHR = m64 ? Iop_Shr64 : Iop_Shr32; +- IRType ty = m64 ? Ity_I64 : Ity_I32; +- Int width = m64 ? 64 : 32; ++ Bool m64 = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U; ++ Bool syned = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S; ++ IROp opOR = m64 ? Iop_Or64 : Iop_Or32; ++ IROp opAND = m64 ? Iop_And64 : Iop_And32; ++ IROp opSHL = m64 ? Iop_Shl64 : Iop_Shl32; ++ IROp opSHR = m64 ? Iop_Shr64 : Iop_Shr32; ++ IROp op1UtoWS = m64 ? Iop_1Uto64 : Iop_1Uto32; ++ IRType ty = m64 ? Ity_I64 : Ity_I32; ++ Int width = m64 ? 64 : 32; + + Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32; + +- IRAtom* threeLeft1 = NULL; +- IRAtom* sevenLeft1 = NULL; +- + tl_assert(isShadowAtom(mce,xxhash)); + tl_assert(isShadowAtom(mce,yyhash)); + tl_assert(isOriginalAtom(mce,xx)); +@@ -1312,30 +1338,55 @@ static IRAtom* doCmpORD ( MCEnv* mce, + /* fancy interpretation */ + /* if yy is zero, then it must be fully defined (zero#). */ + tl_assert(isZero(yyhash)); +- threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1); ++ // This is still inaccurate, but I don't think it matters, since ++ // nobody writes code of the form ++ // "is signedly greater than zero?". ++ // We therefore simply declare "x >s 0" to be undefined if any bit in ++ // x is undefined. That's clearly suboptimal in some cases. Eg, if ++ // the highest order bit is a defined 1 then x is negative so it ++ // doesn't matter whether the remaining bits are defined or not. ++ IRAtom* t_0_gt_0_0 ++ = assignNew( ++ 'V', mce,ty, ++ binop( ++ opAND, ++ mkPCastTo(mce,ty, xxhash), ++ m64 ? mkU64(1<<2) : mkU32(1<<2) ++ )); ++ // For "x >u 1) ++ // ++ // That is, improver has its upper clz(atom)+1 bits equal to one; ++ // lower bits (if any) equal to zero. So it's exactly the right ++ // mask to use to remove the irrelevant undefined input bits. 
++ /* Here are some examples: ++ atom = 0...0 1 U...U ++ R(atom) = 0...0 1 1...1 ++ R(atom) >>u 1 = 0...0 0 1...1 ++ ~(R(atom) >>u 1) = 1...1 1 0...0 ++ which correctly describes which bits of |atom| ++ actually influence the result ++ A boundary case ++ atom = 0...0 ++ R(atom) = 0...0 ++ R(atom) >>u 1 = 0...0 ++ ~(R(atom) >>u 1) = 1...1 ++ also a correct mask for the input: all input bits ++ are relevant ++ Another boundary case ++ atom = 1 1..1 ++ R(atom) = 1 1..1 ++ R(atom) >>u 1 = 0 1..1 ++ ~(R(atom) >>u 1) = 1 0..0 ++ also a correct mask: only the leftmost input bit ++ is relevant ++ Now with misc U bits interspersed: ++ atom = 0...0 1 U...U 0 1 U...U ++ R(atom) = 0...0 1 1...1 1 1 1...1 ++ R(atom) >>u 1 = 0...0 0 1...1 1 1 1...1 ++ ~(R(atom) >>u 1) = 1...1 1 0...0 0 0 0...0, also correct ++ (Per initial implementation of 15 Nov 2018) ++ */ ++ improver = mkRight(mce, atom); ++ improver = assignNew('V', mce, ty, binop(shrOp, improver, mkU8(1))); ++ improver = assignNew('V', mce, ty, unop(notOp, improver)); ++ ++ // improved = vatom & improver ++ // ++ // That is, treat any V bits to the right of the leftmost clz(atom)+1 ++ // bits as "defined". + improved = assignNew('V', mce, ty, + binop(andOp, vatom, improver)); + +@@ -4705,6 +4866,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) + case Iop_RecipEst32F0x4: + return unary32F0x4(mce, vatom); + ++ // These are self-shadowing. + case Iop_32UtoV128: + case Iop_64UtoV128: + case Iop_Dup8x16: +@@ -4745,6 +4907,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) + case Iop_MulI128by10Carry: + case Iop_F16toF64x2: + case Iop_F64toF16x2: ++ // FIXME JRS 2018-Nov-15. This is surely not correct! + return vatom; + + case Iop_I32StoF128: /* signed I32 -> F128 */ +@@ -4770,7 +4933,6 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) + case Iop_RoundF64toF64_NegINF: + case Iop_RoundF64toF64_PosINF: + case Iop_RoundF64toF64_ZERO: +- case Iop_Clz64: + case Iop_D32toD64: + case Iop_I32StoD64: + case Iop_I32UtoD64: +@@ -4785,17 +4947,32 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) + case Iop_D64toD128: + return mkPCastTo(mce, Ity_I128, vatom); + +- case Iop_Clz32: + case Iop_TruncF64asF32: + case Iop_NegF32: + case Iop_AbsF32: + case Iop_F16toF32: + return mkPCastTo(mce, Ity_I32, vatom); + +- case Iop_Ctz32: +- case Iop_Ctz64: ++ case Iop_Ctz32: case Iop_CtzNat32: ++ case Iop_Ctz64: case Iop_CtzNat64: + return expensiveCountTrailingZeroes(mce, op, atom, vatom); + ++ case Iop_Clz32: case Iop_ClzNat32: ++ case Iop_Clz64: case Iop_ClzNat64: ++ return expensiveCountLeadingZeroes(mce, op, atom, vatom); ++ ++ // PopCount32: this is slightly pessimistic. It is true that the ++ // result depends on all input bits, so that aspect of the PCast is ++ // correct. However, regardless of the input, only the lowest 5 bits ++ // out of the output can ever be undefined. So we could actually ++ // "improve" the results here by marking the top 27 bits of output as ++ // defined. A similar comment applies for PopCount64. ++ case Iop_PopCount32: ++ return mkPCastTo(mce, Ity_I32, vatom); ++ case Iop_PopCount64: ++ return mkPCastTo(mce, Ity_I64, vatom); ++ ++ // These are self-shadowing. + case Iop_1Uto64: + case Iop_1Sto64: + case Iop_8Uto64: +@@ -4821,6 +4998,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) + case Iop_V256to64_2: case Iop_V256to64_3: + return assignNew('V', mce, Ity_I64, unop(op, vatom)); + ++ // These are self-shadowing. 
+ case Iop_64to32: + case Iop_64HIto32: + case Iop_1Uto32: +@@ -4830,8 +5008,10 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) + case Iop_16Sto32: + case Iop_8Sto32: + case Iop_V128to32: ++ case Iop_Reverse8sIn32_x1: + return assignNew('V', mce, Ity_I32, unop(op, vatom)); + ++ // These are self-shadowing. + case Iop_8Sto16: + case Iop_8Uto16: + case Iop_32to16: +@@ -4840,6 +5020,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) + case Iop_GetMSBs8x16: + return assignNew('V', mce, Ity_I16, unop(op, vatom)); + ++ // These are self-shadowing. + case Iop_1Uto8: + case Iop_1Sto8: + case Iop_16to8: +@@ -4868,6 +5049,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) + case Iop_Not16: + case Iop_Not8: + case Iop_Not1: ++ // FIXME JRS 2018-Nov-15. This is surely not correct! + return vatom; + + case Iop_CmpNEZ8x8: +@@ -4929,6 +5111,7 @@ IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom ) + case Iop_Ctz64x2: + return mkPCast64x2(mce, vatom); + ++ // This is self-shadowing. + case Iop_PwBitMtxXpose64x2: + return assignNew('V', mce, Ity_V128, unop(op, vatom)); + +diff --git a/memcheck/tests/vbit-test/irops.c b/memcheck/tests/vbit-test/irops.c +index bfd82fc..e8bf67d 100644 +--- a/memcheck/tests/vbit-test/irops.c ++++ b/memcheck/tests/vbit-test/irops.c +@@ -111,6 +111,12 @@ static irop_t irops[] = { + { DEFOP(Iop_Clz32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 }, + { DEFOP(Iop_Ctz64, UNDEF_ALL), .s390x = 0, .amd64 = 1, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 0 }, + { DEFOP(Iop_Ctz32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 1, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 0 }, ++ { DEFOP(Iop_ClzNat64, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 }, // ppc32 asserts ++ { DEFOP(Iop_ClzNat32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 1, .mips32 =0, .mips64 = 0 }, ++ { DEFOP(Iop_CtzNat64, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 }, ++ { DEFOP(Iop_CtzNat32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 1, .mips32 =0, .mips64 = 0 }, ++ { DEFOP(Iop_PopCount64, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 0, .mips32 =0, .mips64 = 0 }, ++ { DEFOP(Iop_PopCount32, UNDEF_ALL), .s390x = 0, .amd64 = 0, .x86 = 0, .arm = 0, .ppc64 = 1, .ppc32 = 1, .mips32 =0, .mips64 = 0 }, + { DEFOP(Iop_CmpLT32S, UNDEF_ALL), .s390x = 1, .amd64 = 1, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 }, + { DEFOP(Iop_CmpLT64S, UNDEF_ALL), .s390x = 1, .amd64 = 1, .x86 = 0, .arm = 0, .ppc64 = 0, .ppc32 = 0, .mips32 =0, .mips64 = 1 }, // ppc, mips assert + { DEFOP(Iop_CmpLE32S, UNDEF_ALL), .s390x = 1, .amd64 = 1, .x86 = 1, .arm = 1, .ppc64 = 1, .ppc32 = 1, .mips32 =1, .mips64 = 1 }, +@@ -336,6 +342,7 @@ static irop_t irops[] = { + { DEFOP(Iop_Sad8Ux4, UNDEF_UNKNOWN), }, + { DEFOP(Iop_CmpNEZ16x2, UNDEF_UNKNOWN), }, + { DEFOP(Iop_CmpNEZ8x4, UNDEF_UNKNOWN), }, ++ { DEFOP(Iop_Reverse8sIn32_x1, UNDEF_UNKNOWN) }, + /* ------------------ 64-bit SIMD FP ------------------------ */ + { DEFOP(Iop_I32UtoFx2, UNDEF_UNKNOWN), }, + { DEFOP(Iop_I32StoFx2, UNDEF_UNKNOWN), }, diff --git a/valgrind-3.14.0-new-strlen-IROps.patch b/valgrind-3.14.0-new-strlen-IROps.patch new file mode 100644 index 0000000..d6587d8 --- /dev/null +++ 
b/valgrind-3.14.0-new-strlen-IROps.patch @@ -0,0 +1,124 @@ +commit 4271989815b5fc933c1e29bc75507c2726dc3738 +Author: Julian Seward +Date: Tue Nov 20 10:52:33 2018 +0100 + + Add some new IROps to support improved Memcheck analysis of strlen etc. + + This is part of the fix for bug 386945. It adds the following IROps, plus + their supporting type- and printing- fragments: + + Iop_Reverse8sIn32_x1: 32-bit byteswap. A fancy name, but it is consistent + with naming for the other swapping IROps that already exist. + + Iop_PopCount64, Iop_PopCount32: population count + + Iop_ClzNat64, Iop_ClzNat32, Iop_CtzNat64, Iop_CtzNat32: counting leading and + trailing zeroes, with "natural" (Nat) semantics for a zero input, meaning, in + the case of zero input, return the number of bits in the word. These + functionally overlap with the existing Iop_Clz64, Iop_Clz32, Iop_Ctz64, + Iop_Ctz32. The existing operations are undefined in case of a zero input. + Adding these new variants avoids the complexity of having to change the + declared semantics of the existing operations. Instead they are deprecated + but still available for use. + +diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c +index 823b6be..3221033 100644 +--- a/VEX/priv/ir_defs.c ++++ b/VEX/priv/ir_defs.c +@@ -194,6 +194,14 @@ void ppIROp ( IROp op ) + case Iop_Ctz64: vex_printf("Ctz64"); return; + case Iop_Ctz32: vex_printf("Ctz32"); return; + ++ case Iop_ClzNat64: vex_printf("ClzNat64"); return; ++ case Iop_ClzNat32: vex_printf("ClzNat32"); return; ++ case Iop_CtzNat64: vex_printf("CtzNat64"); return; ++ case Iop_CtzNat32: vex_printf("CtzNat32"); return; ++ ++ case Iop_PopCount64: vex_printf("PopCount64"); return; ++ case Iop_PopCount32: vex_printf("PopCount32"); return; ++ + case Iop_CmpLT32S: vex_printf("CmpLT32S"); return; + case Iop_CmpLE32S: vex_printf("CmpLE32S"); return; + case Iop_CmpLT32U: vex_printf("CmpLT32U"); return; +@@ -395,6 +403,7 @@ void ppIROp ( IROp op ) + + case Iop_CmpNEZ16x2: vex_printf("CmpNEZ16x2"); return; + case Iop_CmpNEZ8x4: vex_printf("CmpNEZ8x4"); return; ++ case Iop_Reverse8sIn32_x1: vex_printf("Reverse8sIn32_x1"); return; + + case Iop_CmpF64: vex_printf("CmpF64"); return; + +@@ -2719,6 +2728,7 @@ void typeOfPrimop ( IROp op, + UNARY(Ity_I16, Ity_I16); + case Iop_Not32: + case Iop_CmpNEZ16x2: case Iop_CmpNEZ8x4: ++ case Iop_Reverse8sIn32_x1: + UNARY(Ity_I32, Ity_I32); + + case Iop_Not64: +@@ -2782,9 +2792,13 @@ void typeOfPrimop ( IROp op, + BINARY(Ity_I64,Ity_I64, Ity_I128); + + case Iop_Clz32: case Iop_Ctz32: ++ case Iop_ClzNat32: case Iop_CtzNat32: ++ case Iop_PopCount32: + UNARY(Ity_I32, Ity_I32); + + case Iop_Clz64: case Iop_Ctz64: ++ case Iop_ClzNat64: case Iop_CtzNat64: ++ case Iop_PopCount64: + UNARY(Ity_I64, Ity_I64); + + case Iop_DivU32: case Iop_DivS32: case Iop_DivU32E: case Iop_DivS32E: +diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h +index 17bcb55..93fa5ac 100644 +--- a/VEX/pub/libvex_ir.h ++++ b/VEX/pub/libvex_ir.h +@@ -452,12 +452,21 @@ typedef + Iop_MullS8, Iop_MullS16, Iop_MullS32, Iop_MullS64, + Iop_MullU8, Iop_MullU16, Iop_MullU32, Iop_MullU64, + +- /* Wierdo integer stuff */ ++ /* Counting bits */ ++ /* Ctz64/Ctz32/Clz64/Clz32 are UNDEFINED when given arguments of zero. ++ You must ensure they are never given a zero argument. As of ++ 2018-Nov-14 they are deprecated. Try to use the Nat variants ++ immediately below, if you can. 
++ */ + Iop_Clz64, Iop_Clz32, /* count leading zeroes */ + Iop_Ctz64, Iop_Ctz32, /* count trailing zeros */ +- /* Ctz64/Ctz32/Clz64/Clz32 are UNDEFINED when given arguments of +- zero. You must ensure they are never given a zero argument. +- */ ++ /* Count leading/trailing zeroes, with "natural" semantics for the ++ case where the input is zero: then the result is the number of bits ++ in the word. */ ++ Iop_ClzNat64, Iop_ClzNat32, ++ Iop_CtzNat64, Iop_CtzNat32, ++ /* Population count -- compute the number of 1 bits in the argument. */ ++ Iop_PopCount64, Iop_PopCount32, + + /* Standard integer comparisons */ + Iop_CmpLT32S, Iop_CmpLT64S, +@@ -831,6 +840,9 @@ typedef + /* MISC (vector integer cmp != 0) */ + Iop_CmpNEZ16x2, Iop_CmpNEZ8x4, + ++ /* Byte swap in a 32-bit word */ ++ Iop_Reverse8sIn32_x1, ++ + /* ------------------ 64-bit SIMD FP ------------------------ */ + + /* Convertion to/from int */ +@@ -1034,8 +1046,9 @@ typedef + Iop_Slice64, // (I64, I64, I8) -> I64 + + /* REVERSE the order of chunks in vector lanes. Chunks must be +- smaller than the vector lanes (obviously) and so may be 8-, +- 16- and 32-bit in size. */ ++ smaller than the vector lanes (obviously) and so may be 8-, 16- and ++ 32-bit in size. Note that the degenerate case, ++ Iop_Reverse8sIn64_x1, is a simply a vanilla byte-swap. */ + /* Examples: + Reverse8sIn16_x4([a,b,c,d,e,f,g,h]) = [b,a,d,c,f,e,h,g] + Reverse8sIn32_x2([a,b,c,d,e,f,g,h]) = [d,c,b,a,h,g,f,e] diff --git a/valgrind-3.14.0-ppc-frontend-new-IROps.patch b/valgrind-3.14.0-ppc-frontend-new-IROps.patch new file mode 100644 index 0000000..a550975 --- /dev/null +++ b/valgrind-3.14.0-ppc-frontend-new-IROps.patch @@ -0,0 +1,381 @@ +commit 81d9832226d6e3d1ee78ee3133189d7b520e7eea +Author: Julian Seward +Date: Tue Nov 20 11:36:53 2018 +0100 + + ppc front end: use new IROps added in 42719898. + + This pertains to bug 386945. + + VEX/priv/guest_ppc_toIR.c: + + gen_POPCOUNT: use Iop_PopCount{32,64} where possible. + + gen_vpopcntd_mode32: use Iop_PopCount32. + + for cntlz{w,d}, use Iop_CtzNat{32,64}. + + gen_byterev32: use Iop_Reverse8sIn32_x1 instead of lengthy sequence. + + verbose_Clz32: remove (was unused anyway). + +diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c +index cb1cae1..8977d4f 100644 +--- a/VEX/priv/guest_ppc_toIR.c ++++ b/VEX/priv/guest_ppc_toIR.c +@@ -1595,7 +1595,8 @@ typedef enum { + /* Generate an IR sequence to do a popcount operation on the supplied + IRTemp, and return a new IRTemp holding the result. 'ty' may be + Ity_I32 or Ity_I64 only. */ +-static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src, _popcount_data_type data_type ) ++static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src, ++ _popcount_data_type data_type ) + { + /* Do count across 2^data_type bits, + byte: data_type = 3 +@@ -1611,6 +1612,22 @@ static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src, _popcount_data_type data_typ + + vassert(ty == Ity_I64 || ty == Ity_I32); + ++ // Use a single IROp in cases where we can. ++ ++ if (ty == Ity_I64 && data_type == DWORD) { ++ IRTemp res = newTemp(Ity_I64); ++ assign(res, unop(Iop_PopCount64, mkexpr(src))); ++ return res; ++ } ++ ++ if (ty == Ity_I32 && data_type == WORD) { ++ IRTemp res = newTemp(Ity_I32); ++ assign(res, unop(Iop_PopCount32, mkexpr(src))); ++ return res; ++ } ++ ++ // For the rest, we have to do it the slow way. 
++ + if (ty == Ity_I32) { + + for (idx = 0; idx < WORD; idx++) { +@@ -1638,7 +1655,7 @@ static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src, _popcount_data_type data_typ + return nyu; + } + +-// else, ty == Ity_I64 ++ // else, ty == Ity_I64 + vassert(mode64); + + for (i = 0; i < DWORD; i++) { +@@ -1670,52 +1687,15 @@ static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src, _popcount_data_type data_typ + */ + static IRTemp gen_vpopcntd_mode32 ( IRTemp src1, IRTemp src2 ) + { +- Int i, shift[6]; +- IRTemp mask[6]; +- IRTemp old = IRTemp_INVALID; +- IRTemp nyu1 = IRTemp_INVALID; +- IRTemp nyu2 = IRTemp_INVALID; + IRTemp retval = newTemp(Ity_I64); + + vassert(!mode64); + +- for (i = 0; i < WORD; i++) { +- mask[i] = newTemp(Ity_I32); +- shift[i] = 1 << i; +- } +- assign(mask[0], mkU32(0x55555555)); +- assign(mask[1], mkU32(0x33333333)); +- assign(mask[2], mkU32(0x0F0F0F0F)); +- assign(mask[3], mkU32(0x00FF00FF)); +- assign(mask[4], mkU32(0x0000FFFF)); +- old = src1; +- for (i = 0; i < WORD; i++) { +- nyu1 = newTemp(Ity_I32); +- assign(nyu1, +- binop(Iop_Add32, +- binop(Iop_And32, +- mkexpr(old), +- mkexpr(mask[i])), +- binop(Iop_And32, +- binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])), +- mkexpr(mask[i])))); +- old = nyu1; +- } +- +- old = src2; +- for (i = 0; i < WORD; i++) { +- nyu2 = newTemp(Ity_I32); +- assign(nyu2, +- binop(Iop_Add32, +- binop(Iop_And32, +- mkexpr(old), +- mkexpr(mask[i])), +- binop(Iop_And32, +- binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])), +- mkexpr(mask[i])))); +- old = nyu2; +- } +- assign(retval, unop(Iop_32Uto64, binop(Iop_Add32, mkexpr(nyu1), mkexpr(nyu2)))); ++ assign(retval, ++ unop(Iop_32Uto64, ++ binop(Iop_Add32, ++ unop(Iop_PopCount32, mkexpr(src1)), ++ unop(Iop_PopCount32, mkexpr(src2))))); + return retval; + } + +@@ -5715,7 +5695,7 @@ static Bool dis_modulo_int ( UInt theInstr ) + rA_address, rS_address); + + assign( rS, getIReg( rS_address ) ); +- assign( result, unop( Iop_Ctz32, ++ assign( result, unop( Iop_CtzNat32, + unop( Iop_64to32, mkexpr( rS ) ) ) ); + assign( rA, binop( Iop_32HLto64, mkU32( 0 ), mkexpr( result ) ) ); + +@@ -5746,7 +5726,7 @@ static Bool dis_modulo_int ( UInt theInstr ) + rA_address, rS_address); + + assign( rS, getIReg( rS_address ) ); +- assign( rA, unop( Iop_Ctz64, mkexpr( rS ) ) ); ++ assign( rA, unop( Iop_CtzNat64, mkexpr( rS ) ) ); + + if ( flag_rC == 1 ) + set_CR0( mkexpr( rA ) ); +@@ -6307,7 +6287,6 @@ static Bool dis_int_logic ( UInt theInstr ) + IRTemp rS = newTemp(ty); + IRTemp rA = newTemp(ty); + IRTemp rB = newTemp(ty); +- IRExpr* irx; + Bool do_rc = False; + + assign( rS, getIReg(rS_addr) ); +@@ -6404,26 +6383,16 @@ static Bool dis_int_logic ( UInt theInstr ) + break; + + case 0x01A: { // cntlzw (Count Leading Zeros Word, PPC32 p371) +- IRExpr* lo32; + if (rB_addr!=0) { + vex_printf("dis_int_logic(ppc)(cntlzw,rB_addr)\n"); + return False; + } +- DIP("cntlzw%s r%u,r%u\n", +- flag_rC ? ".":"", rA_addr, rS_addr); ++ DIP("cntlzw%s r%u,r%u\n", flag_rC ? ".":"", rA_addr, rS_addr); + + // mode64: count in low word only +- lo32 = mode64 ? unop(Iop_64to32, mkexpr(rS)) : mkexpr(rS); +- +- // Iop_Clz32 undefined for arg==0, so deal with that case: +- irx = binop(Iop_CmpNE32, lo32, mkU32(0)); +- assign(rA, mkWidenFrom32(ty, +- IRExpr_ITE( irx, +- unop(Iop_Clz32, lo32), +- mkU32(32)), +- False)); +- +- // TODO: alternatively: assign(rA, verbose_Clz32(rS)); ++ IRExpr* lo32 = mode64 ? unop(Iop_64to32, mkexpr(rS)) : mkexpr(rS); ++ IRExpr* res32 = unop(Iop_ClzNat32, lo32); ++ assign(rA, mode64 ? 
unop(Iop_32Uto64, res32) : res32); + break; + } + +@@ -6521,14 +6490,8 @@ static Bool dis_int_logic ( UInt theInstr ) + vex_printf("dis_int_logic(ppc)(cntlzd,rB_addr)\n"); + return False; + } +- DIP("cntlzd%s r%u,r%u\n", +- flag_rC ? ".":"", rA_addr, rS_addr); +- // Iop_Clz64 undefined for arg==0, so deal with that case: +- irx = binop(Iop_CmpNE64, mkexpr(rS), mkU64(0)); +- assign(rA, IRExpr_ITE( irx, +- unop(Iop_Clz64, mkexpr(rS)), +- mkU64(64) )); +- // TODO: alternatively: assign(rA, verbose_Clz64(rS)); ++ DIP("cntlzd%s r%u,r%u\n", flag_rC ? ".":"", rA_addr, rS_addr); ++ assign(rA, unop(Iop_ClzNat64, mkexpr(rS))); + break; + + case 0x1FC: // cmpb (Power6: compare bytes) +@@ -6574,8 +6537,9 @@ static Bool dis_int_logic ( UInt theInstr ) + putFReg( rS_addr, mkexpr(frA)); + return True; + } +- case 0x1FA: // popcntd (population count doubleword ++ case 0x1FA: // popcntd (population count doubleword) + { ++ vassert(mode64); + DIP("popcntd r%u,r%u\n", rA_addr, rS_addr); + IRTemp result = gen_POPCOUNT(ty, rS, DWORD); + putIReg( rA_addr, mkexpr(result) ); +@@ -9154,18 +9118,7 @@ static Bool dis_int_shift ( UInt theInstr ) + static IRExpr* /* :: Ity_I32 */ gen_byterev32 ( IRTemp t ) + { + vassert(typeOfIRTemp(irsb->tyenv, t) == Ity_I32); +- return +- binop(Iop_Or32, +- binop(Iop_Shl32, mkexpr(t), mkU8(24)), +- binop(Iop_Or32, +- binop(Iop_And32, binop(Iop_Shl32, mkexpr(t), mkU8(8)), +- mkU32(0x00FF0000)), +- binop(Iop_Or32, +- binop(Iop_And32, binop(Iop_Shr32, mkexpr(t), mkU8(8)), +- mkU32(0x0000FF00)), +- binop(Iop_And32, binop(Iop_Shr32, mkexpr(t), mkU8(24)), +- mkU32(0x000000FF) ) +- ))); ++ return unop(Iop_Reverse8sIn32_x1, mkexpr(t)); + } + + /* Generates code to swap the byte order in the lower half of an Ity_I32, +@@ -9225,6 +9178,10 @@ static Bool dis_int_ldst_rev ( UInt theInstr ) + + case 0x214: // ldbrx (Load Doubleword Byte-Reverse Indexed) + { ++ // JRS FIXME: ++ // * is the host_endness conditional below actually necessary? ++ // * can we just do a 64-bit load followed by by Iop_Reverse8sIn64_x1? ++ // That would be a lot more efficient. + IRExpr * nextAddr; + IRTemp w3 = newTemp( Ity_I32 ); + IRTemp w4 = newTemp( Ity_I32 ); +@@ -17056,8 +17013,8 @@ dis_av_count_bitTranspose ( UInt theInstr, UInt opc2 ) + case 0x7C3: // vpopcntd + { + if (mode64) { +- /* Break vector into 64-bit double words and do the population count +- * on each double word. ++ /* Break vector into 64-bit double words and do the population ++ count on each double word. + */ + IRType ty = Ity_I64; + IRTemp bits0_63 = newTemp(Ity_I64); +@@ -17077,15 +17034,16 @@ dis_av_count_bitTranspose ( UInt theInstr, UInt opc2 ) + mkexpr( cnt_bits0_63 ) ) ); + } else { + /* Break vector into 32-bit words and do the population count +- * on each doubleword. ++ on each 32-bit word. 
+ */ + IRTemp bits0_31, bits32_63, bits64_95, bits96_127; + bits0_31 = bits32_63 = bits64_95 = bits96_127 = IRTemp_INVALID; +- IRTemp cnt_bits0_63 = newTemp(Ity_I64); ++ IRTemp cnt_bits0_63 = newTemp(Ity_I64); + IRTemp cnt_bits64_127 = newTemp(Ity_I64); + + DIP("vpopcntd v%d,v%d\n", vRT_addr, vRB_addr); +- breakV128to4x32(mkexpr( vB), &bits96_127, &bits64_95, &bits32_63, &bits0_31 ); ++ breakV128to4x32(mkexpr( vB), &bits96_127, &bits64_95, ++ &bits32_63, &bits0_31 ); + + cnt_bits0_63 = gen_vpopcntd_mode32(bits0_31, bits32_63); + cnt_bits64_127 = gen_vpopcntd_mode32(bits64_95, bits96_127); +@@ -29103,10 +29061,12 @@ DisResult disInstr_PPC_WRK ( + + /* Miscellaneous ISA 2.06 instructions */ + case 0x1FA: // popcntd ++ if (!mode64) goto decode_failure; ++ /* else fallthru */ + case 0x17A: // popcntw + case 0x7A: // popcntb +- if (dis_int_logic( theInstr )) goto decode_success; +- goto decode_failure; ++ if (dis_int_logic( theInstr )) goto decode_success; ++ goto decode_failure; + + case 0x0FC: // bpermd + if (!mode64) goto decode_failure; +@@ -29669,94 +29629,6 @@ DisResult disInstr_PPC ( IRSB* irsb_IN, + return dres; + } + +- +-/*------------------------------------------------------------*/ +-/*--- Unused stuff ---*/ +-/*------------------------------------------------------------*/ +- +-///* A potentially more memcheck-friendly implementation of Clz32, with +-// the boundary case Clz32(0) = 32, which is what ppc requires. */ +-// +-//static IRExpr* /* :: Ity_I32 */ verbose_Clz32 ( IRTemp arg ) +-//{ +-// /* Welcome ... to SSA R Us. */ +-// IRTemp n1 = newTemp(Ity_I32); +-// IRTemp n2 = newTemp(Ity_I32); +-// IRTemp n3 = newTemp(Ity_I32); +-// IRTemp n4 = newTemp(Ity_I32); +-// IRTemp n5 = newTemp(Ity_I32); +-// IRTemp n6 = newTemp(Ity_I32); +-// IRTemp n7 = newTemp(Ity_I32); +-// IRTemp n8 = newTemp(Ity_I32); +-// IRTemp n9 = newTemp(Ity_I32); +-// IRTemp n10 = newTemp(Ity_I32); +-// IRTemp n11 = newTemp(Ity_I32); +-// IRTemp n12 = newTemp(Ity_I32); +-// +-// /* First, propagate the most significant 1-bit into all lower +-// positions in the word. */ +-// /* unsigned int clz ( unsigned int n ) +-// { +-// n |= (n >> 1); +-// n |= (n >> 2); +-// n |= (n >> 4); +-// n |= (n >> 8); +-// n |= (n >> 16); +-// return bitcount(~n); +-// } +-// */ +-// assign(n1, mkexpr(arg)); +-// assign(n2, binop(Iop_Or32, mkexpr(n1), binop(Iop_Shr32, mkexpr(n1), mkU8(1)))); +-// assign(n3, binop(Iop_Or32, mkexpr(n2), binop(Iop_Shr32, mkexpr(n2), mkU8(2)))); +-// assign(n4, binop(Iop_Or32, mkexpr(n3), binop(Iop_Shr32, mkexpr(n3), mkU8(4)))); +-// assign(n5, binop(Iop_Or32, mkexpr(n4), binop(Iop_Shr32, mkexpr(n4), mkU8(8)))); +-// assign(n6, binop(Iop_Or32, mkexpr(n5), binop(Iop_Shr32, mkexpr(n5), mkU8(16)))); +-// /* This gives a word of the form 0---01---1. Now invert it, giving +-// a word of the form 1---10---0, then do a population-count idiom +-// (to count the 1s, which is the number of leading zeroes, or 32 +-// if the original word was 0. 
*/ +-// assign(n7, unop(Iop_Not32, mkexpr(n6))); +-// +-// /* unsigned int bitcount ( unsigned int n ) +-// { +-// n = n - ((n >> 1) & 0x55555555); +-// n = (n & 0x33333333) + ((n >> 2) & 0x33333333); +-// n = (n + (n >> 4)) & 0x0F0F0F0F; +-// n = n + (n >> 8); +-// n = (n + (n >> 16)) & 0x3F; +-// return n; +-// } +-// */ +-// assign(n8, +-// binop(Iop_Sub32, +-// mkexpr(n7), +-// binop(Iop_And32, +-// binop(Iop_Shr32, mkexpr(n7), mkU8(1)), +-// mkU32(0x55555555)))); +-// assign(n9, +-// binop(Iop_Add32, +-// binop(Iop_And32, mkexpr(n8), mkU32(0x33333333)), +-// binop(Iop_And32, +-// binop(Iop_Shr32, mkexpr(n8), mkU8(2)), +-// mkU32(0x33333333)))); +-// assign(n10, +-// binop(Iop_And32, +-// binop(Iop_Add32, +-// mkexpr(n9), +-// binop(Iop_Shr32, mkexpr(n9), mkU8(4))), +-// mkU32(0x0F0F0F0F))); +-// assign(n11, +-// binop(Iop_Add32, +-// mkexpr(n10), +-// binop(Iop_Shr32, mkexpr(n10), mkU8(8)))); +-// assign(n12, +-// binop(Iop_Add32, +-// mkexpr(n11), +-// binop(Iop_Shr32, mkexpr(n11), mkU8(16)))); +-// return +-// binop(Iop_And32, mkexpr(n12), mkU32(0x3F)); +-//} +- + /*--------------------------------------------------------------------*/ + /*--- end guest_ppc_toIR.c ---*/ + /*--------------------------------------------------------------------*/ diff --git a/valgrind-3.14.0-ppc-instr-new-IROps.patch b/valgrind-3.14.0-ppc-instr-new-IROps.patch new file mode 100644 index 0000000..4332736 --- /dev/null +++ b/valgrind-3.14.0-ppc-instr-new-IROps.patch @@ -0,0 +1,257 @@ +commit 97d336b79e36f6c99d8b07f49ebc9b780e6df84e +Author: Julian Seward +Date: Tue Nov 20 11:07:37 2018 +0100 + + Add ppc host-side isel and instruction support for IROps added in previous commit. + + VEX/priv/host_ppc_defs.c, VEX/priv/host_ppc_defs.h: + + Dont emit cnttz{w,d}. We may need them on a target which doesn't support + them. Instead we can generate a fairly reasonable alternative sequence with + cntlz{w,d} instead. + + Add support for emitting popcnt{w,d}. + + VEX/priv/host_ppc_isel.c + + Add support for: Iop_ClzNat32 Iop_ClzNat64 + + Redo support for: Iop_Ctz{32,64} and their Nat equivalents, so as to not use + cnttz{w,d}, as mentioned above. 
+ + Add support for: Iop_PopCount64 Iop_PopCount32 Iop_Reverse8sIn32_x1 + +diff --git a/VEX/priv/host_ppc_defs.c b/VEX/priv/host_ppc_defs.c +index b073c1d..f4b52e4 100644 +--- a/VEX/priv/host_ppc_defs.c ++++ b/VEX/priv/host_ppc_defs.c +@@ -501,9 +501,9 @@ const HChar* showPPCUnaryOp ( PPCUnaryOp op ) { + case Pun_NEG: return "neg"; + case Pun_CLZ32: return "cntlzw"; + case Pun_CLZ64: return "cntlzd"; +- case Pun_CTZ32: return "cnttzw"; +- case Pun_CTZ64: return "cnttzd"; + case Pun_EXTSW: return "extsw"; ++ case Pun_POP32: return "popcntw"; ++ case Pun_POP64: return "popcntd"; + default: vpanic("showPPCUnaryOp"); + } + } +@@ -4265,20 +4265,19 @@ Int emit_PPCInstr ( /*MB_MOD*/Bool* is_profInc, + vassert(mode64); + p = mkFormX(p, 31, r_src, r_dst, 0, 58, 0, endness_host); + break; +- case Pun_CTZ32: // cnttzw r_dst, r_src +- /* Note oder of src and dst is backwards from normal */ +- p = mkFormX(p, 31, r_src, r_dst, 0, 538, 0, endness_host); +- break; +- case Pun_CTZ64: // cnttzd r_dst, r_src +- /* Note oder of src and dst is backwards from normal */ +- vassert(mode64); +- p = mkFormX(p, 31, r_src, r_dst, 0, 570, 0, endness_host); +- break; + case Pun_EXTSW: // extsw r_dst, r_src + vassert(mode64); + p = mkFormX(p, 31, r_src, r_dst, 0, 986, 0, endness_host); + break; +- default: goto bad; ++ case Pun_POP32: // popcntw r_dst, r_src ++ p = mkFormX(p, 31, r_src, r_dst, 0, 378, 0, endness_host); ++ break; ++ case Pun_POP64: // popcntd r_dst, r_src ++ vassert(mode64); ++ p = mkFormX(p, 31, r_src, r_dst, 0, 506, 0, endness_host); ++ break; ++ default: ++ goto bad; + } + goto done; + } +diff --git a/VEX/priv/host_ppc_defs.h b/VEX/priv/host_ppc_defs.h +index 17baff5..321fba9 100644 +--- a/VEX/priv/host_ppc_defs.h ++++ b/VEX/priv/host_ppc_defs.h +@@ -291,9 +291,9 @@ typedef + Pun_NOT, + Pun_CLZ32, + Pun_CLZ64, +- Pun_CTZ32, +- Pun_CTZ64, +- Pun_EXTSW ++ Pun_EXTSW, ++ Pun_POP32, // popcntw ++ Pun_POP64 // popcntd + } + PPCUnaryOp; + +diff --git a/VEX/priv/host_ppc_isel.c b/VEX/priv/host_ppc_isel.c +index 6bdb5f7..5242176 100644 +--- a/VEX/priv/host_ppc_isel.c ++++ b/VEX/priv/host_ppc_isel.c +@@ -2065,12 +2065,15 @@ static HReg iselWordExpr_R_wrk ( ISelEnv* env, const IRExpr* e, + return r_dst; + } + break; +- case Iop_Clz32: +- case Iop_Clz64: { ++ ++ case Iop_Clz32: case Iop_ClzNat32: ++ case Iop_Clz64: case Iop_ClzNat64: { ++ // cntlz is available even in the most basic (earliest) ppc ++ // variants, so it's safe to generate it unconditionally. + HReg r_src, r_dst; +- PPCUnaryOp op_clz = (op_unop == Iop_Clz32) ? Pun_CLZ32 : +- Pun_CLZ64; +- if (op_unop == Iop_Clz64 && !mode64) ++ PPCUnaryOp op_clz = (op_unop == Iop_Clz32 || op_unop == Iop_ClzNat32) ++ ? Pun_CLZ32 : Pun_CLZ64; ++ if ((op_unop == Iop_Clz64 || op_unop == Iop_ClzNat64) && !mode64) + goto irreducible; + /* Count leading zeroes. */ + r_dst = newVRegI(env); +@@ -2079,18 +2082,133 @@ static HReg iselWordExpr_R_wrk ( ISelEnv* env, const IRExpr* e, + return r_dst; + } + +- case Iop_Ctz32: +- case Iop_Ctz64: { +- HReg r_src, r_dst; +- PPCUnaryOp op_clz = (op_unop == Iop_Ctz32) ? Pun_CTZ32 : +- Pun_CTZ64; +- if (op_unop == Iop_Ctz64 && !mode64) +- goto irreducible; +- /* Count trailing zeroes. */ +- r_dst = newVRegI(env); +- r_src = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess); +- addInstr(env, PPCInstr_Unary(op_clz,r_dst,r_src)); +- return r_dst; ++ //case Iop_Ctz32: ++ case Iop_CtzNat32: ++ //case Iop_Ctz64: ++ case Iop_CtzNat64: ++ { ++ // Generate code using Clz, because we can't assume the host has ++ // Ctz. 
In particular, part of the fix for bug 386945 involves ++ // creating a Ctz in ir_opt.c from smaller fragments. ++ PPCUnaryOp op_clz = Pun_CLZ64; ++ Int WS = 64; ++ if (op_unop == Iop_Ctz32 || op_unop == Iop_CtzNat32) { ++ op_clz = Pun_CLZ32; ++ WS = 32; ++ } ++ /* Compute ctz(arg) = wordsize - clz(~arg & (arg - 1)), thusly: ++ t1 = arg - 1 ++ t2 = not arg ++ t2 = t2 & t1 ++ t2 = clz t2 ++ t1 = WS ++ t2 = t1 - t2 ++ // result in t2 ++ */ ++ HReg arg = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess); ++ HReg t1 = newVRegI(env); ++ HReg t2 = newVRegI(env); ++ addInstr(env, PPCInstr_Alu(Palu_SUB, t1, arg, PPCRH_Imm(True, 1))); ++ addInstr(env, PPCInstr_Unary(Pun_NOT, t2, arg)); ++ addInstr(env, PPCInstr_Alu(Palu_AND, t2, t2, PPCRH_Reg(t1))); ++ addInstr(env, PPCInstr_Unary(op_clz, t2, t2)); ++ addInstr(env, PPCInstr_LI(t1, WS, False/*!64-bit imm*/)); ++ addInstr(env, PPCInstr_Alu(Palu_SUB, t2, t1, PPCRH_Reg(t2))); ++ return t2; ++ } ++ ++ case Iop_PopCount64: { ++ // popcnt{x,d} is only available in later arch revs (ISA 3.0, ++ // maybe) so it's not really correct to emit it here without a caps ++ // check for the host. ++ if (mode64) { ++ HReg r_dst = newVRegI(env); ++ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess); ++ addInstr(env, PPCInstr_Unary(Pun_POP64, r_dst, r_src)); ++ return r_dst; ++ } ++ // We don't expect to be required to handle this in 32-bit mode. ++ break; ++ } ++ ++ case Iop_PopCount32: { ++ // Similar comment as for Ctz just above applies -- we really ++ // should have a caps check here. ++ ++ HReg r_dst = newVRegI(env); ++ // This actually generates popcntw, which in 64 bit mode does a ++ // 32-bit count individually for both low and high halves of the ++ // word. Per the comment at the top of iselIntExpr_R, in the 64 ++ // bit mode case, the user of this result is required to ignore ++ // the upper 32 bits of the result. In 32 bit mode this is all ++ // moot. It is however unclear from the PowerISA 3.0 docs that ++ // the instruction exists in 32 bit mode; however our own front ++ // end (guest_ppc_toIR.c) accepts it, so I guess it does exist. ++ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess); ++ addInstr(env, PPCInstr_Unary(Pun_POP32, r_dst, r_src)); ++ return r_dst; ++ } ++ ++ case Iop_Reverse8sIn32_x1: { ++ // A bit of a mouthful, but simply .. 32-bit byte swap. ++ // This is pretty rubbish code. We could do vastly better if ++ // rotates, and better, rotate-inserts, were allowed. Note that ++ // even on a 64 bit target, the right shifts must be done as 32-bit ++ // so as to introduce zero bits in the right places. So it seems ++ // simplest to do the whole sequence in 32-bit insns. ++ /* ++ r = // working temporary, initial byte order ABCD ++ Mask = 00FF00FF ++ nMask = not Mask ++ tHi = and r, Mask ++ tHi = shl tHi, 8 ++ tLo = and r, nMask ++ tLo = shr tLo, 8 ++ r = or tHi, tLo // now r has order BADC ++ and repeat for 16 bit chunks .. 
++ Mask = 0000FFFF ++ nMask = not Mask ++ tHi = and r, Mask ++ tHi = shl tHi, 16 ++ tLo = and r, nMask ++ tLo = shr tLo, 16 ++ r = or tHi, tLo // now r has order DCBA ++ */ ++ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg, IEndianess); ++ HReg rr = newVRegI(env); ++ HReg rMask = newVRegI(env); ++ HReg rnMask = newVRegI(env); ++ HReg rtHi = newVRegI(env); ++ HReg rtLo = newVRegI(env); ++ // Copy r_src since we need to modify it ++ addInstr(env, mk_iMOVds_RR(rr, r_src)); ++ // Swap within 16-bit lanes ++ addInstr(env, PPCInstr_LI(rMask, 0x00FF00FFULL, ++ False/* !64bit imm*/)); ++ addInstr(env, PPCInstr_Unary(Pun_NOT, rnMask, rMask)); ++ addInstr(env, PPCInstr_Alu(Palu_AND, rtHi, rr, PPCRH_Reg(rMask))); ++ addInstr(env, PPCInstr_Shft(Pshft_SHL, True/*32 bit shift*/, ++ rtHi, rtHi, ++ PPCRH_Imm(False/*!signed imm*/, 8))); ++ addInstr(env, PPCInstr_Alu(Palu_AND, rtLo, rr, PPCRH_Reg(rnMask))); ++ addInstr(env, PPCInstr_Shft(Pshft_SHR, True/*32 bit shift*/, ++ rtLo, rtLo, ++ PPCRH_Imm(False/*!signed imm*/, 8))); ++ addInstr(env, PPCInstr_Alu(Palu_OR, rr, rtHi, PPCRH_Reg(rtLo))); ++ // And now swap the two 16-bit chunks ++ addInstr(env, PPCInstr_LI(rMask, 0x0000FFFFULL, ++ False/* !64bit imm*/)); ++ addInstr(env, PPCInstr_Unary(Pun_NOT, rnMask, rMask)); ++ addInstr(env, PPCInstr_Alu(Palu_AND, rtHi, rr, PPCRH_Reg(rMask))); ++ addInstr(env, PPCInstr_Shft(Pshft_SHL, True/*32 bit shift*/, ++ rtHi, rtHi, ++ PPCRH_Imm(False/*!signed imm*/, 16))); ++ addInstr(env, PPCInstr_Alu(Palu_AND, rtLo, rr, PPCRH_Reg(rnMask))); ++ addInstr(env, PPCInstr_Shft(Pshft_SHR, True/*32 bit shift*/, ++ rtLo, rtLo, ++ PPCRH_Imm(False/*!signed imm*/, 16))); ++ addInstr(env, PPCInstr_Alu(Palu_OR, rr, rtHi, PPCRH_Reg(rtLo))); ++ return rr; + } + + case Iop_Left8: diff --git a/valgrind-3.14.0-transform-popcount64-ctznat64.patch b/valgrind-3.14.0-transform-popcount64-ctznat64.patch new file mode 100644 index 0000000..c8b2ac1 --- /dev/null +++ b/valgrind-3.14.0-transform-popcount64-ctznat64.patch @@ -0,0 +1,82 @@ +commit cb5d7e047598bff6d0f1d707a70d9fb1a1c7f0e2 +Author: Julian Seward +Date: Tue Nov 20 11:46:55 2018 +0100 + + VEX/priv/ir_opt.c + + fold_Expr: transform PopCount64(And64(Add64(x,-1),Not64(x))) into CtzNat64(x). + + This is part of the fix for bug 386945. 
+ +diff --git a/VEX/priv/ir_opt.c b/VEX/priv/ir_opt.c +index f40870b..23964be 100644 +--- a/VEX/priv/ir_opt.c ++++ b/VEX/priv/ir_opt.c +@@ -1377,6 +1377,8 @@ static IRExpr* fold_Expr ( IRExpr** env, IRExpr* e ) + case Iex_Unop: + /* UNARY ops */ + if (e->Iex.Unop.arg->tag == Iex_Const) { ++ ++ /* cases where the arg is a const */ + switch (e->Iex.Unop.op) { + case Iop_1Uto8: + e2 = IRExpr_Const(IRConst_U8(toUChar( +@@ -1690,8 +1692,56 @@ static IRExpr* fold_Expr ( IRExpr** env, IRExpr* e ) + + default: + goto unhandled; +- } +- } ++ } // switch (e->Iex.Unop.op) ++ ++ } else { ++ ++ /* other cases (identities, etc) */ ++ switch (e->Iex.Unop.op) { ++ case Iop_PopCount64: { ++ // PopCount64( And64( Add64(x,-1), Not64(x) ) ) ==> CtzNat64(x) ++ // bindings: ++ // a1:And64( a11:Add64(a111:x,a112:-1), a12:Not64(a121:x) ) ++ IRExpr* a1 = chase(env, e->Iex.Unop.arg); ++ if (!a1) ++ goto nomatch; ++ if (a1->tag != Iex_Binop || a1->Iex.Binop.op != Iop_And64) ++ goto nomatch; ++ // a1 is established ++ IRExpr* a11 = chase(env, a1->Iex.Binop.arg1); ++ if (!a11) ++ goto nomatch; ++ if (a11->tag != Iex_Binop || a11->Iex.Binop.op != Iop_Add64) ++ goto nomatch; ++ // a11 is established ++ IRExpr* a12 = chase(env, a1->Iex.Binop.arg2); ++ if (!a12) ++ goto nomatch; ++ if (a12->tag != Iex_Unop || a12->Iex.Unop.op != Iop_Not64) ++ goto nomatch; ++ // a12 is established ++ IRExpr* a111 = a11->Iex.Binop.arg1; ++ IRExpr* a112 = chase(env, a11->Iex.Binop.arg2); ++ IRExpr* a121 = a12->Iex.Unop.arg; ++ if (!a111 || !a112 || !a121) ++ goto nomatch; ++ // a111 and a121 need to be the same temp. ++ if (!eqIRAtom(a111, a121)) ++ goto nomatch; ++ // Finally, a112 must be a 64-bit version of -1. ++ if (!isOnesU(a112)) ++ goto nomatch; ++ // Match established. Transform. ++ e2 = IRExpr_Unop(Iop_CtzNat64, a111); ++ break; ++ nomatch: ++ break; ++ } ++ default: ++ break; ++ } // switch (e->Iex.Unop.op) ++ ++ } // if (e->Iex.Unop.arg->tag == Iex_Const) + break; + + case Iex_Binop: diff --git a/valgrind.spec b/valgrind.spec index 30013ba..0d35170 100644 --- a/valgrind.spec +++ b/valgrind.spec @@ -3,7 +3,7 @@ Summary: Tool for finding memory management bugs in programs Name: %{?scl_prefix}valgrind Version: 3.14.0 -Release: 3%{?dist} +Release: 4%{?dist} Epoch: 1 License: GPLv2+ URL: http://www.valgrind.org/ @@ -119,6 +119,15 @@ Patch8: valgrind-3.14.0-s390x-vec-float-point-tests.patch # KDE#401277 More bugs in z13 support Patch9: valgrind-3.14.0-s390z-more-z13-fixes.patch +# KDE#386945 Bogus memcheck errors on ppc64(le) when using strcmp +Patch10: valgrind-3.14.0-get_otrack_shadow_offset_wrk-ppc.patch +Patch11: valgrind-3.14.0-new-strlen-IROps.patch +Patch12: valgrind-3.14.0-ppc-instr-new-IROps.patch +Patch13: valgrind-3.14.0-memcheck-new-IROps.patch +Patch14: valgrind-3.14.0-ppc-frontend-new-IROps.patch +Patch15: valgrind-3.14.0-transform-popcount64-ctznat64.patch +Patch16: valgrind-3.14.0-enable-ppc-Iop_Sar_Shr8.patch + %if %{build_multilib} # Ensure glibc{,-devel} is installed for both multilib arches BuildRequires: /lib/libc.so.6 /usr/lib/libc.so /lib64/libc.so.6 /usr/lib64/libc.so @@ -260,6 +269,13 @@ Valgrind User Manual for details. 
%patch7 -p1 %patch8 -p1 %patch9 -p1 +%patch10 -p1 +%patch11 -p1 +%patch12 -p1 +%patch13 -p1 +%patch14 -p1 +%patch15 -p1 +%patch16 -p1 %build CC=gcc @@ -494,6 +510,15 @@ fi %endif %changelog +* Fri Nov 23 2018 Mark Wielaard - 3.14.0-4 +- Add valgrind-3.14.0-get_otrack_shadow_offset_wrk-ppc.patch, + valgrind-3.14.0-new-strlen-IROps.patch, + valgrind-3.14.0-ppc-instr-new-IROps.patch, + valgrind-3.14.0-memcheck-new-IROps.patch, + valgrind-3.14.0-ppc-frontend-new-IROps.patch, + valgrind-3.14.0-transform-popcount64-ctznat64.patch and + valgrind-3.14.0-enable-ppc-Iop_Sar_Shr8.patch (#1652926) + * Wed Nov 21 2018 Mark Wielaard - 3.14.0-3 - Add valgrind-3.14.0-s390z-more-z13-fixes.patch.
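
A minimal, standalone C sketch of the bit-tricks the ppc isel, ir_opt and Memcheck changes above rely on: ctz(x) = wordsize - clz(~x & (x - 1)) as generated in host_ppc_isel.c, the equivalent popcount form that ir_opt.c now folds into CtzNat64, and the "smear right" mask used by the new clz instrumentation in mc_translate.c. It is an illustration only, not code taken from the patches; all helper names in it (clz_nat32, ctz_via_clz32, right32, and so on) are hypothetical.

/* Illustrative sketch only -- not part of the patches above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* clz with the "natural" semantics of Iop_ClzNat32: clz_nat32(0) == 32. */
static unsigned clz_nat32(uint32_t x)
{
    unsigned n = 0;
    for (uint32_t bit = 0x80000000u; bit != 0 && !(x & bit); bit >>= 1)
        n++;
    return n;
}

/* popcount, standing in for Iop_PopCount32. */
static unsigned popcount32(uint32_t x)
{
    unsigned n = 0;
    for (; x != 0; x &= x - 1)   /* clear the lowest set bit each iteration */
        n++;
    return n;
}

/* ctz expressed via clz, as in the new host_ppc_isel.c code for
   Iop_Ctz32/Iop_CtzNat32: ctz(x) = wordsize - clz(~x & (x - 1)).
   For x == 0 this yields 32, matching the CtzNat semantics. */
static unsigned ctz_via_clz32(uint32_t x)
{
    return 32 - clz_nat32(~x & (x - 1));
}

/* The pattern ir_opt.c now folds:
   PopCount(And(Add(x,-1), Not(x))) == CtzNat(x). */
static unsigned ctz_via_popcount32(uint32_t x)
{
    return popcount32((x - 1) & ~x);
}

/* The "Right-family" smear (mkRight32 in mc_translate.c): propagate the
   leftmost 1 bit of x into every position to its right. */
static uint32_t right32(uint32_t x)
{
    for (unsigned i = 1; i <= 16; i *= 2)
        x |= x >> i;
    return x;
}

int main(void)
{
    uint32_t tests[] = { 0u, 1u, 2u, 40u, 0x00F0F000u, 0x80000000u, 0xFFFFFFFFu };
    for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++) {
        uint32_t x = tests[i];
        /* Both ctz formulations agree, including the x == 0 boundary case. */
        assert(ctz_via_clz32(x) == ctz_via_popcount32(x));
        if (x != 0) {
            /* The clz "improver" mask from expensiveCountLeadingZeroes:
               ~(right32(x) >> 1) has exactly clz(x)+1 leading one bits,
               i.e. it keeps only the input bits that can affect clz(x). */
            assert(popcount32(~(right32(x) >> 1)) == clz_nat32(x) + 1);
        }
        printf("x=%#010x  ctz=%2u  clz=%2u\n",
               x, ctz_via_clz32(x), clz_nat32(x));
    }
    return 0;
}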