4db4a6
commit 530df882b8f60ecacaf2b9b8a719f7ea1c1d1650
4db4a6
Author: Julian Seward <jseward@acm.org>
4db4a6
Date:   Fri Nov 12 12:13:45 2021 +0100
4db4a6
4db4a6
    Bug 444399 - disInstr(arm64): unhandled instruction 0xC87F2D89 (LD{,A}XP and ST{,L}XP).
4db4a6
    
4db4a6
    This is unfortunately a big and complex patch, to implement LD{,A}XP and
4db4a6
    ST{,L}XP.  These were omitted from the original AArch64 v8.0 implementation
4db4a6
    for unknown reasons.
4db4a6
    
4db4a6
    (Background) the patch is made significantly more complex because for AArch64
4db4a6
    we actually have two implementations of the underlying
4db4a6
    Load-Linked/Store-Conditional (LL/SC) machinery: a "primary" implementation,
4db4a6
    which translates LL/SC more or less directly into IR and re-emits them at the
4db4a6
    back end, and a "fallback" implementation that implements LL/SC "manually", by
4db4a6
    taking advantage of the fact that V serialises thread execution, so we can
4db4a6
    "implement" LL/SC by simulating a reservation using fields LLSC_* in the guest
4db4a6
    state, and invalidating the reservation at every thread switch.
4db4a6
    
4db4a6
    (Background) the fallback scheme is needed because the primary scheme is in
4db4a6
    violation of the ARMv8 semantics in that it can (easily) introduce extra
4db4a6
    memory references between the LL and SC, hence on some hardware causing the
4db4a6
    reservation to always fail and so the simulated program to wind up looping
4db4a6
    forever.
4db4a6
    
4db4a6
    For these instructions, big picture:
4db4a6
    
4db4a6
    * for the primary implementation, we take advantage of the fact that
4db4a6
      IRStmt_LLSC allows I128-typed transactions to be represented.  Hence we bundle
4db4a6
      up the two 64-bit data elements into an I128 (or vice versa) and present a
4db4a6
      single I128-typed IRStmt_LLSC in the IR.  In the backend, those are
4db4a6
      re-emitted as LDXP/STXP respectively.  For LL/SC on 32-bit register pairs,
4db4a6
      that bundling produces a single 64-bit item, and so the existing LL/SC
4db4a6
      backend machinery handles it.  The effect is that a doubleword 32-bit LL/SC
4db4a6
      in the front end translates into a single 64-bit LL/SC in the back end.
4db4a6
      Overall, though, the implementation is straightforward.
4db4a6
    
4db4a6
    * for the fallback implementation, it is necessary to extend the guest state
4db4a6
      field `guest_LLSC_DATA` to represent a 128-bit transaction, by splitting it
4db4a6
      into _DATA_LO64 and _DATA_HI64.  Then, the implementation is an exact
4db4a6
      analogue of the fallback implementation for single-word LL/SC.  It takes
4db4a6
      advantage of the fact that the backend already supports 128-bit CAS, as
4db4a6
      fixed in bug 445354.  As with the primary implementation, doubleword 32-bit
4db4a6
      LL/SC is bundled into a single 64-bit transaction.
4db4a6
    
4db4a6
    Detailed changes:
4db4a6
    
4db4a6
    * new arm64 guest state fields LLSC_DATA_LO64/LLSC_DATA_HI64 to replace
4db4a6
      guest_LLSC_DATA
4db4a6
    
4db4a6
    * (ridealong fix) arm64 front end: a fix to a minor and harmless decoding bug
4db4a6
      for the single-word LDX/STX case.
4db4a6
    
4db4a6
    * arm64 front end: IR generation for LD{,A}XP/ST{,L}XP: tedious and
4db4a6
      longwinded, but per comments above, an exact(ish) analogue of the single-word
4db4a6
      case
4db4a6
    
4db4a6
    * arm64 backend: new insns ARM64Instr_LdrEXP / ARM64Instr_StrEXP to wrap up 2
4db4a6
      x 64 exclusive loads/stores.  Per comments above, there's no need to handle
4db4a6
      the 2 x 32 case.
4db4a6
    
4db4a6
    * arm64 isel: translate I128-typed IRStmt_LLSC into the above two insns
4db4a6
    
4db4a6
    * arm64 isel: some auxiliary bits and pieces needed to handle I128 values;
4db4a6
      this is standard doubleword isel stuff
4db4a6
    
4db4a6
    * arm64 isel: (ridealong fix): Ist_CAS: check for endianness of the CAS!
4db4a6
    
4db4a6
    * arm64 isel: (ridealong) a couple of formatting fixes
4db4a6
    
4db4a6
    * IR infrastructure: add support for I128 constants, done the same as V128
4db4a6
      constants
4db4a6
    
4db4a6
    * memcheck: handle shadow loads and stores for I128 values
4db4a6
    
4db4a6
    * testcase: memcheck/tests/atomic_incs.c: on arm64, also test 128-bit atomic
4db4a6
      addition, to check we really have atomicity right
4db4a6
    
4db4a6
    * testcase: new test none/tests/arm64/ldxp_stxp.c, tests operation but not
4db4a6
      atomicity.  (Smoke test).
4db4a6
4db4a6
diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
4db4a6
index 12a1c5978..ee018c6a9 100644
4db4a6
--- a/VEX/priv/guest_arm64_toIR.c
4db4a6
+++ b/VEX/priv/guest_arm64_toIR.c
4db4a6
@@ -1184,9 +1184,10 @@ static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
4db4a6
 #define OFFB_CMSTART  offsetof(VexGuestARM64State,guest_CMSTART)
4db4a6
 #define OFFB_CMLEN    offsetof(VexGuestARM64State,guest_CMLEN)
4db4a6
 
4db4a6
-#define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE)
4db4a6
-#define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR)
4db4a6
-#define OFFB_LLSC_DATA offsetof(VexGuestARM64State,guest_LLSC_DATA)
4db4a6
+#define OFFB_LLSC_SIZE      offsetof(VexGuestARM64State,guest_LLSC_SIZE)
4db4a6
+#define OFFB_LLSC_ADDR      offsetof(VexGuestARM64State,guest_LLSC_ADDR)
4db4a6
+#define OFFB_LLSC_DATA_LO64 offsetof(VexGuestARM64State,guest_LLSC_DATA_LO64)
4db4a6
+#define OFFB_LLSC_DATA_HI64 offsetof(VexGuestARM64State,guest_LLSC_DATA_HI64)
4db4a6
 
4db4a6
 
4db4a6
 /* ---------------- Integer registers ---------------- */
4db4a6
@@ -6652,7 +6653,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
4db4a6
         (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while()
4db4a6
          has to do this bit)
4db4a6
    */   
4db4a6
-   if (INSN(29,23) == BITS7(0,0,1,0,0,0,0)
4db4a6
+   if (INSN(29,24) == BITS6(0,0,1,0,0,0)
4db4a6
        && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
4db4a6
        && INSN(14,10) == BITS5(1,1,1,1,1)) {
4db4a6
       UInt szBlg2     = INSN(31,30);
4db4a6
@@ -6678,7 +6679,8 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
4db4a6
             // if it faults.
4db4a6
             IRTemp loaded_data64 = newTemp(Ity_I64);
4db4a6
             assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea))));
4db4a6
-            stmt( IRStmt_Put( OFFB_LLSC_DATA, mkexpr(loaded_data64) ));
4db4a6
+            stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, mkexpr(loaded_data64) ));
4db4a6
+            stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, mkU64(0) ));
4db4a6
             stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
4db4a6
             stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) ));
4db4a6
             putIReg64orZR(tt, mkexpr(loaded_data64));
4db4a6
@@ -6729,7 +6731,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
4db4a6
             ));
4db4a6
             // Fail if the data doesn't match the LL data
4db4a6
             IRTemp llsc_data64 = newTemp(Ity_I64);
4db4a6
-            assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA, Ity_I64));
4db4a6
+            assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64));
4db4a6
             stmt( IRStmt_Exit(
4db4a6
                       binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))),
4db4a6
                                          mkexpr(llsc_data64)),
4db4a6
@@ -6771,6 +6773,257 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
4db4a6
       /* else fall through */
4db4a6
    }
4db4a6
 
4db4a6
+   /* -------------------- LD{,A}XP -------------------- */
4db4a6
+   /* -------------------- ST{,L}XP -------------------- */
4db4a6
+   /* 31 30 29     23  20    15 14  9  4
4db4a6
+       1 sz 001000 011 11111 0  t2  n  t1   LDXP  Rt1, Rt2, [Xn|SP]
4db4a6
+       1 sz 001000 011 11111 1  t2  n  t1   LDAXP Rt1, Rt2, [Xn|SP]
4db4a6
+       1 sz 001000 001 s     0  t2  n  t1   STXP  Ws, Rt1, Rt2, [Xn|SP]
4db4a6
+       1 sz 001000 001 s     1  t2  n  t1   STLXP Ws, Rt1, Rt2, [Xn|SP]
4db4a6
+   */
4db4a6
+   /* See just above, "LD{,A}X{R,RH,RB} / ST{,L}X{R,RH,RB}", for detailed
4db4a6
+      comments about this implementation.  Note the 'sz' field here is only 1
4db4a6
+      bit; above, it is 2 bits, and has a different encoding.
4db4a6
+   */
4db4a6
+   if (INSN(31,31) == 1
4db4a6
+       && INSN(29,24) == BITS6(0,0,1,0,0,0)
4db4a6
+       && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,1)) {
4db4a6
+      Bool elemIs64   = INSN(30,30) == 1;
4db4a6
+      Bool isLD       = INSN(22,22) == 1;
4db4a6
+      Bool isAcqOrRel = INSN(15,15) == 1;
4db4a6
+      UInt ss         = INSN(20,16);
4db4a6
+      UInt tt2        = INSN(14,10);
4db4a6
+      UInt nn         = INSN(9,5);
4db4a6
+      UInt tt1        = INSN(4,0);
4db4a6
+
4db4a6
+      UInt   elemSzB = elemIs64 ? 8 : 4;
4db4a6
+      UInt   fullSzB = 2 * elemSzB;
4db4a6
+      IRType elemTy  = integerIRTypeOfSize(elemSzB);
4db4a6
+      IRType fullTy  = integerIRTypeOfSize(fullSzB);
4db4a6
+
4db4a6
+      IRTemp ea = newTemp(Ity_I64);
4db4a6
+      assign(ea, getIReg64orSP(nn));
4db4a6
+      /* FIXME generate check that ea is 2*elemSzB-aligned */
4db4a6
+
4db4a6
+      if (isLD && ss == BITS5(1,1,1,1,1)) {
4db4a6
+         if (abiinfo->guest__use_fallback_LLSC) {
4db4a6
+            // Fallback implementation of LL.
4db4a6
+            // Do the load first so we don't update any guest state if it
4db4a6
+            // faults.  Assumes little-endian guest.
4db4a6
+            if (fullTy == Ity_I64) {
4db4a6
+               vassert(elemSzB == 4);
4db4a6
+               IRTemp loaded_data64 = newTemp(Ity_I64);
4db4a6
+               assign(loaded_data64, loadLE(fullTy, mkexpr(ea)));
4db4a6
+               stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, mkexpr(loaded_data64) ));
4db4a6
+               stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, mkU64(0) ));
4db4a6
+               stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
4db4a6
+               stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(8) ));
4db4a6
+               putIReg64orZR(tt1, unop(Iop_32Uto64,
4db4a6
+                                       unop(Iop_64to32,
4db4a6
+                                            mkexpr(loaded_data64))));
4db4a6
+               putIReg64orZR(tt2, unop(Iop_32Uto64,
4db4a6
+                                       unop(Iop_64HIto32,
4db4a6
+                                            mkexpr(loaded_data64))));
4db4a6
+            } else {
4db4a6
+               vassert(elemSzB == 8 && fullTy == Ity_I128);
4db4a6
+               IRTemp loaded_data128 = newTemp(Ity_I128);
4db4a6
+               // Hack: do the load as V128 rather than I128 so as to avoid
4db4a6
+               // having to implement I128 loads in the arm64 back end.
4db4a6
+               assign(loaded_data128, unop(Iop_ReinterpV128asI128,
4db4a6
+                                           loadLE(Ity_V128, mkexpr(ea))));
4db4a6
+               IRTemp loaded_data_lo64 = newTemp(Ity_I64);
4db4a6
+               IRTemp loaded_data_hi64 = newTemp(Ity_I64);
4db4a6
+               assign(loaded_data_lo64, unop(Iop_128to64,
4db4a6
+                                             mkexpr(loaded_data128)));
4db4a6
+               assign(loaded_data_hi64, unop(Iop_128HIto64,
4db4a6
+                                             mkexpr(loaded_data128)));
4db4a6
+               stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64,
4db4a6
+                                 mkexpr(loaded_data_lo64) ));
4db4a6
+               stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64,
4db4a6
+                                 mkexpr(loaded_data_hi64) ));
4db4a6
+               stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
4db4a6
+               stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(16) ));
4db4a6
+               putIReg64orZR(tt1, mkexpr(loaded_data_lo64));
4db4a6
+               putIReg64orZR(tt2, mkexpr(loaded_data_hi64));
4db4a6
+            }
4db4a6
+         } else {
4db4a6
+            // Non-fallback implementation of LL.
4db4a6
+            IRTemp res = newTemp(fullTy); // I64 or I128
4db4a6
+            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
4db4a6
+            // Assuming a little-endian guest here.  Rt1 goes at the lower
4db4a6
+            // address, so it must live in the least significant half of `res`.
4db4a6
+            IROp opGetLO = fullTy == Ity_I128 ? Iop_128to64   : Iop_64to32;
4db4a6
+            IROp opGetHI = fullTy == Ity_I128 ? Iop_128HIto64 : Iop_64HIto32;
4db4a6
+            putIReg64orZR(tt1, widenUto64(elemTy, unop(opGetLO, mkexpr(res))));
4db4a6
+            putIReg64orZR(tt2, widenUto64(elemTy, unop(opGetHI, mkexpr(res))));
4db4a6
+         }
4db4a6
+         if (isAcqOrRel) {
4db4a6
+            stmt(IRStmt_MBE(Imbe_Fence));
4db4a6
+         }
4db4a6
+         DIP("ld%sxp %s, %s, [%s] %s\n",
4db4a6
+             isAcqOrRel ? (isLD ? "a" : "l") : "",
4db4a6
+             nameIRegOrZR(elemSzB == 8, tt1),
4db4a6
+             nameIRegOrZR(elemSzB == 8, tt2),
4db4a6
+             nameIReg64orSP(nn),
4db4a6
+             abiinfo->guest__use_fallback_LLSC
4db4a6
+                ? "(fallback implementation)" : "");
4db4a6
+         return True;
4db4a6
+      }
4db4a6
+      if (!isLD) {
4db4a6
+         if (isAcqOrRel) {
4db4a6
+            stmt(IRStmt_MBE(Imbe_Fence));
4db4a6
+         }
4db4a6
+         if (abiinfo->guest__use_fallback_LLSC) {
4db4a6
+            // Fallback implementation of SC.
4db4a6
+            // This is really ugly, since we don't have any way to do
4db4a6
+            // proper if-then-else.  First, set up as if the SC failed,
4db4a6
+            // and jump forwards if it really has failed.
4db4a6
+
4db4a6
+            // Continuation address
4db4a6
+            IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4);
4db4a6
+
4db4a6
+            // "the SC failed".  Any non-zero value means failure.
4db4a6
+            putIReg64orZR(ss, mkU64(1));
4db4a6
+
4db4a6
+            IRTemp tmp_LLsize = newTemp(Ity_I64);
4db4a6
+            assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64));
4db4a6
+            stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction"
4db4a6
+            ));
4db4a6
+            // Fail if no or wrong-size transaction
4db4a6
+            vassert((fullSzB == 8 && fullTy == Ity_I64)
4db4a6
+                    || (fullSzB == 16 && fullTy == Ity_I128));
4db4a6
+            stmt( IRStmt_Exit(
4db4a6
+                     binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(fullSzB)),
4db4a6
+                     Ijk_Boring, nia, OFFB_PC
4db4a6
+            ));
4db4a6
+            // Fail if the address doesn't match the LL address
4db4a6
+            stmt( IRStmt_Exit(
4db4a6
+                      binop(Iop_CmpNE64, mkexpr(ea),
4db4a6
+                                         IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)),
4db4a6
+                      Ijk_Boring, nia, OFFB_PC
4db4a6
+            ));
4db4a6
+            // The data to be stored.
4db4a6
+            IRTemp store_data = newTemp(fullTy);
4db4a6
+            if (fullTy == Ity_I64) {
4db4a6
+               assign(store_data,
4db4a6
+                      binop(Iop_32HLto64,
4db4a6
+                            narrowFrom64(Ity_I32, getIReg64orZR(tt2)),
4db4a6
+                            narrowFrom64(Ity_I32, getIReg64orZR(tt1))));
4db4a6
+            } else {
4db4a6
+               assign(store_data,
4db4a6
+                      binop(Iop_64HLto128,
4db4a6
+                            getIReg64orZR(tt2), getIReg64orZR(tt1)));
4db4a6
+            }
4db4a6
+
4db4a6
+            if (fullTy == Ity_I64) {
4db4a6
+               // 64 bit (2x32 bit) path
4db4a6
+               // Fail if the data in memory doesn't match the data stashed by
4db4a6
+               // the LL.
4db4a6
+               IRTemp llsc_data_lo64 = newTemp(Ity_I64);
4db4a6
+               assign(llsc_data_lo64,
4db4a6
+                      IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64));
4db4a6
+               stmt( IRStmt_Exit(
4db4a6
+                         binop(Iop_CmpNE64, loadLE(Ity_I64, mkexpr(ea)),
4db4a6
+                                            mkexpr(llsc_data_lo64)),
4db4a6
+                      Ijk_Boring, nia, OFFB_PC
4db4a6
+               ));
4db4a6
+               // Try to CAS the new value in.
4db4a6
+               IRTemp old = newTemp(Ity_I64);
4db4a6
+               IRTemp expd = newTemp(Ity_I64);
4db4a6
+               assign(expd, mkexpr(llsc_data_lo64));
4db4a6
+               stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
4db4a6
+                                        Iend_LE, mkexpr(ea),
4db4a6
+                                        /*expdHi*/NULL, mkexpr(expd),
4db4a6
+                                        /*dataHi*/NULL, mkexpr(store_data)
4db4a6
+               )));
4db4a6
+               // Fail if the CAS failed (viz, old != expd)
4db4a6
+               stmt( IRStmt_Exit(
4db4a6
+                         binop(Iop_CmpNE64, mkexpr(old), mkexpr(expd)),
4db4a6
+                         Ijk_Boring, nia, OFFB_PC
4db4a6
+               ));
4db4a6
+            } else {
4db4a6
+               // 128 bit (2x64 bit) path
4db4a6
+               // Fail if the data in memory doesn't match the data stashed by
4db4a6
+               // the LL.
4db4a6
+               IRTemp llsc_data_lo64 = newTemp(Ity_I64);
4db4a6
+               assign(llsc_data_lo64,
4db4a6
+                      IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64));
4db4a6
+               IRTemp llsc_data_hi64 = newTemp(Ity_I64);
4db4a6
+               assign(llsc_data_hi64,
4db4a6
+                      IRExpr_Get(OFFB_LLSC_DATA_HI64, Ity_I64));
4db4a6
+               IRTemp data_at_ea = newTemp(Ity_I128);
4db4a6
+               assign(data_at_ea,
4db4a6
+                      unop(Iop_ReinterpV128asI128,
4db4a6
+                           loadLE(Ity_V128, mkexpr(ea))));
4db4a6
+               stmt( IRStmt_Exit(
4db4a6
+                        binop(Iop_CmpNE64,
4db4a6
+                              unop(Iop_128to64, mkexpr(data_at_ea)),
4db4a6
+                              mkexpr(llsc_data_lo64)),
4db4a6
+                        Ijk_Boring, nia, OFFB_PC
4db4a6
+               ));
4db4a6
+               stmt( IRStmt_Exit(
4db4a6
+                        binop(Iop_CmpNE64,
4db4a6
+                              unop(Iop_128HIto64, mkexpr(data_at_ea)),
4db4a6
+                              mkexpr(llsc_data_hi64)),
4db4a6
+                        Ijk_Boring, nia, OFFB_PC
4db4a6
+               ));
4db4a6
+               // Try to CAS the new value in.
4db4a6
+               IRTemp old_lo64 = newTemp(Ity_I64);
4db4a6
+               IRTemp old_hi64 = newTemp(Ity_I64);
4db4a6
+               IRTemp expd_lo64 = newTemp(Ity_I64);
4db4a6
+               IRTemp expd_hi64 = newTemp(Ity_I64);
4db4a6
+               IRTemp store_data_lo64 = newTemp(Ity_I64);
4db4a6
+               IRTemp store_data_hi64 = newTemp(Ity_I64);
4db4a6
+               assign(expd_lo64, mkexpr(llsc_data_lo64));
4db4a6
+               assign(expd_hi64, mkexpr(llsc_data_hi64));
4db4a6
+               assign(store_data_lo64, unop(Iop_128to64, mkexpr(store_data)));
4db4a6
+               assign(store_data_hi64, unop(Iop_128HIto64, mkexpr(store_data)));
4db4a6
+               stmt( IRStmt_CAS(mkIRCAS(old_hi64, old_lo64,
4db4a6
+                                        Iend_LE, mkexpr(ea),
4db4a6
+                                        mkexpr(expd_hi64), mkexpr(expd_lo64),
4db4a6
+                                        mkexpr(store_data_hi64),
4db4a6
+                                        mkexpr(store_data_lo64)
4db4a6
+               )));
4db4a6
+               // Fail if the CAS failed (viz, old != expd)
4db4a6
+               stmt( IRStmt_Exit(
4db4a6
+                        binop(Iop_CmpNE64, mkexpr(old_lo64), mkexpr(expd_lo64)),
4db4a6
+                        Ijk_Boring, nia, OFFB_PC
4db4a6
+               ));
4db4a6
+               stmt( IRStmt_Exit(
4db4a6
+                        binop(Iop_CmpNE64, mkexpr(old_hi64), mkexpr(expd_hi64)),
4db4a6
+                        Ijk_Boring, nia, OFFB_PC
4db4a6
+               ));
4db4a6
+            }
4db4a6
+            // Otherwise we succeeded (!)
4db4a6
+            putIReg64orZR(ss, mkU64(0));
4db4a6
+         } else {
4db4a6
+            // Non-fallback implementation of SC.
4db4a6
+            IRTemp  res     = newTemp(Ity_I1);
4db4a6
+            IRExpr* dataLO  = narrowFrom64(elemTy, getIReg64orZR(tt1));
4db4a6
+            IRExpr* dataHI  = narrowFrom64(elemTy, getIReg64orZR(tt2));
4db4a6
+            IROp    opMerge = fullTy == Ity_I128 ? Iop_64HLto128 : Iop_32HLto64;
4db4a6
+            IRExpr* data    = binop(opMerge, dataHI, dataLO);
4db4a6
+            // Assuming a little-endian guest here.  Rt1 goes at the lower
4db4a6
+            // address, so it must live in the least significant half of `data`.
4db4a6
+            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
4db4a6
+            /* IR semantics: res is 1 if store succeeds, 0 if it fails.
4db4a6
+               Need to set rS to 1 on failure, 0 on success. */
4db4a6
+            putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
4db4a6
+                                               mkU64(1)));
4db4a6
+         }
4db4a6
+         DIP("st%sxp %s, %s, %s, [%s] %s\n",
4db4a6
+             isAcqOrRel ? (isLD ? "a" : "l") : "",
4db4a6
+             nameIRegOrZR(False, ss),
4db4a6
+             nameIRegOrZR(elemSzB == 8, tt1),
4db4a6
+             nameIRegOrZR(elemSzB == 8, tt2),
4db4a6
+             nameIReg64orSP(nn),
4db4a6
+             abiinfo->guest__use_fallback_LLSC
4db4a6
+                ? "(fallback implementation)" : "");
4db4a6
+         return True;
4db4a6
+      }
4db4a6
+      /* else fall through */
4db4a6
+   }
4db4a6
+
4db4a6
    /* ------------------ LDA{R,RH,RB} ------------------ */
4db4a6
    /* ------------------ STL{R,RH,RB} ------------------ */
4db4a6
    /* 31 29     23  20      14    9 4
4db4a6
diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c
4db4a6
index 5657bcab9..b65e27db4 100644
4db4a6
--- a/VEX/priv/host_arm64_defs.c
4db4a6
+++ b/VEX/priv/host_arm64_defs.c
4db4a6
@@ -1059,6 +1059,16 @@ ARM64Instr* ARM64Instr_StrEX ( Int szB ) {
4db4a6
    vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
4db4a6
    return i;
4db4a6
 }
4db4a6
+ARM64Instr* ARM64Instr_LdrEXP ( void ) {
4db4a6
+   ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
4db4a6
+   i->tag        = ARM64in_LdrEXP;
4db4a6
+   return i;
4db4a6
+}
4db4a6
+ARM64Instr* ARM64Instr_StrEXP ( void ) {
4db4a6
+   ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
4db4a6
+   i->tag        = ARM64in_StrEXP;
4db4a6
+   return i;
4db4a6
+}
4db4a6
 ARM64Instr* ARM64Instr_CAS ( Int szB ) {
4db4a6
    ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
4db4a6
    i->tag             = ARM64in_CAS;
4db4a6
@@ -1699,12 +1709,19 @@ void ppARM64Instr ( const ARM64Instr* i ) {
4db4a6
                     sz, i->ARM64in.StrEX.szB == 8 ? 'x' : 'w');
4db4a6
          return;
4db4a6
       }
4db4a6
+      case ARM64in_LdrEXP:
4db4a6
+         vex_printf("ldxp   x2, x3, [x4]");
4db4a6
+         return;
4db4a6
+      case ARM64in_StrEXP:
4db4a6
+         vex_printf("stxp   w0, x2, x3, [x4]");
4db4a6
+         return;
4db4a6
       case ARM64in_CAS: {
4db4a6
          vex_printf("x1 = cas(%dbit)(x3, x5 -> x7)", 8 * i->ARM64in.CAS.szB);
4db4a6
          return;
4db4a6
       }
4db4a6
       case ARM64in_CASP: {
4db4a6
-         vex_printf("x0,x1 = casp(%dbit)(x2, x4,x5 -> x6,x7)", 8 * i->ARM64in.CASP.szB);
4db4a6
+         vex_printf("x0,x1 = casp(2x%dbit)(x2, x4,x5 -> x6,x7)",
4db4a6
+                    8 * i->ARM64in.CASP.szB);
4db4a6
          return;
4db4a6
       }
4db4a6
       case ARM64in_MFence:
4db4a6
@@ -2253,6 +2270,17 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 )
4db4a6
          addHRegUse(u, HRmWrite, hregARM64_X0());
4db4a6
          addHRegUse(u, HRmRead, hregARM64_X2());
4db4a6
          return;
4db4a6
+      case ARM64in_LdrEXP:
4db4a6
+         addHRegUse(u, HRmRead, hregARM64_X4());
4db4a6
+         addHRegUse(u, HRmWrite, hregARM64_X2());
4db4a6
+         addHRegUse(u, HRmWrite, hregARM64_X3());
4db4a6
+         return;
4db4a6
+      case ARM64in_StrEXP:
4db4a6
+         addHRegUse(u, HRmRead, hregARM64_X4());
4db4a6
+         addHRegUse(u, HRmWrite, hregARM64_X0());
4db4a6
+         addHRegUse(u, HRmRead, hregARM64_X2());
4db4a6
+         addHRegUse(u, HRmRead, hregARM64_X3());
4db4a6
+         return;
4db4a6
       case ARM64in_CAS:
4db4a6
          addHRegUse(u, HRmRead, hregARM64_X3());
4db4a6
          addHRegUse(u, HRmRead, hregARM64_X5());
4db4a6
@@ -2571,6 +2599,10 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 )
4db4a6
          return;
4db4a6
       case ARM64in_StrEX:
4db4a6
          return;
4db4a6
+      case ARM64in_LdrEXP:
4db4a6
+         return;
4db4a6
+      case ARM64in_StrEXP:
4db4a6
+         return;
4db4a6
       case ARM64in_CAS:
4db4a6
          return;
4db4a6
       case ARM64in_CASP:
4db4a6
@@ -4167,6 +4199,16 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
4db4a6
          }
4db4a6
          goto bad;
4db4a6
       }
4db4a6
+      case ARM64in_LdrEXP: {
4db4a6
+         // 820C7FC8   ldxp x2, x3, [x4]
4db4a6
+         *p++ = 0xC87F0C82;
4db4a6
+         goto done;
4db4a6
+      }
4db4a6
+      case ARM64in_StrEXP: {
4db4a6
+         // 820C20C8   stxp w0, x2, x3, [x4]
4db4a6
+         *p++ = 0xC8200C82;
4db4a6
+         goto done;
4db4a6
+      }
4db4a6
       case ARM64in_CAS: {
4db4a6
          /* This isn't simple.  For an explanation see the comment in
4db4a6
             host_arm64_defs.h on the definition of ARM64Instr case CAS.
4db4a6
diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h
4db4a6
index 01fb5708e..dc686dff7 100644
4db4a6
--- a/VEX/priv/host_arm64_defs.h
4db4a6
+++ b/VEX/priv/host_arm64_defs.h
4db4a6
@@ -509,8 +509,10 @@ typedef
4db4a6
       ARM64in_AddToSP,     /* move SP by small, signed constant */
4db4a6
       ARM64in_FromSP,      /* move SP to integer register */
4db4a6
       ARM64in_Mul,
4db4a6
-      ARM64in_LdrEX,
4db4a6
-      ARM64in_StrEX,
4db4a6
+      ARM64in_LdrEX,       /* load exclusive, single register */
4db4a6
+      ARM64in_StrEX,       /* store exclusive, single register */
4db4a6
+      ARM64in_LdrEXP,      /* load exclusive, register pair, 2x64-bit only */
4db4a6
+      ARM64in_StrEXP,      /* store exclusive, register pair, 2x64-bit only */
4db4a6
       ARM64in_CAS,
4db4a6
       ARM64in_CASP,
4db4a6
       ARM64in_MFence,
4db4a6
@@ -719,6 +721,12 @@ typedef
4db4a6
          struct {
4db4a6
             Int  szB; /* 1, 2, 4 or 8 */
4db4a6
          } StrEX;
4db4a6
+         /* LDXP x2, x3, [x4].  This is 2x64-bit only. */
4db4a6
+         struct {
4db4a6
+         } LdrEXP;
4db4a6
+         /* STXP w0, x2, x3, [x4].  This is 2x64-bit only. */
4db4a6
+         struct {
4db4a6
+         } StrEXP;
4db4a6
          /* x1 = CAS(x3(addr), x5(expected) -> x7(new)),
4db4a6
             and trashes x8
4db4a6
             where x1[8*szB-1 : 0] == x5[8*szB-1 : 0] indicates success,
4db4a6
@@ -1037,6 +1045,8 @@ extern ARM64Instr* ARM64Instr_Mul     ( HReg dst, HReg argL, HReg argR,
4db4a6
                                         ARM64MulOp op );
4db4a6
 extern ARM64Instr* ARM64Instr_LdrEX   ( Int szB );
4db4a6
 extern ARM64Instr* ARM64Instr_StrEX   ( Int szB );
4db4a6
+extern ARM64Instr* ARM64Instr_LdrEXP  ( void );
4db4a6
+extern ARM64Instr* ARM64Instr_StrEXP  ( void );
4db4a6
 extern ARM64Instr* ARM64Instr_CAS     ( Int szB );
4db4a6
 extern ARM64Instr* ARM64Instr_CASP    ( Int szB );
4db4a6
 extern ARM64Instr* ARM64Instr_MFence  ( void );
4db4a6
diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c
4db4a6
index 4b1d8c846..094e7e74b 100644
4db4a6
--- a/VEX/priv/host_arm64_isel.c
4db4a6
+++ b/VEX/priv/host_arm64_isel.c
4db4a6
@@ -196,9 +196,9 @@ static HReg        iselCondCode_R        ( ISelEnv* env, IRExpr* e );
4db4a6
 static HReg        iselIntExpr_R_wrk     ( ISelEnv* env, IRExpr* e );
4db4a6
 static HReg        iselIntExpr_R         ( ISelEnv* env, IRExpr* e );
4db4a6
 
4db4a6
-static void        iselInt128Expr_wrk    ( /*OUT*/HReg* rHi, HReg* rLo, 
4db4a6
+static void        iselInt128Expr_wrk    ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
4db4a6
                                            ISelEnv* env, IRExpr* e );
4db4a6
-static void        iselInt128Expr        ( /*OUT*/HReg* rHi, HReg* rLo, 
4db4a6
+static void        iselInt128Expr        ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
4db4a6
                                            ISelEnv* env, IRExpr* e );
4db4a6
 
4db4a6
 static HReg        iselDblExpr_wrk        ( ISelEnv* env, IRExpr* e );
4db4a6
@@ -1759,9 +1759,12 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
4db4a6
 
4db4a6
       /* AND/OR/XOR(e1, e2) (for any e1, e2) */
4db4a6
       switch (e->Iex.Binop.op) {
4db4a6
-         case Iop_And64: case Iop_And32: lop = ARM64lo_AND; goto log_binop;
4db4a6
-         case Iop_Or64:  case Iop_Or32:  case Iop_Or16: lop = ARM64lo_OR;  goto log_binop;
4db4a6
-         case Iop_Xor64: case Iop_Xor32: lop = ARM64lo_XOR; goto log_binop;
4db4a6
+         case Iop_And64: case Iop_And32:
4db4a6
+            lop = ARM64lo_AND; goto log_binop;
4db4a6
+         case Iop_Or64:  case Iop_Or32:  case Iop_Or16:
4db4a6
+            lop = ARM64lo_OR;  goto log_binop;
4db4a6
+         case Iop_Xor64: case Iop_Xor32:
4db4a6
+            lop = ARM64lo_XOR; goto log_binop;
4db4a6
          log_binop: {
4db4a6
             HReg      dst  = newVRegI(env);
4db4a6
             HReg      argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
4db4a6
@@ -2013,6 +2016,11 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
4db4a6
             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
4db4a6
             return rHi; /* and abandon rLo */
4db4a6
          }
4db4a6
+         case Iop_128to64: {
4db4a6
+            HReg rHi, rLo;
4db4a6
+            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
4db4a6
+            return rLo; /* and abandon rHi */
4db4a6
+         }
4db4a6
          case Iop_8Sto32: case Iop_8Sto64: {
4db4a6
             IRExpr* arg = e->Iex.Unop.arg;
4db4a6
             HReg    src = iselIntExpr_R(env, arg);
4db4a6
@@ -2185,13 +2193,19 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
4db4a6
             }
4db4a6
             return dst;
4db4a6
          }
4db4a6
+         case Iop_64HIto32: {
4db4a6
+            HReg dst = newVRegI(env);
4db4a6
+            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
4db4a6
+            addInstr(env, ARM64Instr_Shift(dst, src, ARM64RI6_I6(32),
4db4a6
+                                           ARM64sh_SHR));
4db4a6
+            return dst;
4db4a6
+         }
4db4a6
          case Iop_64to32:
4db4a6
          case Iop_64to16:
4db4a6
          case Iop_64to8:
4db4a6
          case Iop_32to16:
4db4a6
             /* These are no-ops. */
4db4a6
             return iselIntExpr_R(env, e->Iex.Unop.arg);
4db4a6
-
4db4a6
          default:
4db4a6
             break;
4db4a6
       }
4db4a6
@@ -2335,6 +2349,43 @@ static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
4db4a6
    vassert(e);
4db4a6
    vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
4db4a6
 
4db4a6
+   /* --------- TEMP --------- */
4db4a6
+   if (e->tag == Iex_RdTmp) {
4db4a6
+      lookupIRTempPair(rHi, rLo, env, e->Iex.RdTmp.tmp);
4db4a6
+      return;
4db4a6
+   }
4db4a6
+
4db4a6
+   /* --------- CONST --------- */
4db4a6
+   if (e->tag == Iex_Const) {
4db4a6
+      IRConst* c = e->Iex.Const.con;
4db4a6
+      vassert(c->tag == Ico_U128);
4db4a6
+      if (c->Ico.U128 == 0) {
4db4a6
+         // The only case we need to handle (so far)
4db4a6
+         HReg zero = newVRegI(env);
4db4a6
+         addInstr(env, ARM64Instr_Imm64(zero, 0));
4db4a6
+         *rHi = *rLo = zero;
4db4a6
+         return;
4db4a6
+      }
4db4a6
+   }
4db4a6
+
4db4a6
+   /* --------- UNARY ops --------- */
4db4a6
+   if (e->tag == Iex_Unop) {
4db4a6
+      switch (e->Iex.Unop.op) {
4db4a6
+         case Iop_ReinterpV128asI128: {
4db4a6
+            HReg dstHi = newVRegI(env);
4db4a6
+            HReg dstLo = newVRegI(env);
4db4a6
+            HReg src    = iselV128Expr(env, e->Iex.Unop.arg);
4db4a6
+            addInstr(env, ARM64Instr_VXfromQ(dstHi, src, 1));
4db4a6
+            addInstr(env, ARM64Instr_VXfromQ(dstLo, src, 0));
4db4a6
+            *rHi = dstHi;
4db4a6
+            *rLo = dstLo;
4db4a6
+            return;
4db4a6
+         }
4db4a6
+         default:
4db4a6
+            break;
4db4a6
+      }
4db4a6
+   }
4db4a6
+
4db4a6
    /* --------- BINARY ops --------- */
4db4a6
    if (e->tag == Iex_Binop) {
4db4a6
       switch (e->Iex.Binop.op) {
4db4a6
@@ -4086,6 +4137,14 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4db4a6
          addInstr(env, ARM64Instr_VMov(8/*yes, really*/, dst, src));
4db4a6
          return;
4db4a6
       }
4db4a6
+      if (ty == Ity_I128) {
4db4a6
+         HReg rHi, rLo, dstHi, dstLo;
4db4a6
+         iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
4db4a6
+         lookupIRTempPair( &dstHi, &dstLo, env, tmp);
4db4a6
+         addInstr(env, ARM64Instr_MovI(dstHi, rHi));
4db4a6
+         addInstr(env, ARM64Instr_MovI(dstLo, rLo));
4db4a6
+         return;
4db4a6
+      }
4db4a6
       if (ty == Ity_V128) {
4db4a6
          HReg src = iselV128Expr(env, stmt->Ist.WrTmp.data);
4db4a6
          HReg dst = lookupIRTemp(env, tmp);
4db4a6
@@ -4183,42 +4242,67 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4db4a6
          /* LL */
4db4a6
          IRTemp res = stmt->Ist.LLSC.result;
4db4a6
          IRType ty  = typeOfIRTemp(env->type_env, res);
4db4a6
-         if (ty == Ity_I64 || ty == Ity_I32 
4db4a6
+         if (ty == Ity_I128 || ty == Ity_I64 || ty == Ity_I32
4db4a6
              || ty == Ity_I16 || ty == Ity_I8) {
4db4a6
             Int  szB   = 0;
4db4a6
-            HReg r_dst = lookupIRTemp(env, res);
4db4a6
             HReg raddr = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
4db4a6
             switch (ty) {
4db4a6
-               case Ity_I8:  szB = 1; break;
4db4a6
-               case Ity_I16: szB = 2; break;
4db4a6
-               case Ity_I32: szB = 4; break;
4db4a6
-               case Ity_I64: szB = 8; break;
4db4a6
-               default:      vassert(0);
4db4a6
+               case Ity_I8:   szB = 1;  break;
4db4a6
+               case Ity_I16:  szB = 2;  break;
4db4a6
+               case Ity_I32:  szB = 4;  break;
4db4a6
+               case Ity_I64:  szB = 8;  break;
4db4a6
+               case Ity_I128: szB = 16; break;
4db4a6
+               default:       vassert(0);
4db4a6
+            }
4db4a6
+            if (szB == 16) {
4db4a6
+               HReg r_dstMSword = INVALID_HREG;
4db4a6
+               HReg r_dstLSword = INVALID_HREG;
4db4a6
+               lookupIRTempPair(&r_dstMSword, &r_dstLSword, env, res);
4db4a6
+               addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr));
4db4a6
+               addInstr(env, ARM64Instr_LdrEXP());
4db4a6
+               addInstr(env, ARM64Instr_MovI(r_dstLSword, hregARM64_X2()));
4db4a6
+               addInstr(env, ARM64Instr_MovI(r_dstMSword, hregARM64_X3()));
4db4a6
+            } else {
4db4a6
+               vassert(szB != 0);
4db4a6
+               HReg r_dst = lookupIRTemp(env, res);
4db4a6
+               addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr));
4db4a6
+               addInstr(env, ARM64Instr_LdrEX(szB));
4db4a6
+               addInstr(env, ARM64Instr_MovI(r_dst, hregARM64_X2()));
4db4a6
             }
4db4a6
-            addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr));
4db4a6
-            addInstr(env, ARM64Instr_LdrEX(szB));
4db4a6
-            addInstr(env, ARM64Instr_MovI(r_dst, hregARM64_X2()));
4db4a6
             return;
4db4a6
          }
4db4a6
          goto stmt_fail;
4db4a6
       } else {
4db4a6
          /* SC */
4db4a6
          IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.LLSC.storedata);
4db4a6
-         if (tyd == Ity_I64 || tyd == Ity_I32
4db4a6
+         if (tyd == Ity_I128 || tyd == Ity_I64 || tyd == Ity_I32
4db4a6
              || tyd == Ity_I16 || tyd == Ity_I8) {
4db4a6
             Int  szB = 0;
4db4a6
-            HReg rD  = iselIntExpr_R(env, stmt->Ist.LLSC.storedata);
4db4a6
             HReg rA  = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
4db4a6
             switch (tyd) {
4db4a6
-               case Ity_I8:  szB = 1; break;
4db4a6
-               case Ity_I16: szB = 2; break;
4db4a6
-               case Ity_I32: szB = 4; break;
4db4a6
-               case Ity_I64: szB = 8; break;
4db4a6
-               default:      vassert(0);
4db4a6
+               case Ity_I8:   szB = 1; break;
4db4a6
+               case Ity_I16:  szB = 2; break;
4db4a6
+               case Ity_I32:  szB = 4; break;
4db4a6
+               case Ity_I64:  szB = 8; break;
4db4a6
+               case Ity_I128: szB = 16; break;
4db4a6
+               default:       vassert(0);
4db4a6
+            }
4db4a6
+            if (szB == 16) {
4db4a6
+               HReg rD_MSword = INVALID_HREG;
4db4a6
+               HReg rD_LSword = INVALID_HREG;
4db4a6
+               iselInt128Expr(&rD_MSword,
4db4a6
+                              &rD_LSword, env, stmt->Ist.LLSC.storedata);
4db4a6
+               addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD_LSword));
4db4a6
+               addInstr(env, ARM64Instr_MovI(hregARM64_X3(), rD_MSword));
4db4a6
+               addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA));
4db4a6
+               addInstr(env, ARM64Instr_StrEXP());
4db4a6
+            } else {
4db4a6
+               vassert(szB != 0);
4db4a6
+               HReg rD  = iselIntExpr_R(env, stmt->Ist.LLSC.storedata);
4db4a6
+               addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD));
4db4a6
+               addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA));
4db4a6
+               addInstr(env, ARM64Instr_StrEX(szB));
4db4a6
             }
4db4a6
-            addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD));
4db4a6
-            addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA));
4db4a6
-            addInstr(env, ARM64Instr_StrEX(szB));
4db4a6
          } else {
4db4a6
             goto stmt_fail;
4db4a6
          }
4db4a6
@@ -4243,10 +4327,10 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4db4a6
 
4db4a6
    /* --------- ACAS --------- */
4db4a6
    case Ist_CAS: {
4db4a6
-      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
4db4a6
+      IRCAS* cas = stmt->Ist.CAS.details;
4db4a6
+      if (cas->oldHi == IRTemp_INVALID && cas->end == Iend_LE) {
4db4a6
          /* "normal" singleton CAS */
4db4a6
          UChar  sz;
4db4a6
-         IRCAS* cas = stmt->Ist.CAS.details;
4db4a6
          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4db4a6
          switch (ty) { 
4db4a6
             case Ity_I64: sz = 8; break;
4db4a6
@@ -4281,10 +4365,9 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
4db4a6
          addInstr(env, ARM64Instr_MovI(rOld, rResult));
4db4a6
          return;
4db4a6
       }
4db4a6
-      else {
4db4a6
+      if (cas->oldHi != IRTemp_INVALID && cas->end == Iend_LE) {
4db4a6
          /* Paired register CAS, i.e. CASP */
4db4a6
          UChar  sz;
4db4a6
-         IRCAS* cas = stmt->Ist.CAS.details;
4db4a6
          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
4db4a6
          switch (ty) {
4db4a6
             case Ity_I64: sz = 8; break;
4db4a6
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
4db4a6
index 25566c41c..2d82c41a1 100644
4db4a6
--- a/VEX/priv/ir_defs.c
4db4a6
+++ b/VEX/priv/ir_defs.c
4db4a6
@@ -76,6 +76,7 @@ void ppIRConst ( const IRConst* con )
4db4a6
       case Ico_U16:  vex_printf( "0x%x:I16",     (UInt)(con->Ico.U16)); break;
4db4a6
       case Ico_U32:  vex_printf( "0x%x:I32",     (UInt)(con->Ico.U32)); break;
4db4a6
       case Ico_U64:  vex_printf( "0x%llx:I64",   (ULong)(con->Ico.U64)); break;
4db4a6
+      case Ico_U128: vex_printf( "I128{0x%04x}", (UInt)(con->Ico.U128)); break;
4db4a6
       case Ico_F32:  u.f32 = con->Ico.F32;
4db4a6
                      vex_printf( "F32{0x%x}",   u.i32);
4db4a6
                      break;
4db4a6
@@ -2266,6 +2267,13 @@ IRConst* IRConst_U64 ( ULong u64 )
4db4a6
    c->Ico.U64 = u64;
4db4a6
    return c;
4db4a6
 }
4db4a6
+IRConst* IRConst_U128 ( UShort con )
4db4a6
+{
4db4a6
+   IRConst* c  = LibVEX_Alloc_inline(sizeof(IRConst));
4db4a6
+   c->tag      = Ico_U128;
4db4a6
+   c->Ico.U128 = con;
4db4a6
+   return c;
4db4a6
+}
4db4a6
 IRConst* IRConst_F32 ( Float f32 )
4db4a6
 {
4db4a6
    IRConst* c = LibVEX_Alloc_inline(sizeof(IRConst));
4db4a6
@@ -4230,6 +4238,7 @@ IRType typeOfIRConst ( const IRConst* con )
4db4a6
       case Ico_U16:   return Ity_I16;
4db4a6
       case Ico_U32:   return Ity_I32;
4db4a6
       case Ico_U64:   return Ity_I64;
4db4a6
+      case Ico_U128:  return Ity_I128;
4db4a6
       case Ico_F32:   return Ity_F32;
4db4a6
       case Ico_F32i:  return Ity_F32;
4db4a6
       case Ico_F64:   return Ity_F64;
4db4a6
@@ -5129,7 +5138,7 @@ void tcStmt ( const IRSB* bb, const IRStmt* stmt, IRType gWordTy )
4db4a6
          tyRes = typeOfIRTemp(tyenv, stmt->Ist.LLSC.result);
4db4a6
          if (stmt->Ist.LLSC.storedata == NULL) {
4db4a6
             /* it's a LL */
4db4a6
-            if (tyRes != Ity_I64 && tyRes != Ity_I32
4db4a6
+            if (tyRes != Ity_I128 && tyRes != Ity_I64 && tyRes != Ity_I32
4db4a6
                 && tyRes != Ity_I16 && tyRes != Ity_I8)
4db4a6
                sanityCheckFail(bb,stmt,"Ist.LLSC(LL).result :: bogus");
4db4a6
          } else {
4db4a6
@@ -5137,7 +5146,7 @@ void tcStmt ( const IRSB* bb, const IRStmt* stmt, IRType gWordTy )
4db4a6
             if (tyRes != Ity_I1)
4db4a6
                sanityCheckFail(bb,stmt,"Ist.LLSC(SC).result: not :: Ity_I1");
4db4a6
             tyData = typeOfIRExpr(tyenv, stmt->Ist.LLSC.storedata);
4db4a6
-            if (tyData != Ity_I64 && tyData != Ity_I32
4db4a6
+            if (tyData != Ity_I128 && tyData != Ity_I64 && tyData != Ity_I32
4db4a6
                 && tyData != Ity_I16 && tyData != Ity_I8)
4db4a6
                sanityCheckFail(bb,stmt,
4db4a6
                                "Ist.LLSC(SC).result :: storedata bogus");
4db4a6
@@ -5385,6 +5394,7 @@ Int sizeofIRType ( IRType ty )
4db4a6
 IRType integerIRTypeOfSize ( Int szB )
4db4a6
 {
4db4a6
    switch (szB) {
4db4a6
+      case 16: return Ity_I128;
4db4a6
       case 8: return Ity_I64;
4db4a6
       case 4: return Ity_I32;
4db4a6
       case 2: return Ity_I16;
4db4a6
diff --git a/VEX/pub/libvex_guest_arm64.h b/VEX/pub/libvex_guest_arm64.h
4db4a6
index 39b6ecdc2..91d06bd75 100644
4db4a6
--- a/VEX/pub/libvex_guest_arm64.h
4db4a6
+++ b/VEX/pub/libvex_guest_arm64.h
4db4a6
@@ -157,14 +157,18 @@ typedef
4db4a6
          note of bits 23 and 22. */
4db4a6
       UInt  guest_FPCR;
4db4a6
 
4db4a6
-      /* Fallback LL/SC support.  See bugs 344524 and 369459. */
4db4a6
-      ULong guest_LLSC_SIZE; // 0==no current transaction, else 1,2,4 or 8.
4db4a6
+      /* Fallback LL/SC support.  See bugs 344524 and 369459.  _LO64 and _HI64
4db4a6
+         contain the original contents of _ADDR+0 .. _ADDR+15, but only _SIZE
4db4a6
+         bytes of that range.  The remaining 16-_SIZE bytes of the pair must be
4db4a6
+         zero. */
4db4a6
+      ULong guest_LLSC_SIZE; // 0==no current transaction, else 1,2,4,8 or 16.
4db4a6
       ULong guest_LLSC_ADDR; // Address of transaction.
4db4a6
-      ULong guest_LLSC_DATA; // Original value at _ADDR, zero-extended.
4db4a6
+      ULong guest_LLSC_DATA_LO64; // Original value at _ADDR+0.
4db4a6
+      ULong guest_LLSC_DATA_HI64; // Original value at _ADDR+8.
4db4a6
 
4db4a6
       /* Padding to make it have an 16-aligned size */
4db4a6
       /* UInt  pad_end_0; */
4db4a6
-      ULong pad_end_1;
4db4a6
+      /* ULong pad_end_1; */
4db4a6
    }
4db4a6
    VexGuestARM64State;
4db4a6
 
4db4a6
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
4db4a6
index deaa044c1..85805bb69 100644
4db4a6
--- a/VEX/pub/libvex_ir.h
4db4a6
+++ b/VEX/pub/libvex_ir.h
4db4a6
@@ -269,6 +269,8 @@ typedef
4db4a6
       Ico_U16, 
4db4a6
       Ico_U32, 
4db4a6
       Ico_U64,
4db4a6
+      Ico_U128,  /* 128-bit restricted integer constant,
4db4a6
+                    same encoding scheme as V128 */
4db4a6
       Ico_F32,   /* 32-bit IEEE754 floating */
4db4a6
       Ico_F32i,  /* 32-bit unsigned int to be interpreted literally
4db4a6
                     as a IEEE754 single value. */
4db4a6
@@ -295,6 +297,7 @@ typedef
4db4a6
          UShort U16;
4db4a6
          UInt   U32;
4db4a6
          ULong  U64;
4db4a6
+         UShort U128;
4db4a6
          Float  F32;
4db4a6
          UInt   F32i;
4db4a6
          Double F64;
4db4a6
@@ -311,6 +314,7 @@ extern IRConst* IRConst_U8   ( UChar );
4db4a6
 extern IRConst* IRConst_U16  ( UShort );
4db4a6
 extern IRConst* IRConst_U32  ( UInt );
4db4a6
 extern IRConst* IRConst_U64  ( ULong );
4db4a6
+extern IRConst* IRConst_U128 ( UShort );
4db4a6
 extern IRConst* IRConst_F32  ( Float );
4db4a6
 extern IRConst* IRConst_F32i ( UInt );
4db4a6
 extern IRConst* IRConst_F64  ( Double );
4db4a6
diff --git a/memcheck/mc_machine.c b/memcheck/mc_machine.c
4db4a6
index 919c7fae8..176c8e5cb 100644
4db4a6
--- a/memcheck/mc_machine.c
4db4a6
+++ b/memcheck/mc_machine.c
4db4a6
@@ -1115,9 +1115,10 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB )
4db4a6
    if (o == GOF(CMSTART) && sz == 8) return -1; // untracked
4db4a6
    if (o == GOF(CMLEN)   && sz == 8) return -1; // untracked
4db4a6
 
4db4a6
-   if (o == GOF(LLSC_SIZE) && sz == 8) return -1; // untracked
4db4a6
-   if (o == GOF(LLSC_ADDR) && sz == 8) return o;
4db4a6
-   if (o == GOF(LLSC_DATA) && sz == 8) return o;
4db4a6
+   if (o == GOF(LLSC_SIZE)      && sz == 8) return -1; // untracked
4db4a6
+   if (o == GOF(LLSC_ADDR)      && sz == 8) return o;
4db4a6
+   if (o == GOF(LLSC_DATA_LO64) && sz == 8) return o;
4db4a6
+   if (o == GOF(LLSC_DATA_HI64) && sz == 8) return o;
4db4a6
 
4db4a6
    VG_(printf)("MC_(get_otrack_shadow_offset)(arm64)(off=%d,sz=%d)\n",
4db4a6
                offset,szB);
4db4a6
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
4db4a6
index c6fd2653f..72ccb3c8c 100644
4db4a6
--- a/memcheck/mc_translate.c
4db4a6
+++ b/memcheck/mc_translate.c
4db4a6
@@ -5497,8 +5497,11 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
4db4a6
       the address (shadow) to 'defined' following the test. */
4db4a6
    complainIfUndefined( mce, addr, guard );
4db4a6
 
4db4a6
-   /* Now cook up a call to the relevant helper function, to read the
4db4a6
-      data V bits from shadow memory. */
4db4a6
+   /* Now cook up a call to the relevant helper function, to read the data V
4db4a6
+      bits from shadow memory.  Note that I128 loads are done by pretending
4db4a6
+      we're doing a V128 load, and then converting the resulting V128 vbits
4db4a6
+      word to an I128, right at the end of this function -- see `castedToI128`
4db4a6
+      below.  (It's only a minor hack :-) This pertains to bug 444399. */
4db4a6
    ty = shadowTypeV(ty);
4db4a6
 
4db4a6
    void*        helper           = NULL;
4db4a6
@@ -5511,6 +5514,7 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
4db4a6
                         hname = "MC_(helperc_LOADV256le)";
4db4a6
                         ret_via_outparam = True;
4db4a6
                         break;
4db4a6
+         case Ity_I128: // fallthrough.  See comment above.
4db4a6
          case Ity_V128: helper = &MC_(helperc_LOADV128le);
4db4a6
                         hname = "MC_(helperc_LOADV128le)";
4db4a6
                         ret_via_outparam = True;
4db4a6
@@ -5576,7 +5580,7 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
4db4a6
 
4db4a6
    /* We need to have a place to park the V bits we're just about to
4db4a6
       read. */
4db4a6
-   IRTemp datavbits = newTemp(mce, ty, VSh);
4db4a6
+   IRTemp datavbits = newTemp(mce, ty == Ity_I128 ? Ity_V128 : ty, VSh);
4db4a6
 
4db4a6
    /* Here's the call. */
4db4a6
    IRDirty* di;
4db4a6
@@ -5603,7 +5607,14 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
4db4a6
    }
4db4a6
    stmt( 'V', mce, IRStmt_Dirty(di) );
4db4a6
 
4db4a6
-   return mkexpr(datavbits);
4db4a6
+   if (ty == Ity_I128) {
4db4a6
+      IRAtom* castedToI128
4db4a6
+         = assignNew('V', mce, Ity_I128,
4db4a6
+                     unop(Iop_ReinterpV128asI128, mkexpr(datavbits)));
4db4a6
+      return castedToI128;
4db4a6
+   } else {
4db4a6
+      return mkexpr(datavbits);
4db4a6
+   }
4db4a6
 }
4db4a6
 
4db4a6
 
4db4a6
@@ -5631,6 +5642,7 @@ IRAtom* expr2vbits_Load ( MCEnv* mce,
4db4a6
       case Ity_I16:
4db4a6
       case Ity_I32:
4db4a6
       case Ity_I64:
4db4a6
+      case Ity_I128:
4db4a6
       case Ity_V128:
4db4a6
       case Ity_V256:
4db4a6
          return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
4db4a6
@@ -5928,6 +5940,7 @@ void do_shadow_Store ( MCEnv* mce,
4db4a6
                         c = IRConst_V256(V_BITS32_DEFINED); break;
4db4a6
          case Ity_V128: // V128 weirdness -- used twice
4db4a6
                         c = IRConst_V128(V_BITS16_DEFINED); break;
4db4a6
+         case Ity_I128: c = IRConst_U128(V_BITS16_DEFINED); break;
4db4a6
          case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
4db4a6
          case Ity_I32:  c = IRConst_U32 (V_BITS32_DEFINED); break;
4db4a6
          case Ity_I16:  c = IRConst_U16 (V_BITS16_DEFINED); break;
4db4a6
@@ -5948,6 +5961,7 @@ void do_shadow_Store ( MCEnv* mce,
4db4a6
       switch (ty) {
4db4a6
          case Ity_V256: /* we'll use the helper four times */
4db4a6
          case Ity_V128: /* we'll use the helper twice */
4db4a6
+         case Ity_I128: /* we'll use the helper twice */
4db4a6
          case Ity_I64: helper = &MC_(helperc_STOREV64le);
4db4a6
                        hname = "MC_(helperc_STOREV64le)";
4db4a6
                        break;
4db4a6
@@ -6051,9 +6065,9 @@ void do_shadow_Store ( MCEnv* mce,
4db4a6
       stmt( 'V', mce, IRStmt_Dirty(diQ3) );
4db4a6
 
4db4a6
    } 
4db4a6
-   else if (UNLIKELY(ty == Ity_V128)) {
4db4a6
+   else if (UNLIKELY(ty == Ity_V128 || ty == Ity_I128)) {
4db4a6
 
4db4a6
-      /* V128-bit case */
4db4a6
+      /* V128/I128-bit case */
4db4a6
       /* See comment in next clause re 64-bit regparms */
4db4a6
       /* also, need to be careful about endianness */
4db4a6
 
4db4a6
@@ -6062,6 +6076,7 @@ void do_shadow_Store ( MCEnv* mce,
4db4a6
       IRAtom  *addrLo64, *addrHi64;
4db4a6
       IRAtom  *vdataLo64, *vdataHi64;
4db4a6
       IRAtom  *eBiasLo64, *eBiasHi64;
4db4a6
+      IROp    opGetLO64,  opGetHI64;
4db4a6
 
4db4a6
       if (end == Iend_LE) {
4db4a6
          offLo64 = 0;
4db4a6
@@ -6071,9 +6086,17 @@ void do_shadow_Store ( MCEnv* mce,
4db4a6
          offHi64 = 0;
4db4a6
       }
4db4a6
 
4db4a6
+      if (ty == Ity_V128) {
4db4a6
+         opGetLO64 = Iop_V128to64;
4db4a6
+         opGetHI64 = Iop_V128HIto64;
4db4a6
+      } else {
4db4a6
+         opGetLO64 = Iop_128to64;
4db4a6
+         opGetHI64 = Iop_128HIto64;
4db4a6
+      }
4db4a6
+
4db4a6
       eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
4db4a6
       addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
4db4a6
-      vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
4db4a6
+      vdataLo64 = assignNew('V', mce, Ity_I64, unop(opGetLO64, vdata));
4db4a6
       diLo64    = unsafeIRDirty_0_N( 
4db4a6
                      1/*regparms*/, 
4db4a6
                      hname, VG_(fnptr_to_fnentry)( helper ), 
4db4a6
@@ -6081,7 +6104,7 @@ void do_shadow_Store ( MCEnv* mce,
4db4a6
                   );
4db4a6
       eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
4db4a6
       addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
4db4a6
-      vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
4db4a6
+      vdataHi64 = assignNew('V', mce, Ity_I64, unop(opGetHI64, vdata));
4db4a6
       diHi64    = unsafeIRDirty_0_N( 
4db4a6
                      1/*regparms*/, 
4db4a6
                      hname, VG_(fnptr_to_fnentry)( helper ), 
4db4a6
@@ -6888,7 +6911,7 @@ static void do_shadow_LLSC ( MCEnv*    mce,
4db4a6
       /* Just treat this as a normal load, followed by an assignment of
4db4a6
          the value to .result. */
4db4a6
       /* Stay sane */
4db4a6
-      tl_assert(resTy == Ity_I64 || resTy == Ity_I32
4db4a6
+      tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
4db4a6
                 || resTy == Ity_I16 || resTy == Ity_I8);
4db4a6
       assign( 'V', mce, resTmp,
4db4a6
                    expr2vbits_Load(
4db4a6
@@ -6899,7 +6922,7 @@ static void do_shadow_LLSC ( MCEnv*    mce,
4db4a6
       /* Stay sane */
4db4a6
       IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
4db4a6
                                    stStoredata);
4db4a6
-      tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32
4db4a6
+      tl_assert(dataTy == Ity_I128 || dataTy == Ity_I64 || dataTy == Ity_I32
4db4a6
                 || dataTy == Ity_I16 || dataTy == Ity_I8);
4db4a6
       do_shadow_Store( mce, stEnd,
4db4a6
                             stAddr, 0/* addr bias */,
4db4a6
@@ -7684,7 +7707,7 @@ static void schemeS ( MCEnv* mce, IRStmt* st )
4db4a6
                = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
4db4a6
             IRExpr* vanillaLoad
4db4a6
                = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
4db4a6
-            tl_assert(resTy == Ity_I64 || resTy == Ity_I32
4db4a6
+            tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
4db4a6
                       || resTy == Ity_I16 || resTy == Ity_I8);
4db4a6
             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
4db4a6
                               schemeE(mce, vanillaLoad));
4db4a6
diff --git a/memcheck/tests/Makefile.am b/memcheck/tests/Makefile.am
4db4a6
index 449710020..2b43ef7d7 100644
4db4a6
--- a/memcheck/tests/Makefile.am
4db4a6
+++ b/memcheck/tests/Makefile.am
4db4a6
@@ -90,6 +90,7 @@ EXTRA_DIST = \
4db4a6
 	addressable.stderr.exp addressable.stdout.exp addressable.vgtest \
4db4a6
 	atomic_incs.stderr.exp atomic_incs.vgtest \
4db4a6
 	atomic_incs.stdout.exp-32bit atomic_incs.stdout.exp-64bit \
4db4a6
+	atomic_incs.stdout.exp-64bit-and-128bit \
4db4a6
 	badaddrvalue.stderr.exp \
4db4a6
 	badaddrvalue.stdout.exp badaddrvalue.vgtest \
4db4a6
         exit_on_first_error.stderr.exp \
4db4a6
diff --git a/memcheck/tests/atomic_incs.c b/memcheck/tests/atomic_incs.c
4db4a6
index f931750f4..1c738c530 100644
4db4a6
--- a/memcheck/tests/atomic_incs.c
4db4a6
+++ b/memcheck/tests/atomic_incs.c
4db4a6
@@ -22,6 +22,17 @@
4db4a6
 #define NNN 3456987
4db4a6
 
4db4a6
 #define IS_8_ALIGNED(_ptr)   (0 == (((unsigned long)(_ptr)) & 7))
4db4a6
+#define IS_16_ALIGNED(_ptr)  (0 == (((unsigned long)(_ptr)) & 15))
4db4a6
+
4db4a6
+// U128 from libvex_basictypes.h is a 4-x-UInt array, which is a bit
4db4a6
+// inconvenient, hence:
4db4a6
+typedef
4db4a6
+   struct {
4db4a6
+      // assuming little-endianness
4db4a6
+      unsigned long long int lo64;
4db4a6
+      unsigned long long int hi64;
4db4a6
+   }
4db4a6
+   MyU128;
4db4a6
 
4db4a6
 
4db4a6
 __attribute__((noinline)) void atomic_add_8bit ( char* p, int n ) 
4db4a6
@@ -712,6 +723,40 @@ __attribute__((noinline)) void atomic_add_64bit ( long long int* p, int n )
4db4a6
 #endif
4db4a6
 }
4db4a6
 
4db4a6
+__attribute__((noinline)) void atomic_add_128bit ( MyU128* p,
4db4a6
+                                                   unsigned long long int n )
4db4a6
+{
4db4a6
+#if defined(VGA_x86) || defined(VGA_ppc32) || defined(VGA_mips32) \
4db4a6
+    || defined (VGA_nanomips) || defined(VGA_mips64) \
4db4a6
+    || defined(VGA_amd64) \
4db4a6
+    || defined(VGA_ppc64be) || defined(VGA_ppc64le) \
4db4a6
+    || defined(VGA_arm) \
4db4a6
+    || defined(VGA_s390x)
4db4a6
+   /* do nothing; is not supported */
4db4a6
+#elif defined(VGA_arm64)
4db4a6
+   unsigned long long int block[3]
4db4a6
+      = { (unsigned long long int)p, (unsigned long long int)n,
4db4a6
+          0xFFFFFFFFFFFFFFFFULL};
4db4a6
+   do {
4db4a6
+      __asm__ __volatile__(
4db4a6
+         "mov   x5, %0"             "\n\t" // &block[0]
4db4a6
+         "ldr   x9, [x5, #0]"       "\n\t" // p
4db4a6
+         "ldr   x10, [x5, #8]"      "\n\t" // n
4db4a6
+         "ldxp  x7, x8, [x9]"       "\n\t"
4db4a6
+         "adds  x7, x7, x10"        "\n\t"
4db4a6
+         "adc   x8, x8, xzr"        "\n\t"
4db4a6
+         "stxp  w4, x7, x8, [x9]"   "\n\t"
4db4a6
+         "str   x4, [x5, #16]"      "\n\t"
4db4a6
+         : /*out*/
4db4a6
+         : /*in*/ "r"(&block[0])
4db4a6
+         : /*trash*/ "memory", "cc", "x5", "x7", "x8", "x9", "x10", "x4"
4db4a6
+      );
4db4a6
+   } while (block[2] != 0);
4db4a6
+#else
4db4a6
+# error "Unsupported arch"
4db4a6
+#endif
4db4a6
+}
4db4a6
+
4db4a6
 int main ( int argc, char** argv )
4db4a6
 {
4db4a6
    int    i, status;
4db4a6
@@ -720,8 +765,12 @@ int main ( int argc, char** argv )
4db4a6
    short* p16;
4db4a6
    int*   p32;
4db4a6
    long long int* p64;
4db4a6
+   MyU128*  p128;
4db4a6
    pid_t  child, p2;
4db4a6
 
4db4a6
+   assert(sizeof(MyU128) == 16);
4db4a6
+   assert(sysconf(_SC_PAGESIZE) >= 4096);
4db4a6
+
4db4a6
    printf("parent, pre-fork\n");
4db4a6
 
4db4a6
    page = mmap( 0, sysconf(_SC_PAGESIZE),
4db4a6
@@ -736,11 +785,13 @@ int main ( int argc, char** argv )
4db4a6
    p16 = (short*)(page+256);
4db4a6
    p32 = (int*)(page+512);
4db4a6
    p64 = (long long int*)(page+768);
4db4a6
+   p128 = (MyU128*)(page+1024);
4db4a6
 
4db4a6
    assert( IS_8_ALIGNED(p8) );
4db4a6
    assert( IS_8_ALIGNED(p16) );
4db4a6
    assert( IS_8_ALIGNED(p32) );
4db4a6
    assert( IS_8_ALIGNED(p64) );
4db4a6
+   assert( IS_16_ALIGNED(p128) );
4db4a6
 
4db4a6
    memset(page, 0, 1024);
4db4a6
 
4db4a6
@@ -748,6 +799,7 @@ int main ( int argc, char** argv )
4db4a6
    *p16 = 0;
4db4a6
    *p32 = 0;
4db4a6
    *p64 = 0;
4db4a6
+   p128->lo64 = p128->hi64 = 0;
4db4a6
 
4db4a6
    child = fork();
4db4a6
    if (child == -1) {
4db4a6
@@ -763,6 +815,7 @@ int main ( int argc, char** argv )
4db4a6
          atomic_add_16bit(p16, 1);
4db4a6
          atomic_add_32bit(p32, 1);
4db4a6
          atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
4db4a6
+         atomic_add_128bit(p128, 0x1000000013374771ULL); // ditto re upper 64
4db4a6
       }
4db4a6
       return 1;
4db4a6
       /* NOTREACHED */
4db4a6
@@ -778,6 +831,7 @@ int main ( int argc, char** argv )
4db4a6
       atomic_add_16bit(p16, 1);
4db4a6
       atomic_add_32bit(p32, 1);
4db4a6
       atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
4db4a6
+      atomic_add_128bit(p128, 0x1000000013374771ULL); // ditto re upper 64
4db4a6
    }
4db4a6
 
4db4a6
    p2 = waitpid(child, &status, 0);
4db4a6
@@ -788,11 +842,17 @@ int main ( int argc, char** argv )
4db4a6
 
4db4a6
    printf("FINAL VALUES:  8 bit %d,  16 bit %d,  32 bit %d,  64 bit %lld\n",
4db4a6
           (int)(*(signed char*)p8), (int)(*p16), *p32, *p64 );
4db4a6
+   printf("               128 bit 0x%016llx:0x%016llx\n",
4db4a6
+          p128->hi64, p128->lo64);
4db4a6
 
4db4a6
    if (-74 == (int)(*(signed char*)p8) 
4db4a6
        && 32694 == (int)(*p16) 
4db4a6
        && 6913974 == *p32
4db4a6
-       && (0LL == *p64 || 682858642110LL == *p64)) {
4db4a6
+       && (0LL == *p64 || 682858642110LL == *p64)
4db4a6
+       && ((0 == p128->hi64 && 0 == p128->lo64)
4db4a6
+           || (0x00000000000697fb == p128->hi64
4db4a6
+               && 0x6007eb426316d956ULL == p128->lo64))
4db4a6
+      ) {
4db4a6
       printf("PASS\n");
4db4a6
    } else {
4db4a6
       printf("FAIL -- see source code for expected values\n");
4db4a6
diff --git a/memcheck/tests/atomic_incs.stdout.exp-32bit b/memcheck/tests/atomic_incs.stdout.exp-32bit
4db4a6
index c5b8781e5..55e5044b5 100644
4db4a6
--- a/memcheck/tests/atomic_incs.stdout.exp-32bit
4db4a6
+++ b/memcheck/tests/atomic_incs.stdout.exp-32bit
4db4a6
@@ -3,5 +3,6 @@ child
4db4a6
 parent, pre-fork
4db4a6
 parent
4db4a6
 FINAL VALUES:  8 bit -74,  16 bit 32694,  32 bit 6913974,  64 bit 0
4db4a6
+               128 bit 0x0000000000000000:0x0000000000000000
4db4a6
 PASS
4db4a6
 parent exits
4db4a6
diff --git a/memcheck/tests/atomic_incs.stdout.exp-64bit b/memcheck/tests/atomic_incs.stdout.exp-64bit
4db4a6
index 82405c520..ca2f4fc97 100644
4db4a6
--- a/memcheck/tests/atomic_incs.stdout.exp-64bit
4db4a6
+++ b/memcheck/tests/atomic_incs.stdout.exp-64bit
4db4a6
@@ -3,5 +3,6 @@ child
4db4a6
 parent, pre-fork
4db4a6
 parent
4db4a6
 FINAL VALUES:  8 bit -74,  16 bit 32694,  32 bit 6913974,  64 bit 682858642110
4db4a6
+               128 bit 0x0000000000000000:0x0000000000000000
4db4a6
 PASS
4db4a6
 parent exits
4db4a6
diff --git a/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit b/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit
4db4a6
new file mode 100644
4db4a6
index 000000000..ef6580917
4db4a6
--- /dev/null
4db4a6
+++ b/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit
4db4a6
@@ -0,0 +1,8 @@
4db4a6
+parent, pre-fork
4db4a6
+child
4db4a6
+parent, pre-fork
4db4a6
+parent
4db4a6
+FINAL VALUES:  8 bit -74,  16 bit 32694,  32 bit 6913974,  64 bit 682858642110
4db4a6
+               128 bit 0x00000000000697fb:0x6007eb426316d956
4db4a6
+PASS
4db4a6
+parent exits
4db4a6
diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am
4db4a6
index 00cbfa52c..9efb49b27 100644
4db4a6
--- a/none/tests/arm64/Makefile.am
4db4a6
+++ b/none/tests/arm64/Makefile.am
4db4a6
@@ -12,7 +12,10 @@ EXTRA_DIST = \
4db4a6
 	atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \
4db4a6
 	simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \
4db4a6
         fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \
4db4a6
-	fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp fp_and_simd_v82.vgtest
4db4a6
+	fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \
4db4a6
+	fp_and_simd_v82.vgtest \
4db4a6
+	ldxp_stxp.stdout.exp ldxp_stxp.stderr.exp \
4db4a6
+	ldxp_stxp_basisimpl.vgtest ldxp_stxp_fallbackimpl.vgtest
4db4a6
 
4db4a6
 check_PROGRAMS = \
4db4a6
 	allexec \
4db4a6
@@ -20,7 +23,8 @@ check_PROGRAMS = \
4db4a6
 	fp_and_simd \
4db4a6
 	integer \
4db4a6
 	memory \
4db4a6
-	fmadd_sub
4db4a6
+	fmadd_sub \
4db4a6
+	ldxp_stxp
4db4a6
 
4db4a6
 if BUILD_ARMV8_CRC_TESTS
4db4a6
   check_PROGRAMS += crc32
4db4a6
diff --git a/none/tests/arm64/ldxp_stxp.c b/none/tests/arm64/ldxp_stxp.c
4db4a6
new file mode 100644
4db4a6
index 000000000..b5f6ea121
4db4a6
--- /dev/null
4db4a6
+++ b/none/tests/arm64/ldxp_stxp.c
4db4a6
@@ -0,0 +1,93 @@
4db4a6
+
4db4a6
+/* Note, this is only a basic smoke test of LD{A}XP and ST{L}XP.  Their
4db4a6
+   atomicity properties are tested by memcheck/tests/atomic_incs.c. */
4db4a6
+
4db4a6
+#include <stdio.h>
4db4a6
+#include <stdlib.h>
4db4a6
+#include <malloc.h>
4db4a6
+#include <assert.h>
4db4a6
+
4db4a6
+typedef  unsigned int            UInt;
4db4a6
+typedef  unsigned long long int  ULong;
4db4a6
+
4db4a6
+
4db4a6
+void initBlock ( ULong* block )
4db4a6
+{
4db4a6
+   block[0] = 0x0001020304050607ULL;
4db4a6
+   block[1] = 0x1011121314151617ULL;
4db4a6
+   block[2] = 0x2021222324252627ULL;
4db4a6
+   block[3] = 0x3031323334353637ULL;
4db4a6
+   block[4] = 0x4041424344454647ULL;
4db4a6
+   block[5] = 0x5051525354555657ULL;
4db4a6
+}
4db4a6
+
4db4a6
+void printBlock ( const char* who,
4db4a6
+                  ULong* block, ULong rt1contents, ULong rt2contents,
4db4a6
+                  UInt zeroIfSuccess )
4db4a6
+{
4db4a6
+   printf("Block %s (%s)\n", who, zeroIfSuccess == 0 ? "success" : "FAILURE" );
4db4a6
+   for (int i = 0; i < 6; i++) {
4db4a6
+      printf("0x%016llx\n", block[i]);
4db4a6
+   }
4db4a6
+   printf("0x%016llx rt1contents\n", rt1contents);
4db4a6
+   printf("0x%016llx rt2contents\n", rt2contents);
4db4a6
+   printf("\n");
4db4a6
+}
4db4a6
+
4db4a6
+int main ( void )
4db4a6
+{
4db4a6
+   ULong* block = memalign(16, 6 * sizeof(ULong));
4db4a6
+   assert(block);
4db4a6
+
4db4a6
+   ULong rt1in, rt2in, rt1out, rt2out;
4db4a6
+   UInt scRes;
4db4a6
+
4db4a6
+   // Do ldxp then stxp with x-registers
4db4a6
+   initBlock(block);
4db4a6
+   rt1in  = 0x5555666677778888ULL;
4db4a6
+   rt2in  = 0xAAAA9999BBBB0000ULL;
4db4a6
+   rt1out = 0x1111222233334444ULL;
4db4a6
+   rt2out = 0xFFFFEEEEDDDDCCCCULL;
4db4a6
+   scRes  = 0x55555555;
4db4a6
+   __asm__ __volatile__(
4db4a6
+      "ldxp %1, %2, [%5]"       "\n\t"
4db4a6
+      "stxp %w0, %3, %4, [%5]"  "\n\t"
4db4a6
+      : /*OUT*/
4db4a6
+        "=&r"(scRes),  // %0
4db4a6
+        "=&r"(rt1out), // %1
4db4a6
+        "=&r"(rt2out)  // %2
4db4a6
+      : /*IN*/
4db4a6
+        "r"(rt1in),    // %3
4db4a6
+        "r"(rt2in),    // %4
4db4a6
+        "r"(&block[2]) // %5
4db4a6
+      : /*TRASH*/
4db4a6
+        "memory","cc"
4db4a6
+   );
4db4a6
+   printBlock("after ldxp/stxp 2x64-bit", block, rt1out, rt2out, scRes);
4db4a6
+
4db4a6
+   // Do ldxp then stxp with w-registers
4db4a6
+   initBlock(block);
4db4a6
+   rt1in  = 0x5555666677778888ULL;
4db4a6
+   rt2in  = 0xAAAA9999BBBB0000ULL;
4db4a6
+   rt1out = 0x1111222233334444ULL;
4db4a6
+   rt2out = 0xFFFFEEEEDDDDCCCCULL;
4db4a6
+   scRes  = 0x55555555;
4db4a6
+   __asm__ __volatile__(
4db4a6
+      "ldxp %w1, %w2, [%5]"       "\n\t"
4db4a6
+      "stxp %w0, %w3, %w4, [%5]"  "\n\t"
4db4a6
+      : /*OUT*/
4db4a6
+        "=&r"(scRes),  // %0
4db4a6
+        "=&r"(rt1out), // %1
4db4a6
+        "=&r"(rt2out)  // %2
4db4a6
+      : /*IN*/
4db4a6
+        "r"(rt1in),    // %3
4db4a6
+        "r"(rt2in),    // %4
4db4a6
+        "r"(&block[2]) // %5
4db4a6
+      : /*TRASH*/
4db4a6
+        "memory","cc"
4db4a6
+   );
4db4a6
+   printBlock("after ldxp/stxp 2x32-bit", block, rt1out, rt2out, scRes);
4db4a6
+
4db4a6
+   free(block);
4db4a6
+   return 0;
4db4a6
+}
4db4a6
diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.stderr.exp b/none/tests/arm64/ldxp_stxp_basisimpl.stderr.exp
4db4a6
new file mode 100644
4db4a6
index 000000000..e69de29bb
4db4a6
diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp b/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp
4db4a6
new file mode 100644
4db4a6
index 000000000..f269ecdcc
4db4a6
--- /dev/null
4db4a6
+++ b/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp
4db4a6
@@ -0,0 +1,20 @@
4db4a6
+Block after ldxp/stxp 2x64-bit (success)
4db4a6
+0x0001020304050607
4db4a6
+0x1011121314151617
4db4a6
+0x5555666677778888
4db4a6
+0xaaaa9999bbbb0000
4db4a6
+0x4041424344454647
4db4a6
+0x5051525354555657
4db4a6
+0x2021222324252627 rt1contents
4db4a6
+0x3031323334353637 rt2contents
4db4a6
+
4db4a6
+Block after ldxp/stxp 2x32-bit (success)
4db4a6
+0x0001020304050607
4db4a6
+0x1011121314151617
4db4a6
+0xbbbb000077778888
4db4a6
+0x3031323334353637
4db4a6
+0x4041424344454647
4db4a6
+0x5051525354555657
4db4a6
+0x0000000024252627 rt1contents
4db4a6
+0x0000000020212223 rt2contents
4db4a6
+
4db4a6
diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.vgtest b/none/tests/arm64/ldxp_stxp_basisimpl.vgtest
4db4a6
new file mode 100644
4db4a6
index 000000000..29133729a
4db4a6
--- /dev/null
4db4a6
+++ b/none/tests/arm64/ldxp_stxp_basisimpl.vgtest
4db4a6
@@ -0,0 +1,2 @@
4db4a6
+prog: ldxp_stxp
4db4a6
+vgopts: -q
4db4a6
diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.stderr.exp b/none/tests/arm64/ldxp_stxp_fallbackimpl.stderr.exp
4db4a6
new file mode 100644
4db4a6
index 000000000..e69de29bb
4db4a6
diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp b/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp
4db4a6
new file mode 100644
4db4a6
index 000000000..f269ecdcc
4db4a6
--- /dev/null
4db4a6
+++ b/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp
4db4a6
@@ -0,0 +1,20 @@
4db4a6
+Block after ldxp/stxp 2x64-bit (success)
4db4a6
+0x0001020304050607
4db4a6
+0x1011121314151617
4db4a6
+0x5555666677778888
4db4a6
+0xaaaa9999bbbb0000
4db4a6
+0x4041424344454647
4db4a6
+0x5051525354555657
4db4a6
+0x2021222324252627 rt1contents
4db4a6
+0x3031323334353637 rt2contents
4db4a6
+
4db4a6
+Block after ldxp/stxp 2x32-bit (success)
4db4a6
+0x0001020304050607
4db4a6
+0x1011121314151617
4db4a6
+0xbbbb000077778888
4db4a6
+0x3031323334353637
4db4a6
+0x4041424344454647
4db4a6
+0x5051525354555657
4db4a6
+0x0000000024252627 rt1contents
4db4a6
+0x0000000020212223 rt2contents
4db4a6
+
4db4a6
diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest b/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest
4db4a6
new file mode 100644
4db4a6
index 000000000..474282a03
4db4a6
--- /dev/null
4db4a6
+++ b/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest
4db4a6
@@ -0,0 +1,2 @@
4db4a6
+prog: ldxp_stxp
4db4a6
+vgopts: -q --sim-hints=fallback-llsc
4db4a6
4db4a6
commit 0d38ca5dd6b446c70738031132d41f09de0f7a8a
4db4a6
Author: Julian Seward <jseward@acm.org>
4db4a6
Date:   Fri Nov 12 13:08:45 2021 +0100
4db4a6
4db4a6
    Bug 444399 - disInstr(arm64): unhandled instruction 0xC87F2D89 (LD{,A}XP and ST{,L}XP).  FOLLOWUP FIX.
4db4a6
    
4db4a6
    This is an attempt to un-break 'make dist', as broken by the main commit for
4db4a6
    this bug, which was 530df882b8f60ecacaf2b9b8a719f7ea1c1d1650.
4db4a6
4db4a6
diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am
4db4a6
index 9efb49b27..4a06f0996 100644
4db4a6
--- a/none/tests/arm64/Makefile.am
4db4a6
+++ b/none/tests/arm64/Makefile.am
4db4a6
@@ -14,8 +14,10 @@ EXTRA_DIST = \
4db4a6
         fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \
4db4a6
 	fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \
4db4a6
 	fp_and_simd_v82.vgtest \
4db4a6
-	ldxp_stxp.stdout.exp ldxp_stxp.stderr.exp \
4db4a6
-	ldxp_stxp_basisimpl.vgtest ldxp_stxp_fallbackimpl.vgtest
4db4a6
+	ldxp_stxp_basisimpl.stdout.exp ldxp_stxp_basisimpl.stderr.exp \
4db4a6
+	ldxp_stxp_basisimpl.vgtest \
4db4a6
+	ldxp_stxp_fallbackimpl.stdout.exp ldxp_stxp_fallbackimpl.stderr.exp \
4db4a6
+	ldxp_stxp_fallbackimpl.vgtest
4db4a6
 
4db4a6
 check_PROGRAMS = \
4db4a6
 	allexec \