6e98bf
commit 530df882b8f60ecacaf2b9b8a719f7ea1c1d1650
6e98bf
Author: Julian Seward <jseward@acm.org>
6e98bf
Date:   Fri Nov 12 12:13:45 2021 +0100
6e98bf
6e98bf
    Bug 444399 - disInstr(arm64): unhandled instruction 0xC87F2D89 (LD{,A}XP and ST{,L}XP).
6e98bf
    
6e98bf
    This is unfortunately a big and complex patch, to implement LD{,A}XP and
6e98bf
    ST{,L}XP.  These were omitted from the original AArch64 v8.0 implementation
6e98bf
    for unknown reasons.
6e98bf
    
6e98bf
    (Background) the patch is made significantly more complex because for AArch64
6e98bf
    we actually have two implementations of the underlying
6e98bf
    Load-Linked/Store-Conditional (LL/SC) machinery: a "primary" implementation,
6e98bf
    which translates LL/SC more or less directly into IR and re-emits them at the
6e98bf
    back end, and a "fallback" implementation that implements LL/SC "manually", by
6e98bf
    taking advantage of the fact that V serialises thread execution, so we can
6e98bf
    "implement" LL/SC by simulating a reservation using fields LLSC_* in the guest
6e98bf
    state, and invalidating the reservation at every thread switch.
6e98bf
    
6e98bf
    (Background) the fallback scheme is needed because the primary scheme is in
6e98bf
    violation of the ARMv8 semantics in that it can (easily) introduce extra
6e98bf
    memory references between the LL and SC, hence on some hardware causing the
6e98bf
    reservation to always fail and so the simulated program to wind up looping
6e98bf
    forever.
6e98bf
    
6e98bf
    For these instructions, big picture:
6e98bf
    
6e98bf
    * for the primary implementation, we take advantage of the fact that
6e98bf
      IRStmt_LLSC allows I128-typed transactions to be represented.  Hence we bundle
6e98bf
      up the two 64-bit data elements into an I128 (or vice versa) and present a
6e98bf
      single I128-typed IRStmt_LLSC in the IR.  In the backend, those are
6e98bf
      re-emitted as LDXP/STXP respectively.  For LL/SC on 32-bit register pairs,
6e98bf
      that bundling produces a single 64-bit item, and so the existing LL/SC
6e98bf
      backend machinery handles it.  The effect is that a doubleword 32-bit LL/SC
6e98bf
      in the front end translates into a single 64-bit LL/SC in the back end.
6e98bf
      Overall, though, the implementation is straightforward.
6e98bf
    
6e98bf
    * for the fallback implementation, it is necessary to extend the guest state
6e98bf
      field `guest_LLSC_DATA` to represent a 128-bit transaction, by splitting it
6e98bf
      into _DATA_LO64 and _DATA_HI64.  Then, the implementation is an exact
6e98bf
      analogue of the fallback implementation for single-word LL/SC.  It takes
6e98bf
      advantage of the fact that the backend already supports 128-bit CAS, as
6e98bf
      fixed in bug 445354.  As with the primary implementation, doubleword 32-bit
6e98bf
      LL/SC is bundled into a single 64-bit transaction.
6e98bf
    
6e98bf
    Detailed changes:
6e98bf
    
6e98bf
    * new arm64 guest state fields LLSC_DATA_LO64/LLSC_DATA_HI64 to replace
6e98bf
      guest_LLSC_DATA
6e98bf
    
6e98bf
    * (ridealong fix) arm64 front end: a fix to a minor and harmless decoding bug
6e98bf
      for the single-word LDX/STX case.
6e98bf
    
6e98bf
    * arm64 front end: IR generation for LD{,A}XP/ST{,L}XP: tedious and
6e98bf
      longwinded, but per comments above, an exact(ish) analogue of the singleword
6e98bf
      case
6e98bf
    
6e98bf
    * arm64 backend: new insns ARM64Instr_LdrEXP / ARM64Instr_StrEXP to wrap up 2
6e98bf
      x 64 exclusive loads/stores.  Per comments above, there's no need to handle
6e98bf
      the 2 x 32 case.
6e98bf
    
6e98bf
    * arm64 isel: translate I128-typed IRStmt_LLSC into the above two insns
6e98bf
    
6e98bf
    * arm64 isel: some auxiliary bits and pieces needed to handle I128 values;
6e98bf
      this is standard doubleword isel stuff
6e98bf
    
6e98bf
    * arm64 isel: (ridealong fix): Ist_CAS: check for endianness of the CAS!
6e98bf
    
6e98bf
    * arm64 isel: (ridealong) a couple of formatting fixes
6e98bf
    
6e98bf
    * IR infrastructure: add support for I128 constants, done the same as V128
6e98bf
      constants
6e98bf
    
6e98bf
    * memcheck: handle shadow loads and stores for I128 values
6e98bf
    
6e98bf
    * testcase: memcheck/tests/atomic_incs.c: on arm64, also test 128-bit atomic
6e98bf
      addition, to check we really have atomicity right
6e98bf
    
6e98bf
    * testcase: new test none/tests/arm64/ldxp_stxp.c, tests operation but not
6e98bf
      atomicity.  (Smoke test).
6e98bf
6e98bf
diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
6e98bf
index 12a1c5978..ee018c6a9 100644
6e98bf
--- a/VEX/priv/guest_arm64_toIR.c
6e98bf
+++ b/VEX/priv/guest_arm64_toIR.c
6e98bf
@@ -1184,9 +1184,10 @@ static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
6e98bf
 #define OFFB_CMSTART  offsetof(VexGuestARM64State,guest_CMSTART)
6e98bf
 #define OFFB_CMLEN    offsetof(VexGuestARM64State,guest_CMLEN)
6e98bf
 
6e98bf
-#define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE)
6e98bf
-#define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR)
6e98bf
-#define OFFB_LLSC_DATA offsetof(VexGuestARM64State,guest_LLSC_DATA)
6e98bf
+#define OFFB_LLSC_SIZE      offsetof(VexGuestARM64State,guest_LLSC_SIZE)
6e98bf
+#define OFFB_LLSC_ADDR      offsetof(VexGuestARM64State,guest_LLSC_ADDR)
6e98bf
+#define OFFB_LLSC_DATA_LO64 offsetof(VexGuestARM64State,guest_LLSC_DATA_LO64)
6e98bf
+#define OFFB_LLSC_DATA_HI64 offsetof(VexGuestARM64State,guest_LLSC_DATA_HI64)
6e98bf
 
6e98bf
 
6e98bf
 /* ---------------- Integer registers ---------------- */
6e98bf
@@ -6652,7 +6653,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
6e98bf
         (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while()
6e98bf
          has to do this bit)
6e98bf
    */   
6e98bf
-   if (INSN(29,23) == BITS7(0,0,1,0,0,0,0)
6e98bf
+   if (INSN(29,24) == BITS6(0,0,1,0,0,0)
6e98bf
        && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
6e98bf
        && INSN(14,10) == BITS5(1,1,1,1,1)) {
6e98bf
       UInt szBlg2     = INSN(31,30);
6e98bf
@@ -6678,7 +6679,8 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
6e98bf
             // if it faults.
6e98bf
             IRTemp loaded_data64 = newTemp(Ity_I64);
6e98bf
             assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea))));
6e98bf
-            stmt( IRStmt_Put( OFFB_LLSC_DATA, mkexpr(loaded_data64) ));
6e98bf
+            stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, mkexpr(loaded_data64) ));
6e98bf
+            stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, mkU64(0) ));
6e98bf
             stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
6e98bf
             stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) ));
6e98bf
             putIReg64orZR(tt, mkexpr(loaded_data64));
6e98bf
@@ -6729,7 +6731,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
6e98bf
             ));
6e98bf
             // Fail if the data doesn't match the LL data
6e98bf
             IRTemp llsc_data64 = newTemp(Ity_I64);
6e98bf
-            assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA, Ity_I64));
6e98bf
+            assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64));
6e98bf
             stmt( IRStmt_Exit(
6e98bf
                       binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))),
6e98bf
                                          mkexpr(llsc_data64)),
6e98bf
@@ -6771,6 +6773,257 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
6e98bf
       /* else fall through */
6e98bf
    }
6e98bf
 
6e98bf
+   /* -------------------- LD{,A}XP -------------------- */
6e98bf
+   /* -------------------- ST{,L}XP -------------------- */
6e98bf
+   /* 31 30 29     23  20    15 14  9  4
6e98bf
+       1 sz 001000 011 11111 0  t2  n  t1   LDXP  Rt1, Rt2, [Xn|SP]
6e98bf
+       1 sz 001000 011 11111 1  t2  n  t1   LDAXP Rt1, Rt2, [Xn|SP]
6e98bf
+       1 sz 001000 001 s     0  t2  n  t1   STXP  Ws, Rt1, Rt2, [Xn|SP]
6e98bf
+       1 sz 001000 001 s     1  t2  n  t1   STLXP Ws, Rt1, Rt2, [Xn|SP]
6e98bf
+   */
6e98bf
+   /* See just above, "LD{,A}X{R,RH,RB} / ST{,L}X{R,RH,RB}", for detailed
6e98bf
+      comments about this implementation.  Note the 'sz' field here is only 1
6e98bf
+      bit; above, it is 2 bits, and has a different encoding.
6e98bf
+   */
6e98bf
+   if (INSN(31,31) == 1
6e98bf
+       && INSN(29,24) == BITS6(0,0,1,0,0,0)
6e98bf
+       && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,1)) {
6e98bf
+      Bool elemIs64   = INSN(30,30) == 1;
6e98bf
+      Bool isLD       = INSN(22,22) == 1;
6e98bf
+      Bool isAcqOrRel = INSN(15,15) == 1;
6e98bf
+      UInt ss         = INSN(20,16);
6e98bf
+      UInt tt2        = INSN(14,10);
6e98bf
+      UInt nn         = INSN(9,5);
6e98bf
+      UInt tt1        = INSN(4,0);
6e98bf
+
6e98bf
+      UInt   elemSzB = elemIs64 ? 8 : 4;
6e98bf
+      UInt   fullSzB = 2 * elemSzB;
6e98bf
+      IRType elemTy  = integerIRTypeOfSize(elemSzB);
6e98bf
+      IRType fullTy  = integerIRTypeOfSize(fullSzB);
6e98bf
+
6e98bf
+      IRTemp ea = newTemp(Ity_I64);
6e98bf
+      assign(ea, getIReg64orSP(nn));
6e98bf
+      /* FIXME generate check that ea is 2*elemSzB-aligned */
6e98bf
+
6e98bf
+      if (isLD && ss == BITS5(1,1,1,1,1)) {
6e98bf
+         if (abiinfo->guest__use_fallback_LLSC) {
6e98bf
+            // Fallback implementation of LL.
6e98bf
+            // Do the load first so we don't update any guest state if it
6e98bf
+            // faults.  Assumes little-endian guest.
6e98bf
+            if (fullTy == Ity_I64) {
6e98bf
+               vassert(elemSzB == 4);
6e98bf
+               IRTemp loaded_data64 = newTemp(Ity_I64);
6e98bf
+               assign(loaded_data64, loadLE(fullTy, mkexpr(ea)));
6e98bf
+               stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, mkexpr(loaded_data64) ));
6e98bf
+               stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, mkU64(0) ));
6e98bf
+               stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
6e98bf
+               stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(8) ));
6e98bf
+               putIReg64orZR(tt1, unop(Iop_32Uto64,
6e98bf
+                                       unop(Iop_64to32,
6e98bf
+                                            mkexpr(loaded_data64))));
6e98bf
+               putIReg64orZR(tt2, unop(Iop_32Uto64,
6e98bf
+                                       unop(Iop_64HIto32,
6e98bf
+                                            mkexpr(loaded_data64))));
6e98bf
+            } else {
6e98bf
+               vassert(elemSzB == 8 && fullTy == Ity_I128);
6e98bf
+               IRTemp loaded_data128 = newTemp(Ity_I128);
6e98bf
+               // Hack: do the load as V128 rather than I128 so as to avoid
6e98bf
+               // having to implement I128 loads in the arm64 back end.
6e98bf
+               assign(loaded_data128, unop(Iop_ReinterpV128asI128,
6e98bf
+                                           loadLE(Ity_V128, mkexpr(ea))));
6e98bf
+               IRTemp loaded_data_lo64 = newTemp(Ity_I64);
6e98bf
+               IRTemp loaded_data_hi64 = newTemp(Ity_I64);
6e98bf
+               assign(loaded_data_lo64, unop(Iop_128to64,
6e98bf
+                                             mkexpr(loaded_data128)));
6e98bf
+               assign(loaded_data_hi64, unop(Iop_128HIto64,
6e98bf
+                                             mkexpr(loaded_data128)));
6e98bf
+               stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64,
6e98bf
+                                 mkexpr(loaded_data_lo64) ));
6e98bf
+               stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64,
6e98bf
+                                 mkexpr(loaded_data_hi64) ));
6e98bf
+               stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
6e98bf
+               stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(16) ));
6e98bf
+               putIReg64orZR(tt1, mkexpr(loaded_data_lo64));
6e98bf
+               putIReg64orZR(tt2, mkexpr(loaded_data_hi64));
6e98bf
+            }
6e98bf
+         } else {
6e98bf
+            // Non-fallback implementation of LL.
6e98bf
+            IRTemp res = newTemp(fullTy); // I64 or I128
6e98bf
+            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
6e98bf
+            // Assuming a little-endian guest here.  Rt1 goes at the lower
6e98bf
+            // address, so it must live in the least significant half of `res`.
6e98bf
+            IROp opGetLO = fullTy == Ity_I128 ? Iop_128to64   : Iop_64to32;
6e98bf
+            IROp opGetHI = fullTy == Ity_I128 ? Iop_128HIto64 : Iop_64HIto32;
6e98bf
+            putIReg64orZR(tt1, widenUto64(elemTy, unop(opGetLO, mkexpr(res))));
6e98bf
+            putIReg64orZR(tt2, widenUto64(elemTy, unop(opGetHI, mkexpr(res))));
6e98bf
+         }
6e98bf
+         if (isAcqOrRel) {
6e98bf
+            stmt(IRStmt_MBE(Imbe_Fence));
6e98bf
+         }
6e98bf
+         DIP("ld%sxp %s, %s, [%s] %s\n",
6e98bf
+             isAcqOrRel ? (isLD ? "a" : "l") : "",
6e98bf
+             nameIRegOrZR(elemSzB == 8, tt1),
6e98bf
+             nameIRegOrZR(elemSzB == 8, tt2),
6e98bf
+             nameIReg64orSP(nn),
6e98bf
+             abiinfo->guest__use_fallback_LLSC
6e98bf
+                ? "(fallback implementation)" : "");
6e98bf
+         return True;
6e98bf
+      }
6e98bf
+      if (!isLD) {
6e98bf
+         if (isAcqOrRel) {
6e98bf
+            stmt(IRStmt_MBE(Imbe_Fence));
6e98bf
+         }
6e98bf
+         if (abiinfo->guest__use_fallback_LLSC) {
6e98bf
+            // Fallback implementation of SC.
6e98bf
+            // This is really ugly, since we don't have any way to do
6e98bf
+            // proper if-then-else.  First, set up as if the SC failed,
6e98bf
+            // and jump forwards if it really has failed.
6e98bf
+
6e98bf
+            // Continuation address
6e98bf
+            IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4);
6e98bf
+
6e98bf
+            // "the SC failed".  Any non-zero value means failure.
6e98bf
+            putIReg64orZR(ss, mkU64(1));
6e98bf
+
6e98bf
+            IRTemp tmp_LLsize = newTemp(Ity_I64);
6e98bf
+            assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64));
6e98bf
+            stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction"
6e98bf
+            ));
6e98bf
+            // Fail if no or wrong-size transaction
6e98bf
+            vassert((fullSzB == 8 && fullTy == Ity_I64)
6e98bf
+                    || (fullSzB == 16 && fullTy == Ity_I128));
6e98bf
+            stmt( IRStmt_Exit(
6e98bf
+                     binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(fullSzB)),
6e98bf
+                     Ijk_Boring, nia, OFFB_PC
6e98bf
+            ));
6e98bf
+            // Fail if the address doesn't match the LL address
6e98bf
+            stmt( IRStmt_Exit(
6e98bf
+                      binop(Iop_CmpNE64, mkexpr(ea),
6e98bf
+                                         IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)),
6e98bf
+                      Ijk_Boring, nia, OFFB_PC
6e98bf
+            ));
6e98bf
+            // The data to be stored.
6e98bf
+            IRTemp store_data = newTemp(fullTy);
6e98bf
+            if (fullTy == Ity_I64) {
6e98bf
+               assign(store_data,
6e98bf
+                      binop(Iop_32HLto64,
6e98bf
+                            narrowFrom64(Ity_I32, getIReg64orZR(tt2)),
6e98bf
+                            narrowFrom64(Ity_I32, getIReg64orZR(tt1))));
6e98bf
+            } else {
6e98bf
+               assign(store_data,
6e98bf
+                      binop(Iop_64HLto128,
6e98bf
+                            getIReg64orZR(tt2), getIReg64orZR(tt1)));
6e98bf
+            }
6e98bf
+
6e98bf
+            if (fullTy == Ity_I64) {
6e98bf
+               // 64 bit (2x32 bit) path
6e98bf
+               // Fail if the data in memory doesn't match the data stashed by
6e98bf
+               // the LL.
6e98bf
+               IRTemp llsc_data_lo64 = newTemp(Ity_I64);
6e98bf
+               assign(llsc_data_lo64,
6e98bf
+                      IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64));
6e98bf
+               stmt( IRStmt_Exit(
6e98bf
+                         binop(Iop_CmpNE64, loadLE(Ity_I64, mkexpr(ea)),
6e98bf
+                                            mkexpr(llsc_data_lo64)),
6e98bf
+                      Ijk_Boring, nia, OFFB_PC
6e98bf
+               ));
6e98bf
+               // Try to CAS the new value in.
6e98bf
+               IRTemp old = newTemp(Ity_I64);
6e98bf
+               IRTemp expd = newTemp(Ity_I64);
6e98bf
+               assign(expd, mkexpr(llsc_data_lo64));
6e98bf
+               stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
6e98bf
+                                        Iend_LE, mkexpr(ea),
6e98bf
+                                        /*expdHi*/NULL, mkexpr(expd),
6e98bf
+                                        /*dataHi*/NULL, mkexpr(store_data)
6e98bf
+               )));
6e98bf
+               // Fail if the CAS failed (viz, old != expd)
6e98bf
+               stmt( IRStmt_Exit(
6e98bf
+                         binop(Iop_CmpNE64, mkexpr(old), mkexpr(expd)),
6e98bf
+                         Ijk_Boring, nia, OFFB_PC
6e98bf
+               ));
6e98bf
+            } else {
6e98bf
+               // 128 bit (2x64 bit) path
6e98bf
+               // Fail if the data in memory doesn't match the data stashed by
6e98bf
+               // the LL.
6e98bf
+               IRTemp llsc_data_lo64 = newTemp(Ity_I64);
6e98bf
+               assign(llsc_data_lo64,
6e98bf
+                      IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64));
6e98bf
+               IRTemp llsc_data_hi64 = newTemp(Ity_I64);
6e98bf
+               assign(llsc_data_hi64,
6e98bf
+                      IRExpr_Get(OFFB_LLSC_DATA_HI64, Ity_I64));
6e98bf
+               IRTemp data_at_ea = newTemp(Ity_I128);
6e98bf
+               assign(data_at_ea,
6e98bf
+                      unop(Iop_ReinterpV128asI128,
6e98bf
+                           loadLE(Ity_V128, mkexpr(ea))));
6e98bf
+               stmt( IRStmt_Exit(
6e98bf
+                        binop(Iop_CmpNE64,
6e98bf
+                              unop(Iop_128to64, mkexpr(data_at_ea)),
6e98bf
+                              mkexpr(llsc_data_lo64)),
6e98bf
+                        Ijk_Boring, nia, OFFB_PC
6e98bf
+               ));
6e98bf
+               stmt( IRStmt_Exit(
6e98bf
+                        binop(Iop_CmpNE64,
6e98bf
+                              unop(Iop_128HIto64, mkexpr(data_at_ea)),
6e98bf
+                              mkexpr(llsc_data_hi64)),
6e98bf
+                        Ijk_Boring, nia, OFFB_PC
6e98bf
+               ));
6e98bf
+               // Try to CAS the new value in.
6e98bf
+               IRTemp old_lo64 = newTemp(Ity_I64);
6e98bf
+               IRTemp old_hi64 = newTemp(Ity_I64);
6e98bf
+               IRTemp expd_lo64 = newTemp(Ity_I64);
6e98bf
+               IRTemp expd_hi64 = newTemp(Ity_I64);
6e98bf
+               IRTemp store_data_lo64 = newTemp(Ity_I64);
6e98bf
+               IRTemp store_data_hi64 = newTemp(Ity_I64);
6e98bf
+               assign(expd_lo64, mkexpr(llsc_data_lo64));
6e98bf
+               assign(expd_hi64, mkexpr(llsc_data_hi64));
6e98bf
+               assign(store_data_lo64, unop(Iop_128to64, mkexpr(store_data)));
6e98bf
+               assign(store_data_hi64, unop(Iop_128HIto64, mkexpr(store_data)));
6e98bf
+               stmt( IRStmt_CAS(mkIRCAS(old_hi64, old_lo64,
6e98bf
+                                        Iend_LE, mkexpr(ea),
6e98bf
+                                        mkexpr(expd_hi64), mkexpr(expd_lo64),
6e98bf
+                                        mkexpr(store_data_hi64),
6e98bf
+                                        mkexpr(store_data_lo64)
6e98bf
+               )));
6e98bf
+               // Fail if the CAS failed (viz, old != expd)
6e98bf
+               stmt( IRStmt_Exit(
6e98bf
+                        binop(Iop_CmpNE64, mkexpr(old_lo64), mkexpr(expd_lo64)),
6e98bf
+                        Ijk_Boring, nia, OFFB_PC
6e98bf
+               ));
6e98bf
+               stmt( IRStmt_Exit(
6e98bf
+                        binop(Iop_CmpNE64, mkexpr(old_hi64), mkexpr(expd_hi64)),
6e98bf
+                        Ijk_Boring, nia, OFFB_PC
6e98bf
+               ));
6e98bf
+            }
6e98bf
+            // Otherwise we succeeded (!)
6e98bf
+            putIReg64orZR(ss, mkU64(0));
6e98bf
+         } else {
6e98bf
+            // Non-fallback implementation of SC.
6e98bf
+            IRTemp  res     = newTemp(Ity_I1);
6e98bf
+            IRExpr* dataLO  = narrowFrom64(elemTy, getIReg64orZR(tt1));
6e98bf
+            IRExpr* dataHI  = narrowFrom64(elemTy, getIReg64orZR(tt2));
6e98bf
+            IROp    opMerge = fullTy == Ity_I128 ? Iop_64HLto128 : Iop_32HLto64;
6e98bf
+            IRExpr* data    = binop(opMerge, dataHI, dataLO);
6e98bf
+            // Assuming a little-endian guest here.  Rt1 goes at the lower
6e98bf
+            // address, so it must live in the least significant half of `data`.
6e98bf
+            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
6e98bf
+            /* IR semantics: res is 1 if store succeeds, 0 if it fails.
6e98bf
+               Need to set rS to 1 on failure, 0 on success. */
6e98bf
+            putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
6e98bf
+                                               mkU64(1)));
6e98bf
+         }
6e98bf
+         DIP("st%sxp %s, %s, %s, [%s] %s\n",
6e98bf
+             isAcqOrRel ? (isLD ? "a" : "l") : "",
6e98bf
+             nameIRegOrZR(False, ss),
6e98bf
+             nameIRegOrZR(elemSzB == 8, tt1),
6e98bf
+             nameIRegOrZR(elemSzB == 8, tt2),
6e98bf
+             nameIReg64orSP(nn),
6e98bf
+             abiinfo->guest__use_fallback_LLSC
6e98bf
+                ? "(fallback implementation)" : "");
6e98bf
+         return True;
6e98bf
+      }
6e98bf
+      /* else fall through */
6e98bf
+   }
6e98bf
+
6e98bf
    /* ------------------ LDA{R,RH,RB} ------------------ */
6e98bf
    /* ------------------ STL{R,RH,RB} ------------------ */
6e98bf
    /* 31 29     23  20      14    9 4
6e98bf
diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c
6e98bf
index 5657bcab9..b65e27db4 100644
6e98bf
--- a/VEX/priv/host_arm64_defs.c
6e98bf
+++ b/VEX/priv/host_arm64_defs.c
6e98bf
@@ -1059,6 +1059,16 @@ ARM64Instr* ARM64Instr_StrEX ( Int szB ) {
6e98bf
    vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
6e98bf
    return i;
6e98bf
 }
6e98bf
+ARM64Instr* ARM64Instr_LdrEXP ( void ) {
6e98bf
+   ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
6e98bf
+   i->tag        = ARM64in_LdrEXP;
6e98bf
+   return i;
6e98bf
+}
6e98bf
+ARM64Instr* ARM64Instr_StrEXP ( void ) {
6e98bf
+   ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
6e98bf
+   i->tag        = ARM64in_StrEXP;
6e98bf
+   return i;
6e98bf
+}
6e98bf
 ARM64Instr* ARM64Instr_CAS ( Int szB ) {
6e98bf
    ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
6e98bf
    i->tag             = ARM64in_CAS;
6e98bf
@@ -1699,12 +1709,19 @@ void ppARM64Instr ( const ARM64Instr* i ) {
6e98bf
                     sz, i->ARM64in.StrEX.szB == 8 ? 'x' : 'w');
6e98bf
          return;
6e98bf
       }
6e98bf
+      case ARM64in_LdrEXP:
6e98bf
+         vex_printf("ldxp   x2, x3, [x4]");
6e98bf
+         return;
6e98bf
+      case ARM64in_StrEXP:
6e98bf
+         vex_printf("stxp   w0, x2, x3, [x4]");
6e98bf
+         return;
6e98bf
       case ARM64in_CAS: {
6e98bf
          vex_printf("x1 = cas(%dbit)(x3, x5 -> x7)", 8 * i->ARM64in.CAS.szB);
6e98bf
          return;
6e98bf
       }
6e98bf
       case ARM64in_CASP: {
6e98bf
-         vex_printf("x0,x1 = casp(%dbit)(x2, x4,x5 -> x6,x7)", 8 * i->ARM64in.CASP.szB);
6e98bf
+         vex_printf("x0,x1 = casp(2x%dbit)(x2, x4,x5 -> x6,x7)",
6e98bf
+                    8 * i->ARM64in.CASP.szB);
6e98bf
          return;
6e98bf
       }
6e98bf
       case ARM64in_MFence:
6e98bf
@@ -2253,6 +2270,17 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 )
6e98bf
          addHRegUse(u, HRmWrite, hregARM64_X0());
6e98bf
          addHRegUse(u, HRmRead, hregARM64_X2());
6e98bf
          return;
6e98bf
+      case ARM64in_LdrEXP:
6e98bf
+         addHRegUse(u, HRmRead, hregARM64_X4());
6e98bf
+         addHRegUse(u, HRmWrite, hregARM64_X2());
6e98bf
+         addHRegUse(u, HRmWrite, hregARM64_X3());
6e98bf
+         return;
6e98bf
+      case ARM64in_StrEXP:
6e98bf
+         addHRegUse(u, HRmRead, hregARM64_X4());
6e98bf
+         addHRegUse(u, HRmWrite, hregARM64_X0());
6e98bf
+         addHRegUse(u, HRmRead, hregARM64_X2());
6e98bf
+         addHRegUse(u, HRmRead, hregARM64_X3());
6e98bf
+         return;
6e98bf
       case ARM64in_CAS:
6e98bf
          addHRegUse(u, HRmRead, hregARM64_X3());
6e98bf
          addHRegUse(u, HRmRead, hregARM64_X5());
6e98bf
@@ -2571,6 +2599,10 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 )
6e98bf
          return;
6e98bf
       case ARM64in_StrEX:
6e98bf
          return;
6e98bf
+      case ARM64in_LdrEXP:
6e98bf
+         return;
6e98bf
+      case ARM64in_StrEXP:
6e98bf
+         return;
6e98bf
       case ARM64in_CAS:
6e98bf
          return;
6e98bf
       case ARM64in_CASP:
6e98bf
@@ -4167,6 +4199,16 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
6e98bf
          }
6e98bf
          goto bad;
6e98bf
       }
6e98bf
+      case ARM64in_LdrEXP: {
6e98bf
+         // 820C7FC8   ldxp x2, x3, [x4]
6e98bf
+         *p++ = 0xC87F0C82;
6e98bf
+         goto done;
6e98bf
+      }
6e98bf
+      case ARM64in_StrEXP: {
6e98bf
+         // 820C20C8   stxp w0, x2, x3, [x4]
6e98bf
+         *p++ = 0xC8200C82;
6e98bf
+         goto done;
6e98bf
+      }
6e98bf
       case ARM64in_CAS: {
6e98bf
          /* This isn't simple.  For an explanation see the comment in
6e98bf
             host_arm64_defs.h on the definition of ARM64Instr case CAS.
6e98bf
diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h
6e98bf
index 01fb5708e..dc686dff7 100644
6e98bf
--- a/VEX/priv/host_arm64_defs.h
6e98bf
+++ b/VEX/priv/host_arm64_defs.h
6e98bf
@@ -509,8 +509,10 @@ typedef
6e98bf
       ARM64in_AddToSP,     /* move SP by small, signed constant */
6e98bf
       ARM64in_FromSP,      /* move SP to integer register */
6e98bf
       ARM64in_Mul,
6e98bf
-      ARM64in_LdrEX,
6e98bf
-      ARM64in_StrEX,
6e98bf
+      ARM64in_LdrEX,       /* load exclusive, single register */
6e98bf
+      ARM64in_StrEX,       /* store exclusive, single register */
6e98bf
+      ARM64in_LdrEXP,      /* load exclusive, register pair, 2x64-bit only */
6e98bf
+      ARM64in_StrEXP,      /* store exclusive, register pair, 2x64-bit only */
6e98bf
       ARM64in_CAS,
6e98bf
       ARM64in_CASP,
6e98bf
       ARM64in_MFence,
6e98bf
@@ -719,6 +721,12 @@ typedef
6e98bf
          struct {
6e98bf
             Int  szB; /* 1, 2, 4 or 8 */
6e98bf
          } StrEX;
6e98bf
+         /* LDXP x2, x3, [x4].  This is 2x64-bit only. */
6e98bf
+         struct {
6e98bf
+         } LdrEXP;
6e98bf
+         /* STXP w0, x2, x3, [x4].  This is 2x64-bit only. */
6e98bf
+         struct {
6e98bf
+         } StrEXP;
6e98bf
          /* x1 = CAS(x3(addr), x5(expected) -> x7(new)),
6e98bf
             and trashes x8
6e98bf
             where x1[8*szB-1 : 0] == x5[8*szB-1 : 0] indicates success,
6e98bf
@@ -1037,6 +1045,8 @@ extern ARM64Instr* ARM64Instr_Mul     ( HReg dst, HReg argL, HReg argR,
6e98bf
                                         ARM64MulOp op );
6e98bf
 extern ARM64Instr* ARM64Instr_LdrEX   ( Int szB );
6e98bf
 extern ARM64Instr* ARM64Instr_StrEX   ( Int szB );
6e98bf
+extern ARM64Instr* ARM64Instr_LdrEXP  ( void );
6e98bf
+extern ARM64Instr* ARM64Instr_StrEXP  ( void );
6e98bf
 extern ARM64Instr* ARM64Instr_CAS     ( Int szB );
6e98bf
 extern ARM64Instr* ARM64Instr_CASP    ( Int szB );
6e98bf
 extern ARM64Instr* ARM64Instr_MFence  ( void );
6e98bf
diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c
6e98bf
index 4b1d8c846..094e7e74b 100644
6e98bf
--- a/VEX/priv/host_arm64_isel.c
6e98bf
+++ b/VEX/priv/host_arm64_isel.c
6e98bf
@@ -196,9 +196,9 @@ static HReg        iselCondCode_R        ( ISelEnv* env, IRExpr* e );
6e98bf
 static HReg        iselIntExpr_R_wrk     ( ISelEnv* env, IRExpr* e );
6e98bf
 static HReg        iselIntExpr_R         ( ISelEnv* env, IRExpr* e );
6e98bf
 
6e98bf
-static void        iselInt128Expr_wrk    ( /*OUT*/HReg* rHi, HReg* rLo, 
6e98bf
+static void        iselInt128Expr_wrk    ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
6e98bf
                                            ISelEnv* env, IRExpr* e );
6e98bf
-static void        iselInt128Expr        ( /*OUT*/HReg* rHi, HReg* rLo, 
6e98bf
+static void        iselInt128Expr        ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
6e98bf
                                            ISelEnv* env, IRExpr* e );
6e98bf
 
6e98bf
 static HReg        iselDblExpr_wrk        ( ISelEnv* env, IRExpr* e );
6e98bf
@@ -1759,9 +1759,12 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
6e98bf
 
6e98bf
       /* AND/OR/XOR(e1, e2) (for any e1, e2) */
6e98bf
       switch (e->Iex.Binop.op) {
6e98bf
-         case Iop_And64: case Iop_And32: lop = ARM64lo_AND; goto log_binop;
6e98bf
-         case Iop_Or64:  case Iop_Or32:  case Iop_Or16: lop = ARM64lo_OR;  goto log_binop;
6e98bf
-         case Iop_Xor64: case Iop_Xor32: lop = ARM64lo_XOR; goto log_binop;
6e98bf
+         case Iop_And64: case Iop_And32:
6e98bf
+            lop = ARM64lo_AND; goto log_binop;
6e98bf
+         case Iop_Or64:  case Iop_Or32:  case Iop_Or16:
6e98bf
+            lop = ARM64lo_OR;  goto log_binop;
6e98bf
+         case Iop_Xor64: case Iop_Xor32:
6e98bf
+            lop = ARM64lo_XOR; goto log_binop;
6e98bf
          log_binop: {
6e98bf
             HReg      dst  = newVRegI(env);
6e98bf
             HReg      argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
6e98bf
@@ -2013,6 +2016,11 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
6e98bf
             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
6e98bf
             return rHi; /* and abandon rLo */
6e98bf
          }
6e98bf
+         case Iop_128to64: {
6e98bf
+            HReg rHi, rLo;
6e98bf
+            iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
6e98bf
+            return rLo; /* and abandon rHi */
6e98bf
+         }
6e98bf
          case Iop_8Sto32: case Iop_8Sto64: {
6e98bf
             IRExpr* arg = e->Iex.Unop.arg;
6e98bf
             HReg    src = iselIntExpr_R(env, arg);
6e98bf
@@ -2185,13 +2193,19 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
6e98bf
             }
6e98bf
             return dst;
6e98bf
          }
6e98bf
+         case Iop_64HIto32: {
6e98bf
+            HReg dst = newVRegI(env);
6e98bf
+            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
6e98bf
+            addInstr(env, ARM64Instr_Shift(dst, src, ARM64RI6_I6(32),
6e98bf
+                                           ARM64sh_SHR));
6e98bf
+            return dst;
6e98bf
+         }
6e98bf
          case Iop_64to32:
6e98bf
          case Iop_64to16:
6e98bf
          case Iop_64to8:
6e98bf
          case Iop_32to16:
6e98bf
             /* These are no-ops. */
6e98bf
             return iselIntExpr_R(env, e->Iex.Unop.arg);
6e98bf
-
6e98bf
          default:
6e98bf
             break;
6e98bf
       }
6e98bf
@@ -2335,6 +2349,43 @@ static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
6e98bf
    vassert(e);
6e98bf
    vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
6e98bf
 
6e98bf
+   /* --------- TEMP --------- */
6e98bf
+   if (e->tag == Iex_RdTmp) {
6e98bf
+      lookupIRTempPair(rHi, rLo, env, e->Iex.RdTmp.tmp);
6e98bf
+      return;
6e98bf
+   }
6e98bf
+
6e98bf
+   /* --------- CONST --------- */
6e98bf
+   if (e->tag == Iex_Const) {
6e98bf
+      IRConst* c = e->Iex.Const.con;
6e98bf
+      vassert(c->tag == Ico_U128);
6e98bf
+      if (c->Ico.U128 == 0) {
6e98bf
+         // The only case we need to handle (so far)
6e98bf
+         HReg zero = newVRegI(env);
6e98bf
+         addInstr(env, ARM64Instr_Imm64(zero, 0));
6e98bf
+         *rHi = *rLo = zero;
6e98bf
+         return;
6e98bf
+      }
6e98bf
+   }
6e98bf
+
6e98bf
+   /* --------- UNARY ops --------- */
6e98bf
+   if (e->tag == Iex_Unop) {
6e98bf
+      switch (e->Iex.Unop.op) {
6e98bf
+         case Iop_ReinterpV128asI128: {
6e98bf
+            HReg dstHi = newVRegI(env);
6e98bf
+            HReg dstLo = newVRegI(env);
6e98bf
+            HReg src    = iselV128Expr(env, e->Iex.Unop.arg);
6e98bf
+            addInstr(env, ARM64Instr_VXfromQ(dstHi, src, 1));
6e98bf
+            addInstr(env, ARM64Instr_VXfromQ(dstLo, src, 0));
6e98bf
+            *rHi = dstHi;
6e98bf
+            *rLo = dstLo;
6e98bf
+            return;
6e98bf
+         }
6e98bf
+         default:
6e98bf
+            break;
6e98bf
+      }
6e98bf
+   }
6e98bf
+
6e98bf
    /* --------- BINARY ops --------- */
6e98bf
    if (e->tag == Iex_Binop) {
6e98bf
       switch (e->Iex.Binop.op) {
6e98bf
@@ -4086,6 +4137,14 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
6e98bf
          addInstr(env, ARM64Instr_VMov(8/*yes, really*/, dst, src));
6e98bf
          return;
6e98bf
       }
6e98bf
+      if (ty == Ity_I128) {
6e98bf
+         HReg rHi, rLo, dstHi, dstLo;
6e98bf
+         iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
6e98bf
+         lookupIRTempPair( &dstHi, &dstLo, env, tmp);
6e98bf
+         addInstr(env, ARM64Instr_MovI(dstHi, rHi));
6e98bf
+         addInstr(env, ARM64Instr_MovI(dstLo, rLo));
6e98bf
+         return;
6e98bf
+      }
6e98bf
       if (ty == Ity_V128) {
6e98bf
          HReg src = iselV128Expr(env, stmt->Ist.WrTmp.data);
6e98bf
          HReg dst = lookupIRTemp(env, tmp);
6e98bf
@@ -4183,42 +4242,67 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
6e98bf
          /* LL */
6e98bf
          IRTemp res = stmt->Ist.LLSC.result;
6e98bf
          IRType ty  = typeOfIRTemp(env->type_env, res);
6e98bf
-         if (ty == Ity_I64 || ty == Ity_I32 
6e98bf
+         if (ty == Ity_I128 || ty == Ity_I64 || ty == Ity_I32
6e98bf
              || ty == Ity_I16 || ty == Ity_I8) {
6e98bf
             Int  szB   = 0;
6e98bf
-            HReg r_dst = lookupIRTemp(env, res);
6e98bf
             HReg raddr = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
6e98bf
             switch (ty) {
6e98bf
-               case Ity_I8:  szB = 1; break;
6e98bf
-               case Ity_I16: szB = 2; break;
6e98bf
-               case Ity_I32: szB = 4; break;
6e98bf
-               case Ity_I64: szB = 8; break;
6e98bf
-               default:      vassert(0);
6e98bf
+               case Ity_I8:   szB = 1;  break;
6e98bf
+               case Ity_I16:  szB = 2;  break;
6e98bf
+               case Ity_I32:  szB = 4;  break;
6e98bf
+               case Ity_I64:  szB = 8;  break;
6e98bf
+               case Ity_I128: szB = 16; break;
6e98bf
+               default:       vassert(0);
6e98bf
+            }
6e98bf
+            if (szB == 16) {
6e98bf
+               HReg r_dstMSword = INVALID_HREG;
6e98bf
+               HReg r_dstLSword = INVALID_HREG;
6e98bf
+               lookupIRTempPair(&r_dstMSword, &r_dstLSword, env, res);
6e98bf
+               addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr));
6e98bf
+               addInstr(env, ARM64Instr_LdrEXP());
6e98bf
+               addInstr(env, ARM64Instr_MovI(r_dstLSword, hregARM64_X2()));
6e98bf
+               addInstr(env, ARM64Instr_MovI(r_dstMSword, hregARM64_X3()));
6e98bf
+            } else {
6e98bf
+               vassert(szB != 0);
6e98bf
+               HReg r_dst = lookupIRTemp(env, res);
6e98bf
+               addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr));
6e98bf
+               addInstr(env, ARM64Instr_LdrEX(szB));
6e98bf
+               addInstr(env, ARM64Instr_MovI(r_dst, hregARM64_X2()));
6e98bf
             }
6e98bf
-            addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr));
6e98bf
-            addInstr(env, ARM64Instr_LdrEX(szB));
6e98bf
-            addInstr(env, ARM64Instr_MovI(r_dst, hregARM64_X2()));
6e98bf
             return;
6e98bf
          }
6e98bf
          goto stmt_fail;
6e98bf
       } else {
6e98bf
          /* SC */
6e98bf
          IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.LLSC.storedata);
6e98bf
-         if (tyd == Ity_I64 || tyd == Ity_I32
6e98bf
+         if (tyd == Ity_I128 || tyd == Ity_I64 || tyd == Ity_I32
6e98bf
              || tyd == Ity_I16 || tyd == Ity_I8) {
6e98bf
             Int  szB = 0;
6e98bf
-            HReg rD  = iselIntExpr_R(env, stmt->Ist.LLSC.storedata);
6e98bf
             HReg rA  = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
6e98bf
             switch (tyd) {
6e98bf
-               case Ity_I8:  szB = 1; break;
6e98bf
-               case Ity_I16: szB = 2; break;
6e98bf
-               case Ity_I32: szB = 4; break;
6e98bf
-               case Ity_I64: szB = 8; break;
6e98bf
-               default:      vassert(0);
6e98bf
+               case Ity_I8:   szB = 1; break;
6e98bf
+               case Ity_I16:  szB = 2; break;
6e98bf
+               case Ity_I32:  szB = 4; break;
6e98bf
+               case Ity_I64:  szB = 8; break;
6e98bf
+               case Ity_I128: szB = 16; break;
6e98bf
+               default:       vassert(0);
6e98bf
+            }
6e98bf
+            if (szB == 16) {
6e98bf
+               HReg rD_MSword = INVALID_HREG;
6e98bf
+               HReg rD_LSword = INVALID_HREG;
6e98bf
+               iselInt128Expr(&rD_MSword,
6e98bf
+                              &rD_LSword, env, stmt->Ist.LLSC.storedata);
6e98bf
+               addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD_LSword));
6e98bf
+               addInstr(env, ARM64Instr_MovI(hregARM64_X3(), rD_MSword));
6e98bf
+               addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA));
6e98bf
+               addInstr(env, ARM64Instr_StrEXP());
6e98bf
+            } else {
6e98bf
+               vassert(szB != 0);
6e98bf
+               HReg rD  = iselIntExpr_R(env, stmt->Ist.LLSC.storedata);
6e98bf
+               addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD));
6e98bf
+               addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA));
6e98bf
+               addInstr(env, ARM64Instr_StrEX(szB));
6e98bf
             }
6e98bf
-            addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD));
6e98bf
-            addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA));
6e98bf
-            addInstr(env, ARM64Instr_StrEX(szB));
6e98bf
          } else {
6e98bf
             goto stmt_fail;
6e98bf
          }
6e98bf
@@ -4243,10 +4327,10 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
6e98bf
 
6e98bf
    /* --------- ACAS --------- */
6e98bf
    case Ist_CAS: {
6e98bf
-      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
6e98bf
+      IRCAS* cas = stmt->Ist.CAS.details;
6e98bf
+      if (cas->oldHi == IRTemp_INVALID && cas->end == Iend_LE) {
6e98bf
          /* "normal" singleton CAS */
6e98bf
          UChar  sz;
6e98bf
-         IRCAS* cas = stmt->Ist.CAS.details;
6e98bf
          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
6e98bf
          switch (ty) { 
6e98bf
             case Ity_I64: sz = 8; break;
6e98bf
@@ -4281,10 +4365,9 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
6e98bf
          addInstr(env, ARM64Instr_MovI(rOld, rResult));
6e98bf
          return;
6e98bf
       }
6e98bf
-      else {
6e98bf
+      if (cas->oldHi != IRTemp_INVALID && cas->end == Iend_LE) {
6e98bf
          /* Paired register CAS, i.e. CASP */
6e98bf
          UChar  sz;
6e98bf
-         IRCAS* cas = stmt->Ist.CAS.details;
6e98bf
          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
6e98bf
          switch (ty) {
6e98bf
             case Ity_I64: sz = 8; break;
6e98bf
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
6e98bf
index 25566c41c..2d82c41a1 100644
6e98bf
--- a/VEX/priv/ir_defs.c
6e98bf
+++ b/VEX/priv/ir_defs.c
6e98bf
@@ -76,6 +76,7 @@ void ppIRConst ( const IRConst* con )
6e98bf
       case Ico_U16:  vex_printf( "0x%x:I16",     (UInt)(con->Ico.U16)); break;
6e98bf
       case Ico_U32:  vex_printf( "0x%x:I32",     (UInt)(con->Ico.U32)); break;
6e98bf
       case Ico_U64:  vex_printf( "0x%llx:I64",   (ULong)(con->Ico.U64)); break;
6e98bf
+      case Ico_U128: vex_printf( "I128{0x%04x}", (UInt)(con->Ico.U128)); break;
6e98bf
       case Ico_F32:  u.f32 = con->Ico.F32;
6e98bf
                      vex_printf( "F32{0x%x}",   u.i32);
6e98bf
                      break;
6e98bf
@@ -2266,6 +2267,13 @@ IRConst* IRConst_U64 ( ULong u64 )
6e98bf
    c->Ico.U64 = u64;
6e98bf
    return c;
6e98bf
 }
6e98bf
+IRConst* IRConst_U128 ( UShort con ) /* Build a new Ico_U128 constant from the 16-bit value 'con'. */
6e98bf
+{
6e98bf
+   IRConst* c  = LibVEX_Alloc_inline(sizeof(IRConst));
6e98bf
+   c->tag      = Ico_U128;
6e98bf
+   c->Ico.U128 = con; /* stored verbatim; the Ico.U128 field is itself a UShort */
6e98bf
+   return c;
6e98bf
+}
6e98bf
 IRConst* IRConst_F32 ( Float f32 )
6e98bf
 {
6e98bf
    IRConst* c = LibVEX_Alloc_inline(sizeof(IRConst));
6e98bf
@@ -4230,6 +4238,7 @@ IRType typeOfIRConst ( const IRConst* con )
6e98bf
       case Ico_U16:   return Ity_I16;
6e98bf
       case Ico_U32:   return Ity_I32;
6e98bf
       case Ico_U64:   return Ity_I64;
6e98bf
+      case Ico_U128:  return Ity_I128;
6e98bf
       case Ico_F32:   return Ity_F32;
6e98bf
       case Ico_F32i:  return Ity_F32;
6e98bf
       case Ico_F64:   return Ity_F64;
6e98bf
@@ -5129,7 +5138,7 @@ void tcStmt ( const IRSB* bb, const IRStmt* stmt, IRType gWordTy )
6e98bf
          tyRes = typeOfIRTemp(tyenv, stmt->Ist.LLSC.result);
6e98bf
          if (stmt->Ist.LLSC.storedata == NULL) {
6e98bf
             /* it's a LL */
6e98bf
-            if (tyRes != Ity_I64 && tyRes != Ity_I32
6e98bf
+            if (tyRes != Ity_I128 && tyRes != Ity_I64 && tyRes != Ity_I32
6e98bf
                 && tyRes != Ity_I16 && tyRes != Ity_I8)
6e98bf
                sanityCheckFail(bb,stmt,"Ist.LLSC(LL).result :: bogus");
6e98bf
          } else {
6e98bf
@@ -5137,7 +5146,7 @@ void tcStmt ( const IRSB* bb, const IRStmt* stmt, IRType gWordTy )
6e98bf
             if (tyRes != Ity_I1)
6e98bf
                sanityCheckFail(bb,stmt,"Ist.LLSC(SC).result: not :: Ity_I1");
6e98bf
             tyData = typeOfIRExpr(tyenv, stmt->Ist.LLSC.storedata);
6e98bf
-            if (tyData != Ity_I64 && tyData != Ity_I32
6e98bf
+            if (tyData != Ity_I128 && tyData != Ity_I64 && tyData != Ity_I32
6e98bf
                 && tyData != Ity_I16 && tyData != Ity_I8)
6e98bf
                sanityCheckFail(bb,stmt,
6e98bf
                                "Ist.LLSC(SC).result :: storedata bogus");
6e98bf
@@ -5385,6 +5394,7 @@ Int sizeofIRType ( IRType ty )
6e98bf
 IRType integerIRTypeOfSize ( Int szB )
6e98bf
 {
6e98bf
    switch (szB) {
6e98bf
+      case 16: return Ity_I128;
6e98bf
       case 8: return Ity_I64;
6e98bf
       case 4: return Ity_I32;
6e98bf
       case 2: return Ity_I16;
6e98bf
diff --git a/VEX/pub/libvex_guest_arm64.h b/VEX/pub/libvex_guest_arm64.h
6e98bf
index 39b6ecdc2..91d06bd75 100644
6e98bf
--- a/VEX/pub/libvex_guest_arm64.h
6e98bf
+++ b/VEX/pub/libvex_guest_arm64.h
6e98bf
@@ -157,14 +157,18 @@ typedef
6e98bf
          note of bits 23 and 22. */
6e98bf
       UInt  guest_FPCR;
6e98bf
 
6e98bf
-      /* Fallback LL/SC support.  See bugs 344524 and 369459. */
6e98bf
-      ULong guest_LLSC_SIZE; // 0==no current transaction, else 1,2,4 or 8.
6e98bf
+      /* Fallback LL/SC support.  See bugs 344524 and 369459.  _LO64 and _HI64
6e98bf
+         hold the original contents of _ADDR+0 .. _ADDR+15, but only _SIZE
6e98bf
+         bytes of that range are recorded.  The remaining 16-_SIZE bytes of
6e98bf
+         the pair must be zero. */
6e98bf
+      ULong guest_LLSC_SIZE; // 0==no current transaction, else 1,2,4,8 or 16.
6e98bf
       ULong guest_LLSC_ADDR; // Address of transaction.
6e98bf
-      ULong guest_LLSC_DATA; // Original value at _ADDR, zero-extended.
6e98bf
+      ULong guest_LLSC_DATA_LO64; // Original value at _ADDR+0.
6e98bf
+      ULong guest_LLSC_DATA_HI64; // Original value at _ADDR+8.
6e98bf
 
6e98bf
       /* Padding to make it have an 16-aligned size */
6e98bf
       /* UInt  pad_end_0; */
6e98bf
-      ULong pad_end_1;
6e98bf
+      /* ULong pad_end_1; */
6e98bf
    }
6e98bf
    VexGuestARM64State;
6e98bf
 
6e98bf
diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h
6e98bf
index deaa044c1..85805bb69 100644
6e98bf
--- a/VEX/pub/libvex_ir.h
6e98bf
+++ b/VEX/pub/libvex_ir.h
6e98bf
@@ -269,6 +269,8 @@ typedef
6e98bf
       Ico_U16, 
6e98bf
       Ico_U32, 
6e98bf
       Ico_U64,
6e98bf
+      Ico_U128,  /* 128-bit restricted integer constant,
6e98bf
+                    same encoding scheme as V128 */
6e98bf
       Ico_F32,   /* 32-bit IEEE754 floating */
6e98bf
       Ico_F32i,  /* 32-bit unsigned int to be interpreted literally
6e98bf
                     as a IEEE754 single value. */
6e98bf
@@ -295,6 +297,7 @@ typedef
6e98bf
          UShort U16;
6e98bf
          UInt   U32;
6e98bf
          ULong  U64;
6e98bf
+         UShort U128;
6e98bf
          Float  F32;
6e98bf
          UInt   F32i;
6e98bf
          Double F64;
6e98bf
@@ -311,6 +314,7 @@ extern IRConst* IRConst_U8   ( UChar );
6e98bf
 extern IRConst* IRConst_U16  ( UShort );
6e98bf
 extern IRConst* IRConst_U32  ( UInt );
6e98bf
 extern IRConst* IRConst_U64  ( ULong );
6e98bf
+extern IRConst* IRConst_U128 ( UShort );
6e98bf
 extern IRConst* IRConst_F32  ( Float );
6e98bf
 extern IRConst* IRConst_F32i ( UInt );
6e98bf
 extern IRConst* IRConst_F64  ( Double );
6e98bf
diff --git a/memcheck/mc_machine.c b/memcheck/mc_machine.c
6e98bf
index 919c7fae8..176c8e5cb 100644
6e98bf
--- a/memcheck/mc_machine.c
6e98bf
+++ b/memcheck/mc_machine.c
6e98bf
@@ -1115,9 +1115,10 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB )
6e98bf
    if (o == GOF(CMSTART) && sz == 8) return -1; // untracked
6e98bf
    if (o == GOF(CMLEN)   && sz == 8) return -1; // untracked
6e98bf
 
6e98bf
-   if (o == GOF(LLSC_SIZE) && sz == 8) return -1; // untracked
6e98bf
-   if (o == GOF(LLSC_ADDR) && sz == 8) return o;
6e98bf
-   if (o == GOF(LLSC_DATA) && sz == 8) return o;
6e98bf
+   if (o == GOF(LLSC_SIZE)      && sz == 8) return -1; // untracked
6e98bf
+   if (o == GOF(LLSC_ADDR)      && sz == 8) return o;
6e98bf
+   if (o == GOF(LLSC_DATA_LO64) && sz == 8) return o;
6e98bf
+   if (o == GOF(LLSC_DATA_HI64) && sz == 8) return o;
6e98bf
 
6e98bf
    VG_(printf)("MC_(get_otrack_shadow_offset)(arm64)(off=%d,sz=%d)\n",
6e98bf
                offset,szB);
6e98bf
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
6e98bf
index c6fd2653f..72ccb3c8c 100644
6e98bf
--- a/memcheck/mc_translate.c
6e98bf
+++ b/memcheck/mc_translate.c
6e98bf
@@ -5497,8 +5497,11 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
6e98bf
       the address (shadow) to 'defined' following the test. */
6e98bf
    complainIfUndefined( mce, addr, guard );
6e98bf
 
6e98bf
-   /* Now cook up a call to the relevant helper function, to read the
6e98bf
-      data V bits from shadow memory. */
6e98bf
+   /* Now cook up a call to the relevant helper function, to read the data V
6e98bf
+      bits from shadow memory.  Note that I128 loads are done by pretending
6e98bf
+      we're doing a V128 load, and then converting the resulting V128 vbits
6e98bf
+      word to an I128, right at the end of this function -- see `castedToI128`
6e98bf
+      below.  (It's only a minor hack :-) This pertains to bug 444399. */
6e98bf
    ty = shadowTypeV(ty);
6e98bf
 
6e98bf
    void*        helper           = NULL;
6e98bf
@@ -5511,6 +5514,7 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
6e98bf
                         hname = "MC_(helperc_LOADV256le)";
6e98bf
                         ret_via_outparam = True;
6e98bf
                         break;
6e98bf
+         case Ity_I128: // fallthrough.  See comment above.
6e98bf
          case Ity_V128: helper = &MC_(helperc_LOADV128le);
6e98bf
                         hname = "MC_(helperc_LOADV128le)";
6e98bf
                         ret_via_outparam = True;
6e98bf
@@ -5576,7 +5580,7 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
6e98bf
 
6e98bf
    /* We need to have a place to park the V bits we're just about to
6e98bf
       read. */
6e98bf
-   IRTemp datavbits = newTemp(mce, ty, VSh);
6e98bf
+   IRTemp datavbits = newTemp(mce, ty == Ity_I128 ? Ity_V128 : ty, VSh);
6e98bf
 
6e98bf
    /* Here's the call. */
6e98bf
    IRDirty* di;
6e98bf
@@ -5603,7 +5607,14 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
6e98bf
    }
6e98bf
    stmt( 'V', mce, IRStmt_Dirty(di) );
6e98bf
 
6e98bf
-   return mkexpr(datavbits);
6e98bf
+   if (ty == Ity_I128) {
6e98bf
+      IRAtom* castedToI128
6e98bf
+         = assignNew('V', mce, Ity_I128,
6e98bf
+                     unop(Iop_ReinterpV128asI128, mkexpr(datavbits)));
6e98bf
+      return castedToI128;
6e98bf
+   } else {
6e98bf
+      return mkexpr(datavbits);
6e98bf
+   }
6e98bf
 }
6e98bf
 
6e98bf
 
6e98bf
@@ -5631,6 +5642,7 @@ IRAtom* expr2vbits_Load ( MCEnv* mce,
6e98bf
       case Ity_I16:
6e98bf
       case Ity_I32:
6e98bf
       case Ity_I64:
6e98bf
+      case Ity_I128:
6e98bf
       case Ity_V128:
6e98bf
       case Ity_V256:
6e98bf
          return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
6e98bf
@@ -5928,6 +5940,7 @@ void do_shadow_Store ( MCEnv* mce,
6e98bf
                         c = IRConst_V256(V_BITS32_DEFINED); break;
6e98bf
          case Ity_V128: // V128 weirdness -- used twice
6e98bf
                         c = IRConst_V128(V_BITS16_DEFINED); break;
6e98bf
+         case Ity_I128: c = IRConst_U128(V_BITS16_DEFINED); break;
6e98bf
          case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
6e98bf
          case Ity_I32:  c = IRConst_U32 (V_BITS32_DEFINED); break;
6e98bf
          case Ity_I16:  c = IRConst_U16 (V_BITS16_DEFINED); break;
6e98bf
@@ -5948,6 +5961,7 @@ void do_shadow_Store ( MCEnv* mce,
6e98bf
       switch (ty) {
6e98bf
          case Ity_V256: /* we'll use the helper four times */
6e98bf
          case Ity_V128: /* we'll use the helper twice */
6e98bf
+         case Ity_I128: /* we'll use the helper twice */
6e98bf
          case Ity_I64: helper = &MC_(helperc_STOREV64le);
6e98bf
                        hname = "MC_(helperc_STOREV64le)";
6e98bf
                        break;
6e98bf
@@ -6051,9 +6065,9 @@ void do_shadow_Store ( MCEnv* mce,
6e98bf
       stmt( 'V', mce, IRStmt_Dirty(diQ3) );
6e98bf
 
6e98bf
    } 
6e98bf
-   else if (UNLIKELY(ty == Ity_V128)) {
6e98bf
+   else if (UNLIKELY(ty == Ity_V128 || ty == Ity_I128)) {
6e98bf
 
6e98bf
-      /* V128-bit case */
6e98bf
+      /* V128/I128-bit case */
6e98bf
       /* See comment in next clause re 64-bit regparms */
6e98bf
       /* also, need to be careful about endianness */
6e98bf
 
6e98bf
@@ -6062,6 +6076,7 @@ void do_shadow_Store ( MCEnv* mce,
6e98bf
       IRAtom  *addrLo64, *addrHi64;
6e98bf
       IRAtom  *vdataLo64, *vdataHi64;
6e98bf
       IRAtom  *eBiasLo64, *eBiasHi64;
6e98bf
+      IROp    opGetLO64,  opGetHI64;
6e98bf
 
6e98bf
       if (end == Iend_LE) {
6e98bf
          offLo64 = 0;
6e98bf
@@ -6071,9 +6086,17 @@ void do_shadow_Store ( MCEnv* mce,
6e98bf
          offHi64 = 0;
6e98bf
       }
6e98bf
 
6e98bf
+      if (ty == Ity_V128) {
6e98bf
+         opGetLO64 = Iop_V128to64;
6e98bf
+         opGetHI64 = Iop_V128HIto64;
6e98bf
+      } else {
6e98bf
+         opGetLO64 = Iop_128to64;
6e98bf
+         opGetHI64 = Iop_128HIto64;
6e98bf
+      }
6e98bf
+
6e98bf
       eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
6e98bf
       addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
6e98bf
-      vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
6e98bf
+      vdataLo64 = assignNew('V', mce, Ity_I64, unop(opGetLO64, vdata));
6e98bf
       diLo64    = unsafeIRDirty_0_N( 
6e98bf
                      1/*regparms*/, 
6e98bf
                      hname, VG_(fnptr_to_fnentry)( helper ), 
6e98bf
@@ -6081,7 +6104,7 @@ void do_shadow_Store ( MCEnv* mce,
6e98bf
                   );
6e98bf
       eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
6e98bf
       addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
6e98bf
-      vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
6e98bf
+      vdataHi64 = assignNew('V', mce, Ity_I64, unop(opGetHI64, vdata));
6e98bf
       diHi64    = unsafeIRDirty_0_N( 
6e98bf
                      1/*regparms*/, 
6e98bf
                      hname, VG_(fnptr_to_fnentry)( helper ), 
6e98bf
@@ -6888,7 +6911,7 @@ static void do_shadow_LLSC ( MCEnv*    mce,
6e98bf
       /* Just treat this as a normal load, followed by an assignment of
6e98bf
          the value to .result. */
6e98bf
       /* Stay sane */
6e98bf
-      tl_assert(resTy == Ity_I64 || resTy == Ity_I32
6e98bf
+      tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
6e98bf
                 || resTy == Ity_I16 || resTy == Ity_I8);
6e98bf
       assign( 'V', mce, resTmp,
6e98bf
                    expr2vbits_Load(
6e98bf
@@ -6899,7 +6922,7 @@ static void do_shadow_LLSC ( MCEnv*    mce,
6e98bf
       /* Stay sane */
6e98bf
       IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
6e98bf
                                    stStoredata);
6e98bf
-      tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32
6e98bf
+      tl_assert(dataTy == Ity_I128 || dataTy == Ity_I64 || dataTy == Ity_I32
6e98bf
                 || dataTy == Ity_I16 || dataTy == Ity_I8);
6e98bf
       do_shadow_Store( mce, stEnd,
6e98bf
                             stAddr, 0/* addr bias */,
6e98bf
@@ -7684,7 +7707,7 @@ static void schemeS ( MCEnv* mce, IRStmt* st )
6e98bf
                = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
6e98bf
             IRExpr* vanillaLoad
6e98bf
                = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
6e98bf
-            tl_assert(resTy == Ity_I64 || resTy == Ity_I32
6e98bf
+            tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32
6e98bf
                       || resTy == Ity_I16 || resTy == Ity_I8);
6e98bf
             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
6e98bf
                               schemeE(mce, vanillaLoad));
6e98bf
diff --git a/memcheck/tests/Makefile.am b/memcheck/tests/Makefile.am
6e98bf
index 449710020..2b43ef7d7 100644
6e98bf
--- a/memcheck/tests/Makefile.am
6e98bf
+++ b/memcheck/tests/Makefile.am
6e98bf
@@ -90,6 +90,7 @@ EXTRA_DIST = \
6e98bf
 	addressable.stderr.exp addressable.stdout.exp addressable.vgtest \
6e98bf
 	atomic_incs.stderr.exp atomic_incs.vgtest \
6e98bf
 	atomic_incs.stdout.exp-32bit atomic_incs.stdout.exp-64bit \
6e98bf
+	atomic_incs.stdout.exp-64bit-and-128bit \
6e98bf
 	badaddrvalue.stderr.exp \
6e98bf
 	badaddrvalue.stdout.exp badaddrvalue.vgtest \
6e98bf
         exit_on_first_error.stderr.exp \
6e98bf
diff --git a/memcheck/tests/atomic_incs.c b/memcheck/tests/atomic_incs.c
6e98bf
index f931750f4..1c738c530 100644
6e98bf
--- a/memcheck/tests/atomic_incs.c
6e98bf
+++ b/memcheck/tests/atomic_incs.c
6e98bf
@@ -22,6 +22,17 @@
6e98bf
 #define NNN 3456987
6e98bf
 
6e98bf
 #define IS_8_ALIGNED(_ptr)   (0 == (((unsigned long)(_ptr)) & 7))
6e98bf
+#define IS_16_ALIGNED(_ptr)  (0 == (((unsigned long)(_ptr)) & 15))
6e98bf
+
6e98bf
+// U128 from libvex_basictypes.h is a 4-x-UInt array, which is a bit
6e98bf
+// inconvenient, hence:
6e98bf
+typedef
6e98bf
+   struct {
6e98bf
+      // assuming little-endianness
6e98bf
+      unsigned long long int lo64; // least-significant half; first member, so at offset 0
6e98bf
+      unsigned long long int hi64; // most-significant half; follows lo64 in memory
6e98bf
+   }
6e98bf
+   MyU128;
6e98bf
 
6e98bf
 
6e98bf
 __attribute__((noinline)) void atomic_add_8bit ( char* p, int n ) 
6e98bf
@@ -712,6 +723,40 @@ __attribute__((noinline)) void atomic_add_64bit ( long long int* p, int n )
6e98bf
 #endif
6e98bf
 }
6e98bf
 
6e98bf
+__attribute__((noinline)) void atomic_add_128bit ( MyU128* p,
6e98bf
+                                                   unsigned long long int n ) // Adds n to the 128-bit value at *p using an ldxp/stxp retry loop (arm64 only).
6e98bf
+{
6e98bf
+#if defined(VGA_x86) || defined(VGA_ppc32) || defined(VGA_mips32) \
6e98bf
+    || defined (VGA_nanomips) || defined(VGA_mips64) \
6e98bf
+    || defined(VGA_amd64) \
6e98bf
+    || defined(VGA_ppc64be) || defined(VGA_ppc64le) \
6e98bf
+    || defined(VGA_arm) \
6e98bf
+    || defined(VGA_s390x)
6e98bf
+   /* do nothing; 128-bit atomic add is not supported on these arches, so *p is
      left untouched */
6e98bf
+#elif defined(VGA_arm64)
6e98bf
+   unsigned long long int block[3] // [0] = p, [1] = n, [2] = store-exclusive status
6e98bf
+      = { (unsigned long long int)p, (unsigned long long int)n,
6e98bf
+          0xFFFFFFFFFFFFFFFFULL}; // nonzero initial status, overwritten by the asm below
6e98bf
+   do { // retry until the store-exclusive reports success (status 0)
6e98bf
+      __asm__ __volatile__(
6e98bf
+         "mov   x5, %0"             "\n\t" // &block[0]
6e98bf
+         "ldr   x9, [x5, #0]"       "\n\t" // p
6e98bf
+         "ldr   x10, [x5, #8]"      "\n\t" // n
6e98bf
+         "ldxp  x7, x8, [x9]"       "\n\t" // exclusive load: x7 = p->lo64, x8 = p->hi64
6e98bf
+         "adds  x7, x7, x10"        "\n\t" // lo64 += n, setting the carry flag
6e98bf
+         "adc   x8, x8, xzr"        "\n\t" // hi64 += carry out of the low half
6e98bf
+         "stxp  w4, x7, x8, [x9]"   "\n\t" // exclusive store of the pair; w4 = status
6e98bf
+         "str   x4, [x5, #16]"      "\n\t" // block[2] = status, tested by the loop
6e98bf
+         : /*out*/
6e98bf
+         : /*in*/ "r"(&block[0])
6e98bf
+         : /*trash*/ "memory", "cc", "x5", "x7", "x8", "x9", "x10", "x4"
6e98bf
+      );
6e98bf
+   } while (block[2] != 0);
6e98bf
+#else
6e98bf
+# error "Unsupported arch"
6e98bf
+#endif
6e98bf
+}
6e98bf
+
6e98bf
 int main ( int argc, char** argv )
6e98bf
 {
6e98bf
    int    i, status;
6e98bf
@@ -720,8 +765,12 @@ int main ( int argc, char** argv )
6e98bf
    short* p16;
6e98bf
    int*   p32;
6e98bf
    long long int* p64;
6e98bf
+   MyU128*  p128;
6e98bf
    pid_t  child, p2;
6e98bf
 
6e98bf
+   assert(sizeof(MyU128) == 16);
6e98bf
+   assert(sysconf(_SC_PAGESIZE) >= 4096);
6e98bf
+
6e98bf
    printf("parent, pre-fork\n");
6e98bf
 
6e98bf
    page = mmap( 0, sysconf(_SC_PAGESIZE),
6e98bf
@@ -736,11 +785,13 @@ int main ( int argc, char** argv )
6e98bf
    p16 = (short*)(page+256);
6e98bf
    p32 = (int*)(page+512);
6e98bf
    p64 = (long long int*)(page+768);
6e98bf
+   p128 = (MyU128*)(page+1024);
6e98bf
 
6e98bf
    assert( IS_8_ALIGNED(p8) );
6e98bf
    assert( IS_8_ALIGNED(p16) );
6e98bf
    assert( IS_8_ALIGNED(p32) );
6e98bf
    assert( IS_8_ALIGNED(p64) );
6e98bf
+   assert( IS_16_ALIGNED(p128) );
6e98bf
 
6e98bf
    memset(page, 0, 1024);
6e98bf
 
6e98bf
@@ -748,6 +799,7 @@ int main ( int argc, char** argv )
6e98bf
    *p16 = 0;
6e98bf
    *p32 = 0;
6e98bf
    *p64 = 0;
6e98bf
+   p128->lo64 = p128->hi64 = 0;
6e98bf
 
6e98bf
    child = fork();
6e98bf
    if (child == -1) {
6e98bf
@@ -763,6 +815,7 @@ int main ( int argc, char** argv )
6e98bf
          atomic_add_16bit(p16, 1);
6e98bf
          atomic_add_32bit(p32, 1);
6e98bf
          atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
6e98bf
+         atomic_add_128bit(p128, 0x1000000013374771ULL); // ditto re upper 64
6e98bf
       }
6e98bf
       return 1;
6e98bf
       /* NOTREACHED */
6e98bf
@@ -778,6 +831,7 @@ int main ( int argc, char** argv )
6e98bf
       atomic_add_16bit(p16, 1);
6e98bf
       atomic_add_32bit(p32, 1);
6e98bf
       atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
6e98bf
+      atomic_add_128bit(p128, 0x1000000013374771ULL); // ditto re upper 64
6e98bf
    }
6e98bf
 
6e98bf
    p2 = waitpid(child, &status, 0);
6e98bf
@@ -788,11 +842,17 @@ int main ( int argc, char** argv )
6e98bf
 
6e98bf
    printf("FINAL VALUES:  8 bit %d,  16 bit %d,  32 bit %d,  64 bit %lld\n",
6e98bf
           (int)(*(signed char*)p8), (int)(*p16), *p32, *p64 );
6e98bf
+   printf("               128 bit 0x%016llx:0x%016llx\n",
6e98bf
+          p128->hi64, p128->lo64);
6e98bf
 
6e98bf
    if (-74 == (int)(*(signed char*)p8) 
6e98bf
        && 32694 == (int)(*p16) 
6e98bf
        && 6913974 == *p32
6e98bf
-       && (0LL == *p64 || 682858642110LL == *p64)) {
6e98bf
+       && (0LL == *p64 || 682858642110LL == *p64)
6e98bf
+       && ((0 == p128->hi64 && 0 == p128->lo64)
6e98bf
+           || (0x00000000000697fb == p128->hi64
6e98bf
+               && 0x6007eb426316d956ULL == p128->lo64))
6e98bf
+      ) {
6e98bf
       printf("PASS\n");
6e98bf
    } else {
6e98bf
       printf("FAIL -- see source code for expected values\n");
6e98bf
diff --git a/memcheck/tests/atomic_incs.stdout.exp-32bit b/memcheck/tests/atomic_incs.stdout.exp-32bit
6e98bf
index c5b8781e5..55e5044b5 100644
6e98bf
--- a/memcheck/tests/atomic_incs.stdout.exp-32bit
6e98bf
+++ b/memcheck/tests/atomic_incs.stdout.exp-32bit
6e98bf
@@ -3,5 +3,6 @@ child
6e98bf
 parent, pre-fork
6e98bf
 parent
6e98bf
 FINAL VALUES:  8 bit -74,  16 bit 32694,  32 bit 6913974,  64 bit 0
6e98bf
+               128 bit 0x0000000000000000:0x0000000000000000
6e98bf
 PASS
6e98bf
 parent exits
6e98bf
diff --git a/memcheck/tests/atomic_incs.stdout.exp-64bit b/memcheck/tests/atomic_incs.stdout.exp-64bit
6e98bf
index 82405c520..ca2f4fc97 100644
6e98bf
--- a/memcheck/tests/atomic_incs.stdout.exp-64bit
6e98bf
+++ b/memcheck/tests/atomic_incs.stdout.exp-64bit
6e98bf
@@ -3,5 +3,6 @@ child
6e98bf
 parent, pre-fork
6e98bf
 parent
6e98bf
 FINAL VALUES:  8 bit -74,  16 bit 32694,  32 bit 6913974,  64 bit 682858642110
6e98bf
+               128 bit 0x0000000000000000:0x0000000000000000
6e98bf
 PASS
6e98bf
 parent exits
6e98bf
diff --git a/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit b/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit
6e98bf
new file mode 100644
6e98bf
index 000000000..ef6580917
6e98bf
--- /dev/null
6e98bf
+++ b/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit
6e98bf
@@ -0,0 +1,8 @@
6e98bf
+parent, pre-fork
6e98bf
+child
6e98bf
+parent, pre-fork
6e98bf
+parent
6e98bf
+FINAL VALUES:  8 bit -74,  16 bit 32694,  32 bit 6913974,  64 bit 682858642110
6e98bf
+               128 bit 0x00000000000697fb:0x6007eb426316d956
6e98bf
+PASS
6e98bf
+parent exits
6e98bf
diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am
6e98bf
index 00cbfa52c..9efb49b27 100644
6e98bf
--- a/none/tests/arm64/Makefile.am
6e98bf
+++ b/none/tests/arm64/Makefile.am
6e98bf
@@ -12,7 +12,10 @@ EXTRA_DIST = \
6e98bf
 	atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \
6e98bf
 	simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \
6e98bf
         fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \
6e98bf
-	fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp fp_and_simd_v82.vgtest
6e98bf
+	fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \
6e98bf
+	fp_and_simd_v82.vgtest \
6e98bf
+	ldxp_stxp.stdout.exp ldxp_stxp.stderr.exp \
6e98bf
+	ldxp_stxp_basisimpl.vgtest ldxp_stxp_fallbackimpl.vgtest
6e98bf
 
6e98bf
 check_PROGRAMS = \
6e98bf
 	allexec \
6e98bf
@@ -20,7 +23,8 @@ check_PROGRAMS = \
6e98bf
 	fp_and_simd \
6e98bf
 	integer \
6e98bf
 	memory \
6e98bf
-	fmadd_sub
6e98bf
+	fmadd_sub \
6e98bf
+	ldxp_stxp
6e98bf
 
6e98bf
 if BUILD_ARMV8_CRC_TESTS
6e98bf
   check_PROGRAMS += crc32
6e98bf
diff --git a/none/tests/arm64/ldxp_stxp.c b/none/tests/arm64/ldxp_stxp.c
6e98bf
new file mode 100644
6e98bf
index 000000000..b5f6ea121
6e98bf
--- /dev/null
6e98bf
+++ b/none/tests/arm64/ldxp_stxp.c
6e98bf
@@ -0,0 +1,93 @@
6e98bf
+
6e98bf
+/* Note, this is only a basic smoke test of LD{A}XP and ST{L}XP.  Their
6e98bf
+   atomicity properties are tested by memcheck/tests/atomic_incs.c. */
6e98bf
+
6e98bf
+#include <stdio.h>
6e98bf
+#include <stdlib.h>
6e98bf
+#include <malloc.h>
6e98bf
+#include <assert.h>
6e98bf
+
6e98bf
+typedef  unsigned int            UInt;
6e98bf
+typedef  unsigned long long int  ULong;
6e98bf
+
6e98bf
+
6e98bf
+void initBlock ( ULong* block )
6e98bf
+{
6e98bf
+   block[0] = 0x0001020304050607ULL;
6e98bf
+   block[1] = 0x1011121314151617ULL;
6e98bf
+   block[2] = 0x2021222324252627ULL;
6e98bf
+   block[3] = 0x3031323334353637ULL;
6e98bf
+   block[4] = 0x4041424344454647ULL;
6e98bf
+   block[5] = 0x5051525354555657ULL;
6e98bf
+}
6e98bf
+
6e98bf
+void printBlock ( const char* who,
6e98bf
+                  ULong* block, ULong rt1contents, ULong rt2contents,
6e98bf
+                  UInt zeroIfSuccess )
6e98bf
+{
6e98bf
+   printf("Block %s (%s)\n", who, zeroIfSuccess == 0 ? "success" : "FAILURE" );
6e98bf
+   for (int i = 0; i < 6; i++) {
6e98bf
+      printf("0x%016llx\n", block[i]);
6e98bf
+   }
6e98bf
+   printf("0x%016llx rt1contents\n", rt1contents);
6e98bf
+   printf("0x%016llx rt2contents\n", rt2contents);
6e98bf
+   printf("\n");
6e98bf
+}
6e98bf
+
6e98bf
+int main ( void )
6e98bf
+{
6e98bf
+   ULong* block = memalign(16, 6 * sizeof(ULong));
6e98bf
+   assert(block);
6e98bf
+
6e98bf
+   ULong rt1in, rt2in, rt1out, rt2out;
6e98bf
+   UInt scRes;
6e98bf
+
6e98bf
+   // Do ldxp then stxp with x-registers
6e98bf
+   initBlock(block);
6e98bf
+   rt1in  = 0x5555666677778888ULL;
6e98bf
+   rt2in  = 0xAAAA9999BBBB0000ULL;
6e98bf
+   rt1out = 0x1111222233334444ULL;
6e98bf
+   rt2out = 0xFFFFEEEEDDDDCCCCULL;
6e98bf
+   scRes  = 0x55555555;
6e98bf
+   __asm__ __volatile__(
6e98bf
+      "ldxp %1, %2, [%5]"       "\n\t"
6e98bf
+      "stxp %w0, %3, %4, [%5]"  "\n\t"
6e98bf
+      : /*OUT*/
6e98bf
+        "=&r"(scRes),  // %0
6e98bf
+        "=&r"(rt1out), // %1
6e98bf
+        "=&r"(rt2out)  // %2
6e98bf
+      : /*IN*/
6e98bf
+        "r"(rt1in),    // %3
6e98bf
+        "r"(rt2in),    // %4
6e98bf
+        "r"(&block[2]) // %5
6e98bf
+      : /*TRASH*/
6e98bf
+        "memory","cc"
6e98bf
+   );
6e98bf
+   printBlock("after ldxp/stxp 2x64-bit", block, rt1out, rt2out, scRes);
6e98bf
+
6e98bf
+   // Do ldxp then stxp with w-registers
6e98bf
+   initBlock(block);
6e98bf
+   rt1in  = 0x5555666677778888ULL;
6e98bf
+   rt2in  = 0xAAAA9999BBBB0000ULL;
6e98bf
+   rt1out = 0x1111222233334444ULL;
6e98bf
+   rt2out = 0xFFFFEEEEDDDDCCCCULL;
6e98bf
+   scRes  = 0x55555555;
6e98bf
+   __asm__ __volatile__(
6e98bf
+      "ldxp %w1, %w2, [%5]"       "\n\t"
6e98bf
+      "stxp %w0, %w3, %w4, [%5]"  "\n\t"
6e98bf
+      : /*OUT*/
6e98bf
+        "=&r"(scRes),  // %0
6e98bf
+        "=&r"(rt1out), // %1
6e98bf
+        "=&r"(rt2out)  // %2
6e98bf
+      : /*IN*/
6e98bf
+        "r"(rt1in),    // %3
6e98bf
+        "r"(rt2in),    // %4
6e98bf
+        "r"(&block[2]) // %5
6e98bf
+      : /*TRASH*/
6e98bf
+        "memory","cc"
6e98bf
+   );
6e98bf
+   printBlock("after ldxp/stxp 2x32-bit", block, rt1out, rt2out, scRes);
6e98bf
+
6e98bf
+   free(block);
6e98bf
+   return 0;
6e98bf
+}
6e98bf
diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.stderr.exp b/none/tests/arm64/ldxp_stxp_basisimpl.stderr.exp
6e98bf
new file mode 100644
6e98bf
index 000000000..e69de29bb
6e98bf
diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp b/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp
6e98bf
new file mode 100644
6e98bf
index 000000000..f269ecdcc
6e98bf
--- /dev/null
6e98bf
+++ b/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp
6e98bf
@@ -0,0 +1,20 @@
6e98bf
+Block after ldxp/stxp 2x64-bit (success)
6e98bf
+0x0001020304050607
6e98bf
+0x1011121314151617
6e98bf
+0x5555666677778888
6e98bf
+0xaaaa9999bbbb0000
6e98bf
+0x4041424344454647
6e98bf
+0x5051525354555657
6e98bf
+0x2021222324252627 rt1contents
6e98bf
+0x3031323334353637 rt2contents
6e98bf
+
6e98bf
+Block after ldxp/stxp 2x32-bit (success)
6e98bf
+0x0001020304050607
6e98bf
+0x1011121314151617
6e98bf
+0xbbbb000077778888
6e98bf
+0x3031323334353637
6e98bf
+0x4041424344454647
6e98bf
+0x5051525354555657
6e98bf
+0x0000000024252627 rt1contents
6e98bf
+0x0000000020212223 rt2contents
6e98bf
+
6e98bf
diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.vgtest b/none/tests/arm64/ldxp_stxp_basisimpl.vgtest
6e98bf
new file mode 100644
6e98bf
index 000000000..29133729a
6e98bf
--- /dev/null
6e98bf
+++ b/none/tests/arm64/ldxp_stxp_basisimpl.vgtest
6e98bf
@@ -0,0 +1,2 @@
6e98bf
+prog: ldxp_stxp
6e98bf
+vgopts: -q
6e98bf
diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.stderr.exp b/none/tests/arm64/ldxp_stxp_fallbackimpl.stderr.exp
6e98bf
new file mode 100644
6e98bf
index 000000000..e69de29bb
6e98bf
diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp b/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp
6e98bf
new file mode 100644
6e98bf
index 000000000..f269ecdcc
6e98bf
--- /dev/null
6e98bf
+++ b/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp
6e98bf
@@ -0,0 +1,20 @@
6e98bf
+Block after ldxp/stxp 2x64-bit (success)
6e98bf
+0x0001020304050607
6e98bf
+0x1011121314151617
6e98bf
+0x5555666677778888
6e98bf
+0xaaaa9999bbbb0000
6e98bf
+0x4041424344454647
6e98bf
+0x5051525354555657
6e98bf
+0x2021222324252627 rt1contents
6e98bf
+0x3031323334353637 rt2contents
6e98bf
+
6e98bf
+Block after ldxp/stxp 2x32-bit (success)
6e98bf
+0x0001020304050607
6e98bf
+0x1011121314151617
6e98bf
+0xbbbb000077778888
6e98bf
+0x3031323334353637
6e98bf
+0x4041424344454647
6e98bf
+0x5051525354555657
6e98bf
+0x0000000024252627 rt1contents
6e98bf
+0x0000000020212223 rt2contents
6e98bf
+
6e98bf
diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest b/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest
6e98bf
new file mode 100644
6e98bf
index 000000000..474282a03
6e98bf
--- /dev/null
6e98bf
+++ b/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest
6e98bf
@@ -0,0 +1,2 @@
6e98bf
+prog: ldxp_stxp
6e98bf
+vgopts: -q --sim-hints=fallback-llsc
6e98bf
6e98bf
commit 0d38ca5dd6b446c70738031132d41f09de0f7a8a
6e98bf
Author: Julian Seward <jseward@acm.org>
6e98bf
Date:   Fri Nov 12 13:08:45 2021 +0100
6e98bf
6e98bf
    Bug 444399 - disInstr(arm64): unhandled instruction 0xC87F2D89 (LD{,A}XP and ST{,L}XP).  FOLLOWUP FIX.
6e98bf
    
6e98bf
    This is an attempt to un-break 'make dist', as broken by the main commit for
6e98bf
    this bug, which was 530df882b8f60ecacaf2b9b8a719f7ea1c1d1650.
6e98bf
6e98bf
diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am
6e98bf
index 9efb49b27..4a06f0996 100644
6e98bf
--- a/none/tests/arm64/Makefile.am
6e98bf
+++ b/none/tests/arm64/Makefile.am
6e98bf
@@ -14,8 +14,10 @@ EXTRA_DIST = \
6e98bf
         fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \
6e98bf
 	fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \
6e98bf
 	fp_and_simd_v82.vgtest \
6e98bf
-	ldxp_stxp.stdout.exp ldxp_stxp.stderr.exp \
6e98bf
-	ldxp_stxp_basisimpl.vgtest ldxp_stxp_fallbackimpl.vgtest
6e98bf
+	ldxp_stxp_basisimpl.stdout.exp ldxp_stxp_basisimpl.stderr.exp \
6e98bf
+	ldxp_stxp_basisimpl.vgtest \
6e98bf
+	ldxp_stxp_fallbackimpl.stdout.exp ldxp_stxp_fallbackimpl.stderr.exp \
6e98bf
+	ldxp_stxp_fallbackimpl.vgtest
6e98bf
 
6e98bf
 check_PROGRAMS = \
6e98bf
 	allexec \