From 04e90e24c6548b45b48ae056e4b9b33be498eb5e Mon Sep 17 00:00:00 2001
From: Mark Wielaard
Date: Nov 17 2021 17:19:28 +0000
Subject: Add valgrind-3.18.1-arm64-ldaxp-stlxp.patch

---

diff --git a/valgrind-3.18.1-arm64-ldaxp-stlxp.patch b/valgrind-3.18.1-arm64-ldaxp-stlxp.patch
new file mode 100644
index 0000000..d118cc6
--- /dev/null
+++ b/valgrind-3.18.1-arm64-ldaxp-stlxp.patch
@@ -0,0 +1,1440 @@
+commit 530df882b8f60ecacaf2b9b8a719f7ea1c1d1650
+Author: Julian Seward
+Date:   Fri Nov 12 12:13:45 2021 +0100
+
+    Bug 444399 - disInstr(arm64): unhandled instruction 0xC87F2D89 (LD{,A}XP and ST{,L}XP).
+
+    This is unfortunately a big and complex patch, to implement LD{,A}XP and
+    ST{,L}XP. These were omitted from the original AArch64 v8.0 implementation
+    for unknown reasons.
+
+    (Background) the patch is made significantly more complex because for AArch64
+    we actually have two implementations of the underlying
+    Load-Linked/Store-Conditional (LL/SC) machinery: a "primary" implementation,
+    which translates LL/SC more or less directly into IR and re-emits them at the
+    back end, and a "fallback" implementation that implements LL/SC "manually", by
+    taking advantage of the fact that V serialises thread execution, so we can
+    "implement" LL/SC by simulating a reservation using fields LLSC_* in the guest
+    state, and invalidating the reservation at every thread switch.
+
+    (Background) the fallback scheme is needed because the primary scheme is in
+    violation of the ARMv8 semantics in that it can (easily) introduce extra
+    memory references between the LL and SC, hence on some hardware causing the
+    reservation to always fail and so the simulated program to wind up looping
+    forever.
+
+    For these instructions, big picture:
+
+    * for the primary implementation, we take advantage of the fact that
+      IRStmt_LLSC allows I128-typed transactions to be represented. Hence we
+      bundle up the two 64-bit data elements into an I128 (or vice versa) and
+      present a single I128-typed IRStmt_LLSC in the IR. In the backend, those
+      are re-emitted as LDXP/STXP respectively. For LL/SC on 32-bit register
+      pairs, that bundling produces a single 64-bit item, and so the existing
+      LL/SC backend machinery handles it. The effect is that a doubleword 32-bit
+      LL/SC in the front end translates into a single 64-bit LL/SC in the back
+      end. Overall, though, the implementation is straightforward.
+
+    * for the fallback implementation, it is necessary to extend the guest state
+      field `guest_LLSC_DATA` to represent a 128-bit transaction, by splitting it
+      into _DATA_LO64 and _DATA_HI64. Then, the implementation is an exact
+      analogue of the fallback implementation for single-word LL/SC. It takes
+      advantage of the fact that the backend already supports 128-bit CAS, as
+      fixed in bug 445354. As with the primary implementation, doubleword 32-bit
+      LL/SC is bundled into a single 64-bit transaction.
+
+    Detailed changes:
+
+    * new arm64 guest state fields LLSC_DATA_LO64/LLSC_DATA_HI64 to replace
+      guest_LLSC_DATA
+
+    * (ridealong fix) arm64 front end: a fix to a minor and harmless decoding bug
+      for the single-word LDX/STX case.
+
+    * arm64 front end: IR generation for LD{,A}XP/ST{,L}XP: tedious and
+      longwinded, but per comments above, an exact(ish) analogue of the
+      singleword case
+
+    * arm64 backend: new insns ARM64Instr_LdrEXP / ARM64Instr_StrEXP to wrap up
+      2 x 64 exclusive loads/stores. Per comments above, there's no need to
+      handle the 2 x 32 case.
+ + * arm64 isel: translate I128-typed IRStmt_LLSC into the above two insns + + * arm64 isel: some auxiliary bits and pieces needed to handle I128 values; + this is standard doubleword isel stuff + + * arm64 isel: (ridealong fix): Ist_CAS: check for endianness of the CAS! + + * arm64 isel: (ridealong) a couple of formatting fixes + + * IR infrastructure: add support for I128 constants, done the same as V128 + constants + + * memcheck: handle shadow loads and stores for I128 values + + * testcase: memcheck/tests/atomic_incs.c: on arm64, also test 128-bit atomic + addition, to check we really have atomicity right + + * testcase: new test none/tests/arm64/ldxp_stxp.c, tests operation but not + atomicity. (Smoke test). + +diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c +index 12a1c5978..ee018c6a9 100644 +--- a/VEX/priv/guest_arm64_toIR.c ++++ b/VEX/priv/guest_arm64_toIR.c +@@ -1184,9 +1184,10 @@ static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e ) + #define OFFB_CMSTART offsetof(VexGuestARM64State,guest_CMSTART) + #define OFFB_CMLEN offsetof(VexGuestARM64State,guest_CMLEN) + +-#define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE) +-#define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR) +-#define OFFB_LLSC_DATA offsetof(VexGuestARM64State,guest_LLSC_DATA) ++#define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE) ++#define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR) ++#define OFFB_LLSC_DATA_LO64 offsetof(VexGuestARM64State,guest_LLSC_DATA_LO64) ++#define OFFB_LLSC_DATA_HI64 offsetof(VexGuestARM64State,guest_LLSC_DATA_HI64) + + + /* ---------------- Integer registers ---------------- */ +@@ -6652,7 +6653,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn, + (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while() + has to do this bit) + */ +- if (INSN(29,23) == BITS7(0,0,1,0,0,0,0) ++ if (INSN(29,24) == BITS6(0,0,1,0,0,0) + && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0) + && INSN(14,10) == BITS5(1,1,1,1,1)) { + UInt szBlg2 = INSN(31,30); +@@ -6678,7 +6679,8 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn, + // if it faults. 
+ IRTemp loaded_data64 = newTemp(Ity_I64); + assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea)))); +- stmt( IRStmt_Put( OFFB_LLSC_DATA, mkexpr(loaded_data64) )); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, mkexpr(loaded_data64) )); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, mkU64(0) )); + stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) )); + stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) )); + putIReg64orZR(tt, mkexpr(loaded_data64)); +@@ -6729,7 +6731,7 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn, + )); + // Fail if the data doesn't match the LL data + IRTemp llsc_data64 = newTemp(Ity_I64); +- assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA, Ity_I64)); ++ assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64)); + stmt( IRStmt_Exit( + binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))), + mkexpr(llsc_data64)), +@@ -6771,6 +6773,257 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn, + /* else fall through */ + } + ++ /* -------------------- LD{,A}XP -------------------- */ ++ /* -------------------- ST{,L}XP -------------------- */ ++ /* 31 30 29 23 20 15 14 9 4 ++ 1 sz 001000 011 11111 0 t2 n t1 LDXP Rt1, Rt2, [Xn|SP] ++ 1 sz 001000 011 11111 1 t2 n t1 LDAXP Rt1, Rt2, [Xn|SP] ++ 1 sz 001000 001 s 0 t2 n t1 STXP Ws, Rt1, Rt2, [Xn|SP] ++ 1 sz 001000 001 s 1 t2 n t1 STLXP Ws, Rt1, Rt2, [Xn|SP] ++ */ ++ /* See just above, "LD{,A}X{R,RH,RB} / ST{,L}X{R,RH,RB}", for detailed ++ comments about this implementation. Note the 'sz' field here is only 1 ++ bit; above, it is 2 bits, and has a different encoding. ++ */ ++ if (INSN(31,31) == 1 ++ && INSN(29,24) == BITS6(0,0,1,0,0,0) ++ && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,1)) { ++ Bool elemIs64 = INSN(30,30) == 1; ++ Bool isLD = INSN(22,22) == 1; ++ Bool isAcqOrRel = INSN(15,15) == 1; ++ UInt ss = INSN(20,16); ++ UInt tt2 = INSN(14,10); ++ UInt nn = INSN(9,5); ++ UInt tt1 = INSN(4,0); ++ ++ UInt elemSzB = elemIs64 ? 8 : 4; ++ UInt fullSzB = 2 * elemSzB; ++ IRType elemTy = integerIRTypeOfSize(elemSzB); ++ IRType fullTy = integerIRTypeOfSize(fullSzB); ++ ++ IRTemp ea = newTemp(Ity_I64); ++ assign(ea, getIReg64orSP(nn)); ++ /* FIXME generate check that ea is 2*elemSzB-aligned */ ++ ++ if (isLD && ss == BITS5(1,1,1,1,1)) { ++ if (abiinfo->guest__use_fallback_LLSC) { ++ // Fallback implementation of LL. ++ // Do the load first so we don't update any guest state if it ++ // faults. Assumes little-endian guest. ++ if (fullTy == Ity_I64) { ++ vassert(elemSzB == 4); ++ IRTemp loaded_data64 = newTemp(Ity_I64); ++ assign(loaded_data64, loadLE(fullTy, mkexpr(ea))); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, mkexpr(loaded_data64) )); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, mkU64(0) )); ++ stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) )); ++ stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(8) )); ++ putIReg64orZR(tt1, unop(Iop_32Uto64, ++ unop(Iop_64to32, ++ mkexpr(loaded_data64)))); ++ putIReg64orZR(tt2, unop(Iop_32Uto64, ++ unop(Iop_64HIto32, ++ mkexpr(loaded_data64)))); ++ } else { ++ vassert(elemSzB == 8 && fullTy == Ity_I128); ++ IRTemp loaded_data128 = newTemp(Ity_I128); ++ // Hack: do the load as V128 rather than I128 so as to avoid ++ // having to implement I128 loads in the arm64 back end. 
++ assign(loaded_data128, unop(Iop_ReinterpV128asI128, ++ loadLE(Ity_V128, mkexpr(ea)))); ++ IRTemp loaded_data_lo64 = newTemp(Ity_I64); ++ IRTemp loaded_data_hi64 = newTemp(Ity_I64); ++ assign(loaded_data_lo64, unop(Iop_128to64, ++ mkexpr(loaded_data128))); ++ assign(loaded_data_hi64, unop(Iop_128HIto64, ++ mkexpr(loaded_data128))); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_LO64, ++ mkexpr(loaded_data_lo64) )); ++ stmt( IRStmt_Put( OFFB_LLSC_DATA_HI64, ++ mkexpr(loaded_data_hi64) )); ++ stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) )); ++ stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(16) )); ++ putIReg64orZR(tt1, mkexpr(loaded_data_lo64)); ++ putIReg64orZR(tt2, mkexpr(loaded_data_hi64)); ++ } ++ } else { ++ // Non-fallback implementation of LL. ++ IRTemp res = newTemp(fullTy); // I64 or I128 ++ stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/)); ++ // Assuming a little-endian guest here. Rt1 goes at the lower ++ // address, so it must live in the least significant half of `res`. ++ IROp opGetLO = fullTy == Ity_I128 ? Iop_128to64 : Iop_64to32; ++ IROp opGetHI = fullTy == Ity_I128 ? Iop_128HIto64 : Iop_64HIto32; ++ putIReg64orZR(tt1, widenUto64(elemTy, unop(opGetLO, mkexpr(res)))); ++ putIReg64orZR(tt2, widenUto64(elemTy, unop(opGetHI, mkexpr(res)))); ++ } ++ if (isAcqOrRel) { ++ stmt(IRStmt_MBE(Imbe_Fence)); ++ } ++ DIP("ld%sxp %s, %s, [%s] %s\n", ++ isAcqOrRel ? (isLD ? "a" : "l") : "", ++ nameIRegOrZR(elemSzB == 8, tt1), ++ nameIRegOrZR(elemSzB == 8, tt2), ++ nameIReg64orSP(nn), ++ abiinfo->guest__use_fallback_LLSC ++ ? "(fallback implementation)" : ""); ++ return True; ++ } ++ if (!isLD) { ++ if (isAcqOrRel) { ++ stmt(IRStmt_MBE(Imbe_Fence)); ++ } ++ if (abiinfo->guest__use_fallback_LLSC) { ++ // Fallback implementation of SC. ++ // This is really ugly, since we don't have any way to do ++ // proper if-then-else. First, set up as if the SC failed, ++ // and jump forwards if it really has failed. ++ ++ // Continuation address ++ IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4); ++ ++ // "the SC failed". Any non-zero value means failure. ++ putIReg64orZR(ss, mkU64(1)); ++ ++ IRTemp tmp_LLsize = newTemp(Ity_I64); ++ assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64)); ++ stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction" ++ )); ++ // Fail if no or wrong-size transaction ++ vassert((fullSzB == 8 && fullTy == Ity_I64) ++ || (fullSzB == 16 && fullTy == Ity_I128)); ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(fullSzB)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ // Fail if the address doesn't match the LL address ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, mkexpr(ea), ++ IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ // The data to be stored. ++ IRTemp store_data = newTemp(fullTy); ++ if (fullTy == Ity_I64) { ++ assign(store_data, ++ binop(Iop_32HLto64, ++ narrowFrom64(Ity_I32, getIReg64orZR(tt2)), ++ narrowFrom64(Ity_I32, getIReg64orZR(tt1)))); ++ } else { ++ assign(store_data, ++ binop(Iop_64HLto128, ++ getIReg64orZR(tt2), getIReg64orZR(tt1))); ++ } ++ ++ if (fullTy == Ity_I64) { ++ // 64 bit (2x32 bit) path ++ // Fail if the data in memory doesn't match the data stashed by ++ // the LL. ++ IRTemp llsc_data_lo64 = newTemp(Ity_I64); ++ assign(llsc_data_lo64, ++ IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64)); ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, loadLE(Ity_I64, mkexpr(ea)), ++ mkexpr(llsc_data_lo64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ // Try to CAS the new value in. 
++ IRTemp old = newTemp(Ity_I64); ++ IRTemp expd = newTemp(Ity_I64); ++ assign(expd, mkexpr(llsc_data_lo64)); ++ stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old, ++ Iend_LE, mkexpr(ea), ++ /*expdHi*/NULL, mkexpr(expd), ++ /*dataHi*/NULL, mkexpr(store_data) ++ ))); ++ // Fail if the CAS failed (viz, old != expd) ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, mkexpr(old), mkexpr(expd)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ } else { ++ // 128 bit (2x64 bit) path ++ // Fail if the data in memory doesn't match the data stashed by ++ // the LL. ++ IRTemp llsc_data_lo64 = newTemp(Ity_I64); ++ assign(llsc_data_lo64, ++ IRExpr_Get(OFFB_LLSC_DATA_LO64, Ity_I64)); ++ IRTemp llsc_data_hi64 = newTemp(Ity_I64); ++ assign(llsc_data_hi64, ++ IRExpr_Get(OFFB_LLSC_DATA_HI64, Ity_I64)); ++ IRTemp data_at_ea = newTemp(Ity_I128); ++ assign(data_at_ea, ++ unop(Iop_ReinterpV128asI128, ++ loadLE(Ity_V128, mkexpr(ea)))); ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, ++ unop(Iop_128to64, mkexpr(data_at_ea)), ++ mkexpr(llsc_data_lo64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, ++ unop(Iop_128HIto64, mkexpr(data_at_ea)), ++ mkexpr(llsc_data_hi64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ // Try to CAS the new value in. ++ IRTemp old_lo64 = newTemp(Ity_I64); ++ IRTemp old_hi64 = newTemp(Ity_I64); ++ IRTemp expd_lo64 = newTemp(Ity_I64); ++ IRTemp expd_hi64 = newTemp(Ity_I64); ++ IRTemp store_data_lo64 = newTemp(Ity_I64); ++ IRTemp store_data_hi64 = newTemp(Ity_I64); ++ assign(expd_lo64, mkexpr(llsc_data_lo64)); ++ assign(expd_hi64, mkexpr(llsc_data_hi64)); ++ assign(store_data_lo64, unop(Iop_128to64, mkexpr(store_data))); ++ assign(store_data_hi64, unop(Iop_128HIto64, mkexpr(store_data))); ++ stmt( IRStmt_CAS(mkIRCAS(old_hi64, old_lo64, ++ Iend_LE, mkexpr(ea), ++ mkexpr(expd_hi64), mkexpr(expd_lo64), ++ mkexpr(store_data_hi64), ++ mkexpr(store_data_lo64) ++ ))); ++ // Fail if the CAS failed (viz, old != expd) ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, mkexpr(old_lo64), mkexpr(expd_lo64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ stmt( IRStmt_Exit( ++ binop(Iop_CmpNE64, mkexpr(old_hi64), mkexpr(expd_hi64)), ++ Ijk_Boring, nia, OFFB_PC ++ )); ++ } ++ // Otherwise we succeeded (!) ++ putIReg64orZR(ss, mkU64(0)); ++ } else { ++ // Non-fallback implementation of SC. ++ IRTemp res = newTemp(Ity_I1); ++ IRExpr* dataLO = narrowFrom64(elemTy, getIReg64orZR(tt1)); ++ IRExpr* dataHI = narrowFrom64(elemTy, getIReg64orZR(tt2)); ++ IROp opMerge = fullTy == Ity_I128 ? Iop_64HLto128 : Iop_32HLto64; ++ IRExpr* data = binop(opMerge, dataHI, dataLO); ++ // Assuming a little-endian guest here. Rt1 goes at the lower ++ // address, so it must live in the least significant half of `data`. ++ stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data)); ++ /* IR semantics: res is 1 if store succeeds, 0 if it fails. ++ Need to set rS to 1 on failure, 0 on success. */ ++ putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)), ++ mkU64(1))); ++ } ++ DIP("st%sxp %s, %s, %s, [%s] %s\n", ++ isAcqOrRel ? (isLD ? "a" : "l") : "", ++ nameIRegOrZR(False, ss), ++ nameIRegOrZR(elemSzB == 8, tt1), ++ nameIRegOrZR(elemSzB == 8, tt2), ++ nameIReg64orSP(nn), ++ abiinfo->guest__use_fallback_LLSC ++ ? 
"(fallback implementation)" : ""); ++ return True; ++ } ++ /* else fall through */ ++ } ++ + /* ------------------ LDA{R,RH,RB} ------------------ */ + /* ------------------ STL{R,RH,RB} ------------------ */ + /* 31 29 23 20 14 9 4 +diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c +index 5657bcab9..b65e27db4 100644 +--- a/VEX/priv/host_arm64_defs.c ++++ b/VEX/priv/host_arm64_defs.c +@@ -1059,6 +1059,16 @@ ARM64Instr* ARM64Instr_StrEX ( Int szB ) { + vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1); + return i; + } ++ARM64Instr* ARM64Instr_LdrEXP ( void ) { ++ ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr)); ++ i->tag = ARM64in_LdrEXP; ++ return i; ++} ++ARM64Instr* ARM64Instr_StrEXP ( void ) { ++ ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr)); ++ i->tag = ARM64in_StrEXP; ++ return i; ++} + ARM64Instr* ARM64Instr_CAS ( Int szB ) { + ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr)); + i->tag = ARM64in_CAS; +@@ -1699,12 +1709,19 @@ void ppARM64Instr ( const ARM64Instr* i ) { + sz, i->ARM64in.StrEX.szB == 8 ? 'x' : 'w'); + return; + } ++ case ARM64in_LdrEXP: ++ vex_printf("ldxp x2, x3, [x4]"); ++ return; ++ case ARM64in_StrEXP: ++ vex_printf("stxp w0, x2, x3, [x4]"); ++ return; + case ARM64in_CAS: { + vex_printf("x1 = cas(%dbit)(x3, x5 -> x7)", 8 * i->ARM64in.CAS.szB); + return; + } + case ARM64in_CASP: { +- vex_printf("x0,x1 = casp(%dbit)(x2, x4,x5 -> x6,x7)", 8 * i->ARM64in.CASP.szB); ++ vex_printf("x0,x1 = casp(2x%dbit)(x2, x4,x5 -> x6,x7)", ++ 8 * i->ARM64in.CASP.szB); + return; + } + case ARM64in_MFence: +@@ -2253,6 +2270,17 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 ) + addHRegUse(u, HRmWrite, hregARM64_X0()); + addHRegUse(u, HRmRead, hregARM64_X2()); + return; ++ case ARM64in_LdrEXP: ++ addHRegUse(u, HRmRead, hregARM64_X4()); ++ addHRegUse(u, HRmWrite, hregARM64_X2()); ++ addHRegUse(u, HRmWrite, hregARM64_X3()); ++ return; ++ case ARM64in_StrEXP: ++ addHRegUse(u, HRmRead, hregARM64_X4()); ++ addHRegUse(u, HRmWrite, hregARM64_X0()); ++ addHRegUse(u, HRmRead, hregARM64_X2()); ++ addHRegUse(u, HRmRead, hregARM64_X3()); ++ return; + case ARM64in_CAS: + addHRegUse(u, HRmRead, hregARM64_X3()); + addHRegUse(u, HRmRead, hregARM64_X5()); +@@ -2571,6 +2599,10 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 ) + return; + case ARM64in_StrEX: + return; ++ case ARM64in_LdrEXP: ++ return; ++ case ARM64in_StrEXP: ++ return; + case ARM64in_CAS: + return; + case ARM64in_CASP: +@@ -4167,6 +4199,16 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc, + } + goto bad; + } ++ case ARM64in_LdrEXP: { ++ // 820C7FC8 ldxp x2, x3, [x4] ++ *p++ = 0xC87F0C82; ++ goto done; ++ } ++ case ARM64in_StrEXP: { ++ // 820C20C8 stxp w0, x2, x3, [x4] ++ *p++ = 0xC8200C82; ++ goto done; ++ } + case ARM64in_CAS: { + /* This isn't simple. For an explanation see the comment in + host_arm64_defs.h on the definition of ARM64Instr case CAS. 
+diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h +index 01fb5708e..dc686dff7 100644 +--- a/VEX/priv/host_arm64_defs.h ++++ b/VEX/priv/host_arm64_defs.h +@@ -509,8 +509,10 @@ typedef + ARM64in_AddToSP, /* move SP by small, signed constant */ + ARM64in_FromSP, /* move SP to integer register */ + ARM64in_Mul, +- ARM64in_LdrEX, +- ARM64in_StrEX, ++ ARM64in_LdrEX, /* load exclusive, single register */ ++ ARM64in_StrEX, /* store exclusive, single register */ ++ ARM64in_LdrEXP, /* load exclusive, register pair, 2x64-bit only */ ++ ARM64in_StrEXP, /* store exclusive, register pair, 2x64-bit only */ + ARM64in_CAS, + ARM64in_CASP, + ARM64in_MFence, +@@ -719,6 +721,12 @@ typedef + struct { + Int szB; /* 1, 2, 4 or 8 */ + } StrEX; ++ /* LDXP x2, x3, [x4]. This is 2x64-bit only. */ ++ struct { ++ } LdrEXP; ++ /* STXP w0, x2, x3, [x4]. This is 2x64-bit only. */ ++ struct { ++ } StrEXP; + /* x1 = CAS(x3(addr), x5(expected) -> x7(new)), + and trashes x8 + where x1[8*szB-1 : 0] == x5[8*szB-1 : 0] indicates success, +@@ -1037,6 +1045,8 @@ extern ARM64Instr* ARM64Instr_Mul ( HReg dst, HReg argL, HReg argR, + ARM64MulOp op ); + extern ARM64Instr* ARM64Instr_LdrEX ( Int szB ); + extern ARM64Instr* ARM64Instr_StrEX ( Int szB ); ++extern ARM64Instr* ARM64Instr_LdrEXP ( void ); ++extern ARM64Instr* ARM64Instr_StrEXP ( void ); + extern ARM64Instr* ARM64Instr_CAS ( Int szB ); + extern ARM64Instr* ARM64Instr_CASP ( Int szB ); + extern ARM64Instr* ARM64Instr_MFence ( void ); +diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c +index 4b1d8c846..094e7e74b 100644 +--- a/VEX/priv/host_arm64_isel.c ++++ b/VEX/priv/host_arm64_isel.c +@@ -196,9 +196,9 @@ static HReg iselCondCode_R ( ISelEnv* env, IRExpr* e ); + static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ); + static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e ); + +-static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo, ++static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, + ISelEnv* env, IRExpr* e ); +-static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo, ++static void iselInt128Expr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, + ISelEnv* env, IRExpr* e ); + + static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e ); +@@ -1759,9 +1759,12 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) + + /* AND/OR/XOR(e1, e2) (for any e1, e2) */ + switch (e->Iex.Binop.op) { +- case Iop_And64: case Iop_And32: lop = ARM64lo_AND; goto log_binop; +- case Iop_Or64: case Iop_Or32: case Iop_Or16: lop = ARM64lo_OR; goto log_binop; +- case Iop_Xor64: case Iop_Xor32: lop = ARM64lo_XOR; goto log_binop; ++ case Iop_And64: case Iop_And32: ++ lop = ARM64lo_AND; goto log_binop; ++ case Iop_Or64: case Iop_Or32: case Iop_Or16: ++ lop = ARM64lo_OR; goto log_binop; ++ case Iop_Xor64: case Iop_Xor32: ++ lop = ARM64lo_XOR; goto log_binop; + log_binop: { + HReg dst = newVRegI(env); + HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); +@@ -2013,6 +2016,11 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) + iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg); + return rHi; /* and abandon rLo */ + } ++ case Iop_128to64: { ++ HReg rHi, rLo; ++ iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg); ++ return rLo; /* and abandon rHi */ ++ } + case Iop_8Sto32: case Iop_8Sto64: { + IRExpr* arg = e->Iex.Unop.arg; + HReg src = iselIntExpr_R(env, arg); +@@ -2185,13 +2193,19 @@ static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e ) + } + return dst; + } ++ case Iop_64HIto32: { ++ HReg dst = newVRegI(env); ++ HReg src = 
iselIntExpr_R(env, e->Iex.Unop.arg); ++ addInstr(env, ARM64Instr_Shift(dst, src, ARM64RI6_I6(32), ++ ARM64sh_SHR)); ++ return dst; ++ } + case Iop_64to32: + case Iop_64to16: + case Iop_64to8: + case Iop_32to16: + /* These are no-ops. */ + return iselIntExpr_R(env, e->Iex.Unop.arg); +- + default: + break; + } +@@ -2335,6 +2349,43 @@ static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo, + vassert(e); + vassert(typeOfIRExpr(env->type_env,e) == Ity_I128); + ++ /* --------- TEMP --------- */ ++ if (e->tag == Iex_RdTmp) { ++ lookupIRTempPair(rHi, rLo, env, e->Iex.RdTmp.tmp); ++ return; ++ } ++ ++ /* --------- CONST --------- */ ++ if (e->tag == Iex_Const) { ++ IRConst* c = e->Iex.Const.con; ++ vassert(c->tag == Ico_U128); ++ if (c->Ico.U128 == 0) { ++ // The only case we need to handle (so far) ++ HReg zero = newVRegI(env); ++ addInstr(env, ARM64Instr_Imm64(zero, 0)); ++ *rHi = *rLo = zero; ++ return; ++ } ++ } ++ ++ /* --------- UNARY ops --------- */ ++ if (e->tag == Iex_Unop) { ++ switch (e->Iex.Unop.op) { ++ case Iop_ReinterpV128asI128: { ++ HReg dstHi = newVRegI(env); ++ HReg dstLo = newVRegI(env); ++ HReg src = iselV128Expr(env, e->Iex.Unop.arg); ++ addInstr(env, ARM64Instr_VXfromQ(dstHi, src, 1)); ++ addInstr(env, ARM64Instr_VXfromQ(dstLo, src, 0)); ++ *rHi = dstHi; ++ *rLo = dstLo; ++ return; ++ } ++ default: ++ break; ++ } ++ } ++ + /* --------- BINARY ops --------- */ + if (e->tag == Iex_Binop) { + switch (e->Iex.Binop.op) { +@@ -4086,6 +4137,14 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) + addInstr(env, ARM64Instr_VMov(8/*yes, really*/, dst, src)); + return; + } ++ if (ty == Ity_I128) { ++ HReg rHi, rLo, dstHi, dstLo; ++ iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data); ++ lookupIRTempPair( &dstHi, &dstLo, env, tmp); ++ addInstr(env, ARM64Instr_MovI(dstHi, rHi)); ++ addInstr(env, ARM64Instr_MovI(dstLo, rLo)); ++ return; ++ } + if (ty == Ity_V128) { + HReg src = iselV128Expr(env, stmt->Ist.WrTmp.data); + HReg dst = lookupIRTemp(env, tmp); +@@ -4183,42 +4242,67 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) + /* LL */ + IRTemp res = stmt->Ist.LLSC.result; + IRType ty = typeOfIRTemp(env->type_env, res); +- if (ty == Ity_I64 || ty == Ity_I32 ++ if (ty == Ity_I128 || ty == Ity_I64 || ty == Ity_I32 + || ty == Ity_I16 || ty == Ity_I8) { + Int szB = 0; +- HReg r_dst = lookupIRTemp(env, res); + HReg raddr = iselIntExpr_R(env, stmt->Ist.LLSC.addr); + switch (ty) { +- case Ity_I8: szB = 1; break; +- case Ity_I16: szB = 2; break; +- case Ity_I32: szB = 4; break; +- case Ity_I64: szB = 8; break; +- default: vassert(0); ++ case Ity_I8: szB = 1; break; ++ case Ity_I16: szB = 2; break; ++ case Ity_I32: szB = 4; break; ++ case Ity_I64: szB = 8; break; ++ case Ity_I128: szB = 16; break; ++ default: vassert(0); ++ } ++ if (szB == 16) { ++ HReg r_dstMSword = INVALID_HREG; ++ HReg r_dstLSword = INVALID_HREG; ++ lookupIRTempPair(&r_dstMSword, &r_dstLSword, env, res); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr)); ++ addInstr(env, ARM64Instr_LdrEXP()); ++ addInstr(env, ARM64Instr_MovI(r_dstLSword, hregARM64_X2())); ++ addInstr(env, ARM64Instr_MovI(r_dstMSword, hregARM64_X3())); ++ } else { ++ vassert(szB != 0); ++ HReg r_dst = lookupIRTemp(env, res); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr)); ++ addInstr(env, ARM64Instr_LdrEX(szB)); ++ addInstr(env, ARM64Instr_MovI(r_dst, hregARM64_X2())); + } +- addInstr(env, ARM64Instr_MovI(hregARM64_X4(), raddr)); +- addInstr(env, ARM64Instr_LdrEX(szB)); +- addInstr(env, ARM64Instr_MovI(r_dst, hregARM64_X2())); + return; 
+ } + goto stmt_fail; + } else { + /* SC */ + IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.LLSC.storedata); +- if (tyd == Ity_I64 || tyd == Ity_I32 ++ if (tyd == Ity_I128 || tyd == Ity_I64 || tyd == Ity_I32 + || tyd == Ity_I16 || tyd == Ity_I8) { + Int szB = 0; +- HReg rD = iselIntExpr_R(env, stmt->Ist.LLSC.storedata); + HReg rA = iselIntExpr_R(env, stmt->Ist.LLSC.addr); + switch (tyd) { +- case Ity_I8: szB = 1; break; +- case Ity_I16: szB = 2; break; +- case Ity_I32: szB = 4; break; +- case Ity_I64: szB = 8; break; +- default: vassert(0); ++ case Ity_I8: szB = 1; break; ++ case Ity_I16: szB = 2; break; ++ case Ity_I32: szB = 4; break; ++ case Ity_I64: szB = 8; break; ++ case Ity_I128: szB = 16; break; ++ default: vassert(0); ++ } ++ if (szB == 16) { ++ HReg rD_MSword = INVALID_HREG; ++ HReg rD_LSword = INVALID_HREG; ++ iselInt128Expr(&rD_MSword, ++ &rD_LSword, env, stmt->Ist.LLSC.storedata); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD_LSword)); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X3(), rD_MSword)); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA)); ++ addInstr(env, ARM64Instr_StrEXP()); ++ } else { ++ vassert(szB != 0); ++ HReg rD = iselIntExpr_R(env, stmt->Ist.LLSC.storedata); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD)); ++ addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA)); ++ addInstr(env, ARM64Instr_StrEX(szB)); + } +- addInstr(env, ARM64Instr_MovI(hregARM64_X2(), rD)); +- addInstr(env, ARM64Instr_MovI(hregARM64_X4(), rA)); +- addInstr(env, ARM64Instr_StrEX(szB)); + } else { + goto stmt_fail; + } +@@ -4243,10 +4327,10 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) + + /* --------- ACAS --------- */ + case Ist_CAS: { +- if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) { ++ IRCAS* cas = stmt->Ist.CAS.details; ++ if (cas->oldHi == IRTemp_INVALID && cas->end == Iend_LE) { + /* "normal" singleton CAS */ + UChar sz; +- IRCAS* cas = stmt->Ist.CAS.details; + IRType ty = typeOfIRExpr(env->type_env, cas->dataLo); + switch (ty) { + case Ity_I64: sz = 8; break; +@@ -4281,10 +4365,9 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) + addInstr(env, ARM64Instr_MovI(rOld, rResult)); + return; + } +- else { ++ if (cas->oldHi != IRTemp_INVALID && cas->end == Iend_LE) { + /* Paired register CAS, i.e. 
CASP */ + UChar sz; +- IRCAS* cas = stmt->Ist.CAS.details; + IRType ty = typeOfIRExpr(env->type_env, cas->dataLo); + switch (ty) { + case Ity_I64: sz = 8; break; +diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c +index 25566c41c..2d82c41a1 100644 +--- a/VEX/priv/ir_defs.c ++++ b/VEX/priv/ir_defs.c +@@ -76,6 +76,7 @@ void ppIRConst ( const IRConst* con ) + case Ico_U16: vex_printf( "0x%x:I16", (UInt)(con->Ico.U16)); break; + case Ico_U32: vex_printf( "0x%x:I32", (UInt)(con->Ico.U32)); break; + case Ico_U64: vex_printf( "0x%llx:I64", (ULong)(con->Ico.U64)); break; ++ case Ico_U128: vex_printf( "I128{0x%04x}", (UInt)(con->Ico.U128)); break; + case Ico_F32: u.f32 = con->Ico.F32; + vex_printf( "F32{0x%x}", u.i32); + break; +@@ -2266,6 +2267,13 @@ IRConst* IRConst_U64 ( ULong u64 ) + c->Ico.U64 = u64; + return c; + } ++IRConst* IRConst_U128 ( UShort con ) ++{ ++ IRConst* c = LibVEX_Alloc_inline(sizeof(IRConst)); ++ c->tag = Ico_U128; ++ c->Ico.U128 = con; ++ return c; ++} + IRConst* IRConst_F32 ( Float f32 ) + { + IRConst* c = LibVEX_Alloc_inline(sizeof(IRConst)); +@@ -4230,6 +4238,7 @@ IRType typeOfIRConst ( const IRConst* con ) + case Ico_U16: return Ity_I16; + case Ico_U32: return Ity_I32; + case Ico_U64: return Ity_I64; ++ case Ico_U128: return Ity_I128; + case Ico_F32: return Ity_F32; + case Ico_F32i: return Ity_F32; + case Ico_F64: return Ity_F64; +@@ -5129,7 +5138,7 @@ void tcStmt ( const IRSB* bb, const IRStmt* stmt, IRType gWordTy ) + tyRes = typeOfIRTemp(tyenv, stmt->Ist.LLSC.result); + if (stmt->Ist.LLSC.storedata == NULL) { + /* it's a LL */ +- if (tyRes != Ity_I64 && tyRes != Ity_I32 ++ if (tyRes != Ity_I128 && tyRes != Ity_I64 && tyRes != Ity_I32 + && tyRes != Ity_I16 && tyRes != Ity_I8) + sanityCheckFail(bb,stmt,"Ist.LLSC(LL).result :: bogus"); + } else { +@@ -5137,7 +5146,7 @@ void tcStmt ( const IRSB* bb, const IRStmt* stmt, IRType gWordTy ) + if (tyRes != Ity_I1) + sanityCheckFail(bb,stmt,"Ist.LLSC(SC).result: not :: Ity_I1"); + tyData = typeOfIRExpr(tyenv, stmt->Ist.LLSC.storedata); +- if (tyData != Ity_I64 && tyData != Ity_I32 ++ if (tyData != Ity_I128 && tyData != Ity_I64 && tyData != Ity_I32 + && tyData != Ity_I16 && tyData != Ity_I8) + sanityCheckFail(bb,stmt, + "Ist.LLSC(SC).result :: storedata bogus"); +@@ -5385,6 +5394,7 @@ Int sizeofIRType ( IRType ty ) + IRType integerIRTypeOfSize ( Int szB ) + { + switch (szB) { ++ case 16: return Ity_I128; + case 8: return Ity_I64; + case 4: return Ity_I32; + case 2: return Ity_I16; +diff --git a/VEX/pub/libvex_guest_arm64.h b/VEX/pub/libvex_guest_arm64.h +index 39b6ecdc2..91d06bd75 100644 +--- a/VEX/pub/libvex_guest_arm64.h ++++ b/VEX/pub/libvex_guest_arm64.h +@@ -157,14 +157,18 @@ typedef + note of bits 23 and 22. */ + UInt guest_FPCR; + +- /* Fallback LL/SC support. See bugs 344524 and 369459. */ +- ULong guest_LLSC_SIZE; // 0==no current transaction, else 1,2,4 or 8. ++ /* Fallback LL/SC support. See bugs 344524 and 369459. _LO64 and _HI64 ++ contain the original contents of _ADDR+0 .. _ADDR+15, but only _SIZE ++ number of bytes of it. The remaining 16-_SIZE bytes of them must be ++ zero. */ ++ ULong guest_LLSC_SIZE; // 0==no current transaction, else 1,2,4,8 or 16. + ULong guest_LLSC_ADDR; // Address of transaction. +- ULong guest_LLSC_DATA; // Original value at _ADDR, zero-extended. ++ ULong guest_LLSC_DATA_LO64; // Original value at _ADDR+0. ++ ULong guest_LLSC_DATA_HI64; // Original value at _ADDR+8. 
+ + /* Padding to make it have an 16-aligned size */ + /* UInt pad_end_0; */ +- ULong pad_end_1; ++ /* ULong pad_end_1; */ + } + VexGuestARM64State; + +diff --git a/VEX/pub/libvex_ir.h b/VEX/pub/libvex_ir.h +index deaa044c1..85805bb69 100644 +--- a/VEX/pub/libvex_ir.h ++++ b/VEX/pub/libvex_ir.h +@@ -269,6 +269,8 @@ typedef + Ico_U16, + Ico_U32, + Ico_U64, ++ Ico_U128, /* 128-bit restricted integer constant, ++ same encoding scheme as V128 */ + Ico_F32, /* 32-bit IEEE754 floating */ + Ico_F32i, /* 32-bit unsigned int to be interpreted literally + as a IEEE754 single value. */ +@@ -295,6 +297,7 @@ typedef + UShort U16; + UInt U32; + ULong U64; ++ UShort U128; + Float F32; + UInt F32i; + Double F64; +@@ -311,6 +314,7 @@ extern IRConst* IRConst_U8 ( UChar ); + extern IRConst* IRConst_U16 ( UShort ); + extern IRConst* IRConst_U32 ( UInt ); + extern IRConst* IRConst_U64 ( ULong ); ++extern IRConst* IRConst_U128 ( UShort ); + extern IRConst* IRConst_F32 ( Float ); + extern IRConst* IRConst_F32i ( UInt ); + extern IRConst* IRConst_F64 ( Double ); +diff --git a/memcheck/mc_machine.c b/memcheck/mc_machine.c +index 919c7fae8..176c8e5cb 100644 +--- a/memcheck/mc_machine.c ++++ b/memcheck/mc_machine.c +@@ -1115,9 +1115,10 @@ static Int get_otrack_shadow_offset_wrk ( Int offset, Int szB ) + if (o == GOF(CMSTART) && sz == 8) return -1; // untracked + if (o == GOF(CMLEN) && sz == 8) return -1; // untracked + +- if (o == GOF(LLSC_SIZE) && sz == 8) return -1; // untracked +- if (o == GOF(LLSC_ADDR) && sz == 8) return o; +- if (o == GOF(LLSC_DATA) && sz == 8) return o; ++ if (o == GOF(LLSC_SIZE) && sz == 8) return -1; // untracked ++ if (o == GOF(LLSC_ADDR) && sz == 8) return o; ++ if (o == GOF(LLSC_DATA_LO64) && sz == 8) return o; ++ if (o == GOF(LLSC_DATA_HI64) && sz == 8) return o; + + VG_(printf)("MC_(get_otrack_shadow_offset)(arm64)(off=%d,sz=%d)\n", + offset,szB); +diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c +index c6fd2653f..72ccb3c8c 100644 +--- a/memcheck/mc_translate.c ++++ b/memcheck/mc_translate.c +@@ -5497,8 +5497,11 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce, + the address (shadow) to 'defined' following the test. */ + complainIfUndefined( mce, addr, guard ); + +- /* Now cook up a call to the relevant helper function, to read the +- data V bits from shadow memory. */ ++ /* Now cook up a call to the relevant helper function, to read the data V ++ bits from shadow memory. Note that I128 loads are done by pretending ++ we're doing a V128 load, and then converting the resulting V128 vbits ++ word to an I128, right at the end of this function -- see `castedToI128` ++ below. (It's only a minor hack :-) This pertains to bug 444399. */ + ty = shadowTypeV(ty); + + void* helper = NULL; +@@ -5511,6 +5514,7 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce, + hname = "MC_(helperc_LOADV256le)"; + ret_via_outparam = True; + break; ++ case Ity_I128: // fallthrough. See comment above. + case Ity_V128: helper = &MC_(helperc_LOADV128le); + hname = "MC_(helperc_LOADV128le)"; + ret_via_outparam = True; +@@ -5576,7 +5580,7 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce, + + /* We need to have a place to park the V bits we're just about to + read. */ +- IRTemp datavbits = newTemp(mce, ty, VSh); ++ IRTemp datavbits = newTemp(mce, ty == Ity_I128 ? Ity_V128 : ty, VSh); + + /* Here's the call. 
*/ + IRDirty* di; +@@ -5603,7 +5607,14 @@ IRAtom* expr2vbits_Load_WRK ( MCEnv* mce, + } + stmt( 'V', mce, IRStmt_Dirty(di) ); + +- return mkexpr(datavbits); ++ if (ty == Ity_I128) { ++ IRAtom* castedToI128 ++ = assignNew('V', mce, Ity_I128, ++ unop(Iop_ReinterpV128asI128, mkexpr(datavbits))); ++ return castedToI128; ++ } else { ++ return mkexpr(datavbits); ++ } + } + + +@@ -5631,6 +5642,7 @@ IRAtom* expr2vbits_Load ( MCEnv* mce, + case Ity_I16: + case Ity_I32: + case Ity_I64: ++ case Ity_I128: + case Ity_V128: + case Ity_V256: + return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard); +@@ -5928,6 +5940,7 @@ void do_shadow_Store ( MCEnv* mce, + c = IRConst_V256(V_BITS32_DEFINED); break; + case Ity_V128: // V128 weirdness -- used twice + c = IRConst_V128(V_BITS16_DEFINED); break; ++ case Ity_I128: c = IRConst_U128(V_BITS16_DEFINED); break; + case Ity_I64: c = IRConst_U64 (V_BITS64_DEFINED); break; + case Ity_I32: c = IRConst_U32 (V_BITS32_DEFINED); break; + case Ity_I16: c = IRConst_U16 (V_BITS16_DEFINED); break; +@@ -5948,6 +5961,7 @@ void do_shadow_Store ( MCEnv* mce, + switch (ty) { + case Ity_V256: /* we'll use the helper four times */ + case Ity_V128: /* we'll use the helper twice */ ++ case Ity_I128: /* we'll use the helper twice */ + case Ity_I64: helper = &MC_(helperc_STOREV64le); + hname = "MC_(helperc_STOREV64le)"; + break; +@@ -6051,9 +6065,9 @@ void do_shadow_Store ( MCEnv* mce, + stmt( 'V', mce, IRStmt_Dirty(diQ3) ); + + } +- else if (UNLIKELY(ty == Ity_V128)) { ++ else if (UNLIKELY(ty == Ity_V128 || ty == Ity_I128)) { + +- /* V128-bit case */ ++ /* V128/I128-bit case */ + /* See comment in next clause re 64-bit regparms */ + /* also, need to be careful about endianness */ + +@@ -6062,6 +6076,7 @@ void do_shadow_Store ( MCEnv* mce, + IRAtom *addrLo64, *addrHi64; + IRAtom *vdataLo64, *vdataHi64; + IRAtom *eBiasLo64, *eBiasHi64; ++ IROp opGetLO64, opGetHI64; + + if (end == Iend_LE) { + offLo64 = 0; +@@ -6071,9 +6086,17 @@ void do_shadow_Store ( MCEnv* mce, + offHi64 = 0; + } + ++ if (ty == Ity_V128) { ++ opGetLO64 = Iop_V128to64; ++ opGetHI64 = Iop_V128HIto64; ++ } else { ++ opGetLO64 = Iop_128to64; ++ opGetHI64 = Iop_128HIto64; ++ } ++ + eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64); + addrLo64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) ); +- vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata)); ++ vdataLo64 = assignNew('V', mce, Ity_I64, unop(opGetLO64, vdata)); + diLo64 = unsafeIRDirty_0_N( + 1/*regparms*/, + hname, VG_(fnptr_to_fnentry)( helper ), +@@ -6081,7 +6104,7 @@ void do_shadow_Store ( MCEnv* mce, + ); + eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64); + addrHi64 = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) ); +- vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata)); ++ vdataHi64 = assignNew('V', mce, Ity_I64, unop(opGetHI64, vdata)); + diHi64 = unsafeIRDirty_0_N( + 1/*regparms*/, + hname, VG_(fnptr_to_fnentry)( helper ), +@@ -6888,7 +6911,7 @@ static void do_shadow_LLSC ( MCEnv* mce, + /* Just treat this as a normal load, followed by an assignment of + the value to .result. 
*/ + /* Stay sane */ +- tl_assert(resTy == Ity_I64 || resTy == Ity_I32 ++ tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32 + || resTy == Ity_I16 || resTy == Ity_I8); + assign( 'V', mce, resTmp, + expr2vbits_Load( +@@ -6899,7 +6922,7 @@ static void do_shadow_LLSC ( MCEnv* mce, + /* Stay sane */ + IRType dataTy = typeOfIRExpr(mce->sb->tyenv, + stStoredata); +- tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32 ++ tl_assert(dataTy == Ity_I128 || dataTy == Ity_I64 || dataTy == Ity_I32 + || dataTy == Ity_I16 || dataTy == Ity_I8); + do_shadow_Store( mce, stEnd, + stAddr, 0/* addr bias */, +@@ -7684,7 +7707,7 @@ static void schemeS ( MCEnv* mce, IRStmt* st ) + = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result); + IRExpr* vanillaLoad + = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr); +- tl_assert(resTy == Ity_I64 || resTy == Ity_I32 ++ tl_assert(resTy == Ity_I128 || resTy == Ity_I64 || resTy == Ity_I32 + || resTy == Ity_I16 || resTy == Ity_I8); + assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result), + schemeE(mce, vanillaLoad)); +diff --git a/memcheck/tests/Makefile.am b/memcheck/tests/Makefile.am +index 449710020..2b43ef7d7 100644 +--- a/memcheck/tests/Makefile.am ++++ b/memcheck/tests/Makefile.am +@@ -90,6 +90,7 @@ EXTRA_DIST = \ + addressable.stderr.exp addressable.stdout.exp addressable.vgtest \ + atomic_incs.stderr.exp atomic_incs.vgtest \ + atomic_incs.stdout.exp-32bit atomic_incs.stdout.exp-64bit \ ++ atomic_incs.stdout.exp-64bit-and-128bit \ + badaddrvalue.stderr.exp \ + badaddrvalue.stdout.exp badaddrvalue.vgtest \ + exit_on_first_error.stderr.exp \ +diff --git a/memcheck/tests/atomic_incs.c b/memcheck/tests/atomic_incs.c +index f931750f4..1c738c530 100644 +--- a/memcheck/tests/atomic_incs.c ++++ b/memcheck/tests/atomic_incs.c +@@ -22,6 +22,17 @@ + #define NNN 3456987 + + #define IS_8_ALIGNED(_ptr) (0 == (((unsigned long)(_ptr)) & 7)) ++#define IS_16_ALIGNED(_ptr) (0 == (((unsigned long)(_ptr)) & 15)) ++ ++// U128 from libvex_basictypes.h is a 4-x-UInt array, which is a bit ++// inconvenient, hence: ++typedef ++ struct { ++ // assuming little-endianness ++ unsigned long long int lo64; ++ unsigned long long int hi64; ++ } ++ MyU128; + + + __attribute__((noinline)) void atomic_add_8bit ( char* p, int n ) +@@ -712,6 +723,40 @@ __attribute__((noinline)) void atomic_add_64bit ( long long int* p, int n ) + #endif + } + ++__attribute__((noinline)) void atomic_add_128bit ( MyU128* p, ++ unsigned long long int n ) ++{ ++#if defined(VGA_x86) || defined(VGA_ppc32) || defined(VGA_mips32) \ ++ || defined (VGA_nanomips) || defined(VGA_mips64) \ ++ || defined(VGA_amd64) \ ++ || defined(VGA_ppc64be) || defined(VGA_ppc64le) \ ++ || defined(VGA_arm) \ ++ || defined(VGA_s390x) ++ /* do nothing; is not supported */ ++#elif defined(VGA_arm64) ++ unsigned long long int block[3] ++ = { (unsigned long long int)p, (unsigned long long int)n, ++ 0xFFFFFFFFFFFFFFFFULL}; ++ do { ++ __asm__ __volatile__( ++ "mov x5, %0" "\n\t" // &block[0] ++ "ldr x9, [x5, #0]" "\n\t" // p ++ "ldr x10, [x5, #8]" "\n\t" // n ++ "ldxp x7, x8, [x9]" "\n\t" ++ "adds x7, x7, x10" "\n\t" ++ "adc x8, x8, xzr" "\n\t" ++ "stxp w4, x7, x8, [x9]" "\n\t" ++ "str x4, [x5, #16]" "\n\t" ++ : /*out*/ ++ : /*in*/ "r"(&block[0]) ++ : /*trash*/ "memory", "cc", "x5", "x7", "x8", "x9", "x10", "x4" ++ ); ++ } while (block[2] != 0); ++#else ++# error "Unsupported arch" ++#endif ++} ++ + int main ( int argc, char** argv ) + { + int i, status; +@@ -720,8 +765,12 @@ int main ( int argc, char** argv ) + short* p16; + int* p32; 
+ long long int* p64; ++ MyU128* p128; + pid_t child, p2; + ++ assert(sizeof(MyU128) == 16); ++ assert(sysconf(_SC_PAGESIZE) >= 4096); ++ + printf("parent, pre-fork\n"); + + page = mmap( 0, sysconf(_SC_PAGESIZE), +@@ -736,11 +785,13 @@ int main ( int argc, char** argv ) + p16 = (short*)(page+256); + p32 = (int*)(page+512); + p64 = (long long int*)(page+768); ++ p128 = (MyU128*)(page+1024); + + assert( IS_8_ALIGNED(p8) ); + assert( IS_8_ALIGNED(p16) ); + assert( IS_8_ALIGNED(p32) ); + assert( IS_8_ALIGNED(p64) ); ++ assert( IS_16_ALIGNED(p128) ); + + memset(page, 0, 1024); + +@@ -748,6 +799,7 @@ int main ( int argc, char** argv ) + *p16 = 0; + *p32 = 0; + *p64 = 0; ++ p128->lo64 = p128->hi64 = 0; + + child = fork(); + if (child == -1) { +@@ -763,6 +815,7 @@ int main ( int argc, char** argv ) + atomic_add_16bit(p16, 1); + atomic_add_32bit(p32, 1); + atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */ ++ atomic_add_128bit(p128, 0x1000000013374771ULL); // ditto re upper 64 + } + return 1; + /* NOTREACHED */ +@@ -778,6 +831,7 @@ int main ( int argc, char** argv ) + atomic_add_16bit(p16, 1); + atomic_add_32bit(p32, 1); + atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */ ++ atomic_add_128bit(p128, 0x1000000013374771ULL); // ditto re upper 64 + } + + p2 = waitpid(child, &status, 0); +@@ -788,11 +842,17 @@ int main ( int argc, char** argv ) + + printf("FINAL VALUES: 8 bit %d, 16 bit %d, 32 bit %d, 64 bit %lld\n", + (int)(*(signed char*)p8), (int)(*p16), *p32, *p64 ); ++ printf(" 128 bit 0x%016llx:0x%016llx\n", ++ p128->hi64, p128->lo64); + + if (-74 == (int)(*(signed char*)p8) + && 32694 == (int)(*p16) + && 6913974 == *p32 +- && (0LL == *p64 || 682858642110LL == *p64)) { ++ && (0LL == *p64 || 682858642110LL == *p64) ++ && ((0 == p128->hi64 && 0 == p128->lo64) ++ || (0x00000000000697fb == p128->hi64 ++ && 0x6007eb426316d956ULL == p128->lo64)) ++ ) { + printf("PASS\n"); + } else { + printf("FAIL -- see source code for expected values\n"); +diff --git a/memcheck/tests/atomic_incs.stdout.exp-32bit b/memcheck/tests/atomic_incs.stdout.exp-32bit +index c5b8781e5..55e5044b5 100644 +--- a/memcheck/tests/atomic_incs.stdout.exp-32bit ++++ b/memcheck/tests/atomic_incs.stdout.exp-32bit +@@ -3,5 +3,6 @@ child + parent, pre-fork + parent + FINAL VALUES: 8 bit -74, 16 bit 32694, 32 bit 6913974, 64 bit 0 ++ 128 bit 0x0000000000000000:0x0000000000000000 + PASS + parent exits +diff --git a/memcheck/tests/atomic_incs.stdout.exp-64bit b/memcheck/tests/atomic_incs.stdout.exp-64bit +index 82405c520..ca2f4fc97 100644 +--- a/memcheck/tests/atomic_incs.stdout.exp-64bit ++++ b/memcheck/tests/atomic_incs.stdout.exp-64bit +@@ -3,5 +3,6 @@ child + parent, pre-fork + parent + FINAL VALUES: 8 bit -74, 16 bit 32694, 32 bit 6913974, 64 bit 682858642110 ++ 128 bit 0x0000000000000000:0x0000000000000000 + PASS + parent exits +diff --git a/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit b/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit +new file mode 100644 +index 000000000..ef6580917 +--- /dev/null ++++ b/memcheck/tests/atomic_incs.stdout.exp-64bit-and-128bit +@@ -0,0 +1,8 @@ ++parent, pre-fork ++child ++parent, pre-fork ++parent ++FINAL VALUES: 8 bit -74, 16 bit 32694, 32 bit 6913974, 64 bit 682858642110 ++ 128 bit 0x00000000000697fb:0x6007eb426316d956 ++PASS ++parent exits +diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am +index 00cbfa52c..9efb49b27 100644 +--- a/none/tests/arm64/Makefile.am ++++ b/none/tests/arm64/Makefile.am +@@ -12,7 +12,10 @@ EXTRA_DIST = 
\ + atomics_v81.stdout.exp atomics_v81.stderr.exp atomics_v81.vgtest \ + simd_v81.stdout.exp simd_v81.stderr.exp simd_v81.vgtest \ + fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \ +- fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp fp_and_simd_v82.vgtest ++ fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \ ++ fp_and_simd_v82.vgtest \ ++ ldxp_stxp.stdout.exp ldxp_stxp.stderr.exp \ ++ ldxp_stxp_basisimpl.vgtest ldxp_stxp_fallbackimpl.vgtest + + check_PROGRAMS = \ + allexec \ +@@ -20,7 +23,8 @@ check_PROGRAMS = \ + fp_and_simd \ + integer \ + memory \ +- fmadd_sub ++ fmadd_sub \ ++ ldxp_stxp + + if BUILD_ARMV8_CRC_TESTS + check_PROGRAMS += crc32 +diff --git a/none/tests/arm64/ldxp_stxp.c b/none/tests/arm64/ldxp_stxp.c +new file mode 100644 +index 000000000..b5f6ea121 +--- /dev/null ++++ b/none/tests/arm64/ldxp_stxp.c +@@ -0,0 +1,93 @@ ++ ++/* Note, this is only a basic smoke test of LD{A}XP and ST{L}XP. Their ++ atomicity properties are tested by memcheck/tests/atomic_incs.c. */ ++ ++#include ++#include ++#include ++#include ++ ++typedef unsigned int UInt; ++typedef unsigned long long int ULong; ++ ++ ++void initBlock ( ULong* block ) ++{ ++ block[0] = 0x0001020304050607ULL; ++ block[1] = 0x1011121314151617ULL; ++ block[2] = 0x2021222324252627ULL; ++ block[3] = 0x3031323334353637ULL; ++ block[4] = 0x4041424344454647ULL; ++ block[5] = 0x5051525354555657ULL; ++} ++ ++void printBlock ( const char* who, ++ ULong* block, ULong rt1contents, ULong rt2contents, ++ UInt zeroIfSuccess ) ++{ ++ printf("Block %s (%s)\n", who, zeroIfSuccess == 0 ? "success" : "FAILURE" ); ++ for (int i = 0; i < 6; i++) { ++ printf("0x%016llx\n", block[i]); ++ } ++ printf("0x%016llx rt1contents\n", rt1contents); ++ printf("0x%016llx rt2contents\n", rt2contents); ++ printf("\n"); ++} ++ ++int main ( void ) ++{ ++ ULong* block = memalign(16, 6 * sizeof(ULong)); ++ assert(block); ++ ++ ULong rt1in, rt2in, rt1out, rt2out; ++ UInt scRes; ++ ++ // Do ldxp then stxp with x-registers ++ initBlock(block); ++ rt1in = 0x5555666677778888ULL; ++ rt2in = 0xAAAA9999BBBB0000ULL; ++ rt1out = 0x1111222233334444ULL; ++ rt2out = 0xFFFFEEEEDDDDCCCCULL; ++ scRes = 0x55555555; ++ __asm__ __volatile__( ++ "ldxp %1, %2, [%5]" "\n\t" ++ "stxp %w0, %3, %4, [%5]" "\n\t" ++ : /*OUT*/ ++ "=&r"(scRes), // %0 ++ "=&r"(rt1out), // %1 ++ "=&r"(rt2out) // %2 ++ : /*IN*/ ++ "r"(rt1in), // %3 ++ "r"(rt2in), // %4 ++ "r"(&block[2]) // %5 ++ : /*TRASH*/ ++ "memory","cc" ++ ); ++ printBlock("after ldxp/stxp 2x64-bit", block, rt1out, rt2out, scRes); ++ ++ // Do ldxp then stxp with w-registers ++ initBlock(block); ++ rt1in = 0x5555666677778888ULL; ++ rt2in = 0xAAAA9999BBBB0000ULL; ++ rt1out = 0x1111222233334444ULL; ++ rt2out = 0xFFFFEEEEDDDDCCCCULL; ++ scRes = 0x55555555; ++ __asm__ __volatile__( ++ "ldxp %w1, %w2, [%5]" "\n\t" ++ "stxp %w0, %w3, %w4, [%5]" "\n\t" ++ : /*OUT*/ ++ "=&r"(scRes), // %0 ++ "=&r"(rt1out), // %1 ++ "=&r"(rt2out) // %2 ++ : /*IN*/ ++ "r"(rt1in), // %3 ++ "r"(rt2in), // %4 ++ "r"(&block[2]) // %5 ++ : /*TRASH*/ ++ "memory","cc" ++ ); ++ printBlock("after ldxp/stxp 2x32-bit", block, rt1out, rt2out, scRes); ++ ++ free(block); ++ return 0; ++} +diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.stderr.exp b/none/tests/arm64/ldxp_stxp_basisimpl.stderr.exp +new file mode 100644 +index 000000000..e69de29bb +diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp b/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp +new file mode 100644 +index 000000000..f269ecdcc +--- /dev/null ++++ 
b/none/tests/arm64/ldxp_stxp_basisimpl.stdout.exp +@@ -0,0 +1,20 @@ ++Block after ldxp/stxp 2x64-bit (success) ++0x0001020304050607 ++0x1011121314151617 ++0x5555666677778888 ++0xaaaa9999bbbb0000 ++0x4041424344454647 ++0x5051525354555657 ++0x2021222324252627 rt1contents ++0x3031323334353637 rt2contents ++ ++Block after ldxp/stxp 2x32-bit (success) ++0x0001020304050607 ++0x1011121314151617 ++0xbbbb000077778888 ++0x3031323334353637 ++0x4041424344454647 ++0x5051525354555657 ++0x0000000024252627 rt1contents ++0x0000000020212223 rt2contents ++ +diff --git a/none/tests/arm64/ldxp_stxp_basisimpl.vgtest b/none/tests/arm64/ldxp_stxp_basisimpl.vgtest +new file mode 100644 +index 000000000..29133729a +--- /dev/null ++++ b/none/tests/arm64/ldxp_stxp_basisimpl.vgtest +@@ -0,0 +1,2 @@ ++prog: ldxp_stxp ++vgopts: -q +diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.stderr.exp b/none/tests/arm64/ldxp_stxp_fallbackimpl.stderr.exp +new file mode 100644 +index 000000000..e69de29bb +diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp b/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp +new file mode 100644 +index 000000000..f269ecdcc +--- /dev/null ++++ b/none/tests/arm64/ldxp_stxp_fallbackimpl.stdout.exp +@@ -0,0 +1,20 @@ ++Block after ldxp/stxp 2x64-bit (success) ++0x0001020304050607 ++0x1011121314151617 ++0x5555666677778888 ++0xaaaa9999bbbb0000 ++0x4041424344454647 ++0x5051525354555657 ++0x2021222324252627 rt1contents ++0x3031323334353637 rt2contents ++ ++Block after ldxp/stxp 2x32-bit (success) ++0x0001020304050607 ++0x1011121314151617 ++0xbbbb000077778888 ++0x3031323334353637 ++0x4041424344454647 ++0x5051525354555657 ++0x0000000024252627 rt1contents ++0x0000000020212223 rt2contents ++ +diff --git a/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest b/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest +new file mode 100644 +index 000000000..474282a03 +--- /dev/null ++++ b/none/tests/arm64/ldxp_stxp_fallbackimpl.vgtest +@@ -0,0 +1,2 @@ ++prog: ldxp_stxp ++vgopts: -q --sim-hints=fallback-llsc + +commit 0d38ca5dd6b446c70738031132d41f09de0f7a8a +Author: Julian Seward +Date: Fri Nov 12 13:08:45 2021 +0100 + + Bug 444399 - disInstr(arm64): unhandled instruction 0xC87F2D89 (LD{,A}XP and ST{,L}XP). FOLLOWUP FIX. + + This is an attempt to un-break 'make dist', as broken by the main commit for + this bug, which was 530df882b8f60ecacaf2b9b8a719f7ea1c1d1650. 
+ +diff --git a/none/tests/arm64/Makefile.am b/none/tests/arm64/Makefile.am +index 9efb49b27..4a06f0996 100644 +--- a/none/tests/arm64/Makefile.am ++++ b/none/tests/arm64/Makefile.am +@@ -14,8 +14,10 @@ EXTRA_DIST = \ + fmadd_sub.stdout.exp fmadd_sub.stderr.exp fmadd_sub.vgtest \ + fp_and_simd_v82.stdout.exp fp_and_simd_v82.stderr.exp \ + fp_and_simd_v82.vgtest \ +- ldxp_stxp.stdout.exp ldxp_stxp.stderr.exp \ +- ldxp_stxp_basisimpl.vgtest ldxp_stxp_fallbackimpl.vgtest ++ ldxp_stxp_basisimpl.stdout.exp ldxp_stxp_basisimpl.stderr.exp \ ++ ldxp_stxp_basisimpl.vgtest \ ++ ldxp_stxp_fallbackimpl.stdout.exp ldxp_stxp_fallbackimpl.stderr.exp \ ++ ldxp_stxp_fallbackimpl.vgtest + + check_PROGRAMS = \ + allexec \ diff --git a/valgrind.spec b/valgrind.spec index 337279f..01ecfe8 100644 --- a/valgrind.spec +++ b/valgrind.spec @@ -111,6 +111,9 @@ Patch12: valgrind-3.18.1-rust-v0-demangle.patch # KDE#445354 arm64 backend: incorrect code emitted for doubleword CAS Patch13: valgrind-3.18.1-arm64-doubleword-cas.patch +# KDE#444399 arm64: unhandled instruction LD{,A}XP and ST{,L}XP +Patch14: valgrind-3.18.1-arm64-ldaxp-stlxp.patch + BuildRequires: make BuildRequires: glibc-devel @@ -256,6 +259,7 @@ Valgrind User Manual for details. %patch11 -p1 %patch12 -p1 %patch13 -p1 +%patch14 -p1 %build # LTO triggers undefined symbols in valgrind. Valgrind has a --enable-lto @@ -491,6 +495,7 @@ fi - Add valgrind-3.18.1-gdbserver_tests-hwcap.patch - Add valgrind-3.18.1-rust-v0-demangle.patch - Add valgrind-3.18.1-arm64-doubleword-cas.patch +- Add valgrind-3.18.1-arm64-ldaxp-stlxp.patch * Mon Nov 1 2021 Mark Wielaard - 3.18.1-2 - Add valgrind-3.18.1-dhat-tests-copy.patch
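
As background for the patch above: the net effect of its fallback LL/SC scheme
for LDXP/STXP can be summarised in plain C. The following is a sketch under
stated assumptions (little-endian host, GCC/Clang with 16-byte
__atomic_compare_exchange_n support, e.g. via libatomic), with illustrative
names, not the actual VEX implementation, which works on IR and fails the SC
via IRStmt_Exit side-exits:

   #include <stdbool.h>
   #include <stdint.h>

   typedef struct {
      uint64_t llsc_size;      /* 0 == no transaction in progress */
      uint64_t llsc_addr;
      uint64_t llsc_data_lo64; /* original contents of addr+0 */
      uint64_t llsc_data_hi64; /* original contents of addr+8 */
   } GuestState;               /* stand-in for VexGuestARM64State */

   /* LDXP xT1, xT2, [addr]: record the reservation, return the data.
      addr must be 16-byte aligned, as the architecture requires. */
   static void fallback_ldxp(GuestState* g, const uint64_t* addr,
                             uint64_t* t1, uint64_t* t2)
   {
      g->llsc_data_lo64 = addr[0];   /* Rt1 takes the lower address */
      g->llsc_data_hi64 = addr[1];
      g->llsc_addr      = (uint64_t)(uintptr_t)addr;
      g->llsc_size      = 16;
      *t1 = g->llsc_data_lo64;
      *t2 = g->llsc_data_hi64;
   }

   /* STXP wS, xT1, xT2, [addr]: returns 0 on success, 1 on failure,
      matching the architected status-register convention. */
   static uint64_t fallback_stxp(GuestState* g, uint64_t* addr,
                                 uint64_t t1, uint64_t t2)
   {
      uint64_t size = g->llsc_size;
      g->llsc_size = 0;              /* the reservation is consumed */
      if (size != 16 || g->llsc_addr != (uint64_t)(uintptr_t)addr)
         return 1;                   /* no, or mismatched, transaction */
      unsigned __int128 expd = ((unsigned __int128)g->llsc_data_hi64 << 64)
                               | g->llsc_data_lo64;
      unsigned __int128 data = ((unsigned __int128)t2 << 64) | t1;
      /* The patch performs this step as a 128-bit IRStmt_CAS, relying on
         the doubleword CAS backend support fixed in bug 445354. */
      bool ok = __atomic_compare_exchange_n((unsigned __int128*)addr,
                                            &expd, data, /*weak*/false,
                                            __ATOMIC_SEQ_CST,
                                            __ATOMIC_SEQ_CST);
      return ok ? 0 : 1;
   }

The essential point, as the commit message notes, is that Valgrind serialises
thread execution, so a stashed copy of the loaded data plus a CAS at SC time
models the reservation faithfully, provided the scheduler invalidates it
(clears LLSC_SIZE) at every thread switch.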