Only the arm64 part; the MIPS hunks of the upstream commit have been removed from this patch.

commit 6b72dc54b722af5f6a87ebe258d3da6bcba059b7
Author: Julian Seward <jseward@acm.org>
Date:   Mon Apr 24 09:23:43 2017 +0000

    Bug 369459 - valgrind on arm64 violates the ARMv8 spec (ldxr/stxr)
    
    This implements a fallback LL/SC implementation as described in bug 344524.
    
    The fallback implementation is not enabled by default, and there is no
    auto-detection for when it should be used.  To use it, run with the
    flag --sim-hints=fallback-llsc.  This commit also allows the existing
    MIPS fallback implementation to be enabled with that flag.
    
    VEX side changes:
    
    * priv/main_main.c, pub/libvex.h

      Adds a new field, guest__use_fallback_LLSC, to VexAbiInfo.

    * pub/libvex_guest_arm64.h, priv/guest_arm64_toIR.c

      Adds front end support and new guest state fields
      guest_LLSC_{SIZE,ADDR,DATA}; also documents the scheme.

    * priv/guest_mips_toIR.c

      Allows manual selection of the fallback implementation via
      --sim-hints=fallback-llsc.

    * priv/host_arm64_defs.c, priv/host_arm64_defs.h, priv/host_arm64_isel.c

      Adds support for generating CAS on arm64, as needed by the front
      end changes.
    
    
    
    git-svn-id: svn://svn.valgrind.org/vex/trunk@3352
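
For reference, on a Valgrind build containing this change the fallback
implementation is selected on the command line like so (./myprog stands
in for any guest program):

    valgrind --sim-hints=fallback-llsc ./myprog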

diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
index 088af55..421db37 100644
--- a/VEX/priv/guest_arm64_toIR.c
+++ b/VEX/priv/guest_arm64_toIR.c
@@ -1147,6 +1147,10 @@ static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
 #define OFFB_CMSTART  offsetof(VexGuestARM64State,guest_CMSTART)
 #define OFFB_CMLEN    offsetof(VexGuestARM64State,guest_CMLEN)
 
+#define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE)
+#define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR)
+#define OFFB_LLSC_DATA offsetof(VexGuestARM64State,guest_LLSC_DATA)
+
 
 /* ---------------- Integer registers ---------------- */
 
@@ -4702,7 +4706,9 @@ const HChar* nameArr_Q_SZ ( UInt bitQ, UInt size )
 
 
 static
-Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn)
+Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
+                          const VexAbiInfo* abiinfo
+)
 {
 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
 
@@ -6457,6 +6463,32 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn)
       sz 001000 000 s     0 11111 n t   STX{R,RH,RB}  Ws, Rt, [Xn|SP]
       sz 001000 000 s     1 11111 n t   STLX{R,RH,RB} Ws, Rt, [Xn|SP]
    */
+   /* For the "standard" implementation we pass through the LL and SC to
+      the host.  For the "fallback" implementation, for details see
+        https://bugs.kde.org/show_bug.cgi?id=344524 and
+        https://bugs.kde.org/show_bug.cgi?id=369459,
+      but in short:
+
+      LoadLinked(addr)
+        gs.LLsize = load_size // 1, 2, 4 or 8
+        gs.LLaddr = addr
+        gs.LLdata = zeroExtend(*addr)
+
+      StoreCond(addr, data)
+        tmp_LLsize = gs.LLsize
+        gs.LLsize = 0 // "no transaction"
+        if tmp_LLsize != store_size        -> fail
+        if addr != gs.LLaddr               -> fail
+        if zeroExtend(*addr) != gs.LLdata  -> fail
+        cas_ok = CAS(store_size, addr, gs.LLdata -> data)
+        if !cas_ok                         -> fail
+        succeed
+
+      When thread scheduled
+        gs.LLsize = 0 // "no transaction"
+        (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while()
+         has to do this bit)
+   */   
    if (INSN(29,23) == BITS7(0,0,1,0,0,0,0)
        && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
        && INSN(14,10) == BITS5(1,1,1,1,1)) {
@@ -6478,29 +6510,99 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn)
 
       if (isLD && ss == BITS5(1,1,1,1,1)) {
          IRTemp res = newTemp(ty);
-         stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
-         putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
+         if (abiinfo->guest__use_fallback_LLSC) {
+            // Do the load first so we don't update any guest state
+            // if it faults.
+            IRTemp loaded_data64 = newTemp(Ity_I64);
+            assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea))));
+            stmt( IRStmt_Put( OFFB_LLSC_DATA, mkexpr(loaded_data64) ));
+            stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
+            stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) ));
+            putIReg64orZR(tt, mkexpr(loaded_data64));
+         } else {
+            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
+            putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
+         }
          if (isAcqOrRel) {
             stmt(IRStmt_MBE(Imbe_Fence));
          }
-         DIP("ld%sx%s %s, [%s]\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
-             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
+         DIP("ld%sx%s %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
+             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
+             abiinfo->guest__use_fallback_LLSC
+                ? "(fallback implementation)" : "");
          return True;
       }
       if (!isLD) {
          if (isAcqOrRel) {
             stmt(IRStmt_MBE(Imbe_Fence));
          }
-         IRTemp  res  = newTemp(Ity_I1);
          IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
-         stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
-         /* IR semantics: res is 1 if store succeeds, 0 if it fails.
-            Need to set rS to 1 on failure, 0 on success. */
-         putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
-                                            mkU64(1)));
-         DIP("st%sx%s %s, %s, [%s]\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
+         if (abiinfo->guest__use_fallback_LLSC) {
+            // This is really ugly, since we don't have any way to do
+            // proper if-then-else.  First, set up as if the SC failed,
+            // and jump forwards if it really has failed.
+
+            // Continuation address
+            IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4);
+
+            // "the SC failed".  Any non-zero value means failure.
+            putIReg64orZR(ss, mkU64(1));
+          
+            IRTemp tmp_LLsize = newTemp(Ity_I64);
+            assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64));
+            stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction"
+            ));
+            // Fail if no or wrong-size transaction
+            vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
+            stmt( IRStmt_Exit(
+                     binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(szB)),
+                     Ijk_Boring, nia, OFFB_PC
+            ));
+            // Fail if the address doesn't match the LL address
+            stmt( IRStmt_Exit(
+                      binop(Iop_CmpNE64, mkexpr(ea),
+                                         IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)),
+                      Ijk_Boring, nia, OFFB_PC
+            ));
+            // Fail if the data doesn't match the LL data
+            IRTemp llsc_data64 = newTemp(Ity_I64);
+            assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA, Ity_I64));
+            stmt( IRStmt_Exit(
+                      binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))),
+                                         mkexpr(llsc_data64)),
+                      Ijk_Boring, nia, OFFB_PC
+            ));
+            // Try to CAS the new value in.
+            IRTemp old = newTemp(ty);
+            IRTemp expd = newTemp(ty);
+            assign(expd, narrowFrom64(ty, mkexpr(llsc_data64)));
+            stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
+                                     Iend_LE, mkexpr(ea),
+                                     /*expdHi*/NULL, mkexpr(expd),
+                                     /*dataHi*/NULL, data
+            )));
+            // Fail if the CAS failed (viz, old != expd)
+            stmt( IRStmt_Exit(
+                      binop(Iop_CmpNE64,
+                            widenUto64(ty, mkexpr(old)),
+                            widenUto64(ty, mkexpr(expd))),
+                      Ijk_Boring, nia, OFFB_PC
+            ));
+            // Otherwise we succeeded (!)
+            putIReg64orZR(ss, mkU64(0));
+         } else {
+            IRTemp res = newTemp(Ity_I1);
+            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
+            /* IR semantics: res is 1 if store succeeds, 0 if it fails.
+               Need to set rS to 1 on failure, 0 on success. */
+            putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
+                                               mkU64(1)));
+         }
+         DIP("st%sx%s %s, %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
              nameIRegOrZR(False, ss),
-             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
+             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
+             abiinfo->guest__use_fallback_LLSC
+                ? "(fallback implementation)" : "");
          return True;
       }
       /* else fall through */
@@ -6589,7 +6691,8 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn)
 
 static
 Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
-                          const VexArchInfo* archinfo)
+                          const VexArchInfo* archinfo,
+                          const VexAbiInfo* abiinfo)
 {
 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
 
@@ -7048,7 +7151,11 @@ Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
       /* AFAICS, this simply cancels a (all?) reservations made by a
          (any?) preceding LDREX(es).  Arrange to hand it through to
          the back end. */
-      stmt( IRStmt_MBE(Imbe_CancelReservation) );
+      if (abiinfo->guest__use_fallback_LLSC) {
+         stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) )); // "no transaction"
+      } else {
+         stmt( IRStmt_MBE(Imbe_CancelReservation) );
+      }
       DIP("clrex #%u\n", mm);
       return True;
    }
@@ -14411,12 +14518,12 @@ Bool disInstr_ARM64_WRK (
          break;
       case BITS4(1,0,1,0): case BITS4(1,0,1,1):
          // Branch, exception generation and system instructions
-         ok = dis_ARM64_branch_etc(dres, insn, archinfo);
+         ok = dis_ARM64_branch_etc(dres, insn, archinfo, abiinfo);
          break;
       case BITS4(0,1,0,0): case BITS4(0,1,1,0):
       case BITS4(1,1,0,0): case BITS4(1,1,1,0):
          // Loads and stores
-         ok = dis_ARM64_load_store(dres, insn);
+         ok = dis_ARM64_load_store(dres, insn, abiinfo);
          break;
       case BITS4(0,1,0,1): case BITS4(1,1,0,1):
          // Data processing - register
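
The LL/SC scheme documented in the comment added to dis_ARM64_load_store
above can be modelled in plain C.  The following is an illustrative
sketch only, restricted to 64-bit accesses: the gs_llsc_* globals stand
in for the new guest_LLSC_{SIZE,ADDR,DATA} fields, and a GCC/clang
atomic builtin stands in for the generated CAS.

    #include <stdint.h>

    static uint64_t gs_llsc_size, gs_llsc_addr, gs_llsc_data;

    /* LoadLinked: record size, address and (zero-extended) data. */
    static uint64_t fallback_ll64 ( uint64_t* addr ) {
       uint64_t val = *addr;
       gs_llsc_size = 8;
       gs_llsc_addr = (uint64_t)(uintptr_t)addr;
       gs_llsc_data = val;
       return val;
    }

    /* StoreCond: returns 1 on success, 0 on failure, mirroring the
       pseudocode in the comment. */
    static int fallback_sc64 ( uint64_t* addr, uint64_t data ) {
       uint64_t tmp_size = gs_llsc_size;
       uint64_t expd     = gs_llsc_data;
       gs_llsc_size = 0;                              /* "no transaction" */
       if (tmp_size != 8)                             return 0;
       if ((uint64_t)(uintptr_t)addr != gs_llsc_addr) return 0;
       if (*addr != expd)                             return 0;
       return __atomic_compare_exchange_n(addr, &expd, data, 0/*strong*/,
                                          __ATOMIC_SEQ_CST,
                                          __ATOMIC_SEQ_CST);
    }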
diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c
index cc7c832..c9affbd 100644
--- a/VEX/priv/host_arm64_defs.c
+++ b/VEX/priv/host_arm64_defs.c
@@ -1005,6 +1005,13 @@ ARM64Instr* ARM64Instr_StrEX ( Int szB ) {
    vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
    return i;
 }
+ARM64Instr* ARM64Instr_CAS ( Int szB ) {
+   ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
+   i->tag             = ARM64in_CAS;
+   i->ARM64in.CAS.szB = szB;
+   vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
+   return i;
+}
 ARM64Instr* ARM64Instr_MFence ( void ) {
    ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
    i->tag        = ARM64in_MFence;
@@ -1569,6 +1576,10 @@ void ppARM64Instr ( const ARM64Instr* i ) {
                     sz, i->ARM64in.StrEX.szB == 8 ? 'x' : 'w');
          return;
       }
+      case ARM64in_CAS: {
+         vex_printf("x1 = cas(%dbit)(x3, x5 -> x7)", 8 * i->ARM64in.CAS.szB);
+         return;
+      }
       case ARM64in_MFence:
          vex_printf("(mfence) dsb sy; dmb sy; isb");
          return;
@@ -2064,6 +2075,14 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 )
          addHRegUse(u, HRmWrite, hregARM64_X0());
          addHRegUse(u, HRmRead, hregARM64_X2());
          return;
+      case ARM64in_CAS:
+         addHRegUse(u, HRmRead, hregARM64_X3());
+         addHRegUse(u, HRmRead, hregARM64_X5());
+         addHRegUse(u, HRmRead, hregARM64_X7());
+         addHRegUse(u, HRmWrite, hregARM64_X1());
+         /* Pointless to state this since X8 is not available to RA. */
+         addHRegUse(u, HRmWrite, hregARM64_X8());
+         return;
       case ARM64in_MFence:
          return;
       case ARM64in_ClrEX:
@@ -2326,6 +2345,8 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 )
          return;
       case ARM64in_StrEX:
          return;
+      case ARM64in_CAS:
+         return;
       case ARM64in_MFence:
          return;
       case ARM64in_ClrEX:
@@ -3803,6 +3824,61 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
          }
          goto bad;
       }
+      case ARM64in_CAS: {
+         /* This isn't simple.  For an explanation see the comment in
+            host_arm64_defs.h on the definition of ARM64Instr case
+            CAS. */
+         /* Generate:
+              -- one of:
+              mov     x8, x5                 // AA0503E8
+              and     x8, x5, #0xFFFFFFFF    // 92407CA8
+              and     x8, x5, #0xFFFF        // 92403CA8
+              and     x8, x5, #0xFF          // 92401CA8
+
+              -- one of:
+              ldxr    x1, [x3]               // C85F7C61
+              ldxr    w1, [x3]               // 885F7C61
+              ldxrh   w1, [x3]               // 485F7C61 
+              ldxrb   w1, [x3]               // 085F7C61
+
+              -- always:
+              cmp     x1, x8                 // EB08003F
+              bne     out                    // 54000061
+
+              -- one of:
+              stxr    w1, x7, [x3]           // C8017C67
+              stxr    w1, w7, [x3]           // 88017C67
+              stxrh   w1, w7, [x3]           // 48017C67
+              stxrb   w1, w7, [x3]           // 08017C67
+
+              -- always:
+              eor     x1, x5, x1             // CA0100A1
+            out:
+         */
+         switch (i->ARM64in.CAS.szB) {
+            case 8:  *p++ = 0xAA0503E8; break;
+            case 4:  *p++ = 0x92407CA8; break;
+            case 2:  *p++ = 0x92403CA8; break;
+            case 1:  *p++ = 0x92401CA8; break;
+            default: vassert(0);
+         }
+         switch (i->ARM64in.CAS.szB) {
+            case 8:  *p++ = 0xC85F7C61; break;
+            case 4:  *p++ = 0x885F7C61; break;
+            case 2:  *p++ = 0x485F7C61; break;
+            case 1:  *p++ = 0x085F7C61; break;
+         }
+         *p++ = 0xEB08003F;
+         *p++ = 0x54000061;
+         switch (i->ARM64in.CAS.szB) {
+            case 8:  *p++ = 0xC8017C67; break;
+            case 4:  *p++ = 0x88017C67; break;
+            case 2:  *p++ = 0x48017C67; break;
+            case 1:  *p++ = 0x08017C67; break;
+         }
+         *p++ = 0xCA0100A1;
+         goto done;
+      }
       case ARM64in_MFence: {
          *p++ = 0xD5033F9F; /* DSB sy */
          *p++ = 0xD5033FBF; /* DMB sy */
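
The hardwired branch word above can be sanity-checked against the ARMv8
B.cond encoding, 0x54000000 | (imm19 << 5) | cond, where imm19 is the
branch offset in instructions and NE is condition code 1.  The "bne out"
must skip the stxr and eor that follow it, that is, jump three
instructions forward:

    #include <assert.h>
    #include <stdint.h>

    int main ( void ) {
       /* imm19 = 3 instructions forward, cond = 1 (NE) */
       uint32_t bne_out = 0x54000000u | (3u << 5) | 1u;
       assert(bne_out == 0x54000061u);  /* matches the emitted word */
       return 0;
    }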
diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h
index 62b25fd..92d247e 100644
--- a/VEX/priv/host_arm64_defs.h
+++ b/VEX/priv/host_arm64_defs.h
@@ -481,6 +481,7 @@ typedef
       ARM64in_Mul,
       ARM64in_LdrEX,
       ARM64in_StrEX,
+      ARM64in_CAS,
       ARM64in_MFence,
       ARM64in_ClrEX,
       /* ARM64in_V*: scalar ops involving vector registers */
@@ -668,6 +669,32 @@ typedef
          struct {
             Int  szB; /* 1, 2, 4 or 8 */
          } StrEX;
+         /* x1 = CAS(x3(addr), x5(expected) -> x7(new)),
+            where x1[8*szB-1 : 0] == x5[8*szB-1 : 0] indicates success,
+                  x1[8*szB-1 : 0] != x5[8*szB-1 : 0] indicates failure.
+            Uses x8 as scratch (but that's not allocatable).
+            Hence: RD x3, x5, x7; WR x1
+
+            (szB=8)  mov  x8, x5
+            (szB=4)  and  x8, x5, #0xFFFFFFFF
+            (szB=2)  and  x8, x5, #0xFFFF
+            (szB=1)  and  x8, x5, #0xFF
+            -- x8 is correctly zero-extended expected value
+            ldxr    x1, [x3]
+            -- x1 is correctly zero-extended actual value
+            cmp     x1, x8
+            bne     after
+            -- if branch taken, failure; x1[8*szB-1 : 0] holds old value
+            -- attempt to store
+            stxr    w1, x7, [x3]
+            -- if store successful, x1==0, so the eor is "x1 := x5"
+            -- if store failed,     x1==1, so the eor makes x1 != x5
+            eor     x1, x5, x1
+           after:
+         */
+         struct {
+            Int szB; /* 1, 2, 4 or 8 */
+         } CAS;
          /* Mem fence.  An insn which fences all loads and stores as
             much as possible before continuing.  On ARM64 we emit the
             sequence "dsb sy ; dmb sy ; isb sy", which is probably
@@ -912,6 +939,7 @@ extern ARM64Instr* ARM64Instr_Mul     ( HReg dst, HReg argL, HReg argR,
                                         ARM64MulOp op );
 extern ARM64Instr* ARM64Instr_LdrEX   ( Int szB );
 extern ARM64Instr* ARM64Instr_StrEX   ( Int szB );
+extern ARM64Instr* ARM64Instr_CAS     ( Int szB );
 extern ARM64Instr* ARM64Instr_MFence  ( void );
 extern ARM64Instr* ARM64Instr_ClrEX   ( void );
 extern ARM64Instr* ARM64Instr_VLdStH  ( Bool isLoad, HReg sD, HReg rN,
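
The success convention described above amounts to comparing the low szB
bytes of x1 with the low szB bytes of x5.  A hypothetical helper, shown
only to make the convention concrete:

    #include <stdint.h>

    /* 1 if the CAS described by ARM64in_CAS succeeded, else 0. */
    static int cas_succeeded ( uint64_t x1, uint64_t x5, int szB ) {
       uint64_t mask = szB == 8 ? ~0ULL : (1ULL << (8*szB)) - 1;
       return (x1 & mask) == (x5 & mask);
    }

The Iop_CasCmpEQ{8,16,32,64} cases added to iselCondCode below perform
the same comparison: CAS success is equality of the zero-extended old
and expected values.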
diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c
index 42748e4..07ce87a 100644
--- a/VEX/priv/host_arm64_isel.c
+++ b/VEX/priv/host_arm64_isel.c
@@ -1383,12 +1383,13 @@ static ARM64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
            || e->Iex.Binop.op == Iop_CmpLT64S
            || e->Iex.Binop.op == Iop_CmpLT64U
            || e->Iex.Binop.op == Iop_CmpLE64S
-           || e->Iex.Binop.op == Iop_CmpLE64U)) {
+           || e->Iex.Binop.op == Iop_CmpLE64U
+           || e->Iex.Binop.op == Iop_CasCmpEQ64)) {
       HReg      argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
       ARM64RIA* argR = iselIntExpr_RIA(env, e->Iex.Binop.arg2);
       addInstr(env, ARM64Instr_Cmp(argL, argR, True/*is64*/));
       switch (e->Iex.Binop.op) {
-         case Iop_CmpEQ64:  return ARM64cc_EQ;
+         case Iop_CmpEQ64: case Iop_CasCmpEQ64: return ARM64cc_EQ;
          case Iop_CmpNE64:  return ARM64cc_NE;
          case Iop_CmpLT64S: return ARM64cc_LT;
          case Iop_CmpLT64U: return ARM64cc_CC;
@@ -1405,12 +1406,13 @@ static ARM64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
            || e->Iex.Binop.op == Iop_CmpLT32S
            || e->Iex.Binop.op == Iop_CmpLT32U
            || e->Iex.Binop.op == Iop_CmpLE32S
-           || e->Iex.Binop.op == Iop_CmpLE32U)) {
+           || e->Iex.Binop.op == Iop_CmpLE32U
+           || e->Iex.Binop.op == Iop_CasCmpEQ32)) {
       HReg      argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
       ARM64RIA* argR = iselIntExpr_RIA(env, e->Iex.Binop.arg2);
       addInstr(env, ARM64Instr_Cmp(argL, argR, False/*!is64*/));
       switch (e->Iex.Binop.op) {
-         case Iop_CmpEQ32:  return ARM64cc_EQ;
+         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return ARM64cc_EQ;
          case Iop_CmpNE32:  return ARM64cc_NE;
          case Iop_CmpLT32S: return ARM64cc_LT;
          case Iop_CmpLT32U: return ARM64cc_CC;
@@ -1420,6 +1422,34 @@ static ARM64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
       }
    }
 
+   /* --- Cmp*16*(x,y) --- */
+   if (e->tag == Iex_Binop
+       && (e->Iex.Binop.op == Iop_CasCmpEQ16)) {
+      HReg argL  = iselIntExpr_R(env, e->Iex.Binop.arg1);
+      HReg argR  = iselIntExpr_R(env, e->Iex.Binop.arg2);
+      HReg argL2 = widen_z_16_to_64(env, argL);
+      HReg argR2 = widen_z_16_to_64(env, argR);
+      addInstr(env, ARM64Instr_Cmp(argL2, ARM64RIA_R(argR2), True/*is64*/));
+      switch (e->Iex.Binop.op) {
+         case Iop_CasCmpEQ16: return ARM64cc_EQ;
+         default: vpanic("iselCondCode(arm64): CmpXX16");
+      }
+   }
+
+   /* --- Cmp*8*(x,y) --- */
+   if (e->tag == Iex_Binop
+       && (e->Iex.Binop.op == Iop_CasCmpEQ8)) {
+      HReg argL  = iselIntExpr_R(env, e->Iex.Binop.arg1);
+      HReg argR  = iselIntExpr_R(env, e->Iex.Binop.arg2);
+      HReg argL2 = widen_z_8_to_64(env, argL);
+      HReg argR2 = widen_z_8_to_64(env, argR);
+      addInstr(env, ARM64Instr_Cmp(argL2, ARM64RIA_R(argR2), True/*is64*/));
+      switch (e->Iex.Binop.op) {
+         case Iop_CasCmpEQ8: return ARM64cc_EQ;
+         default: vpanic("iselCondCode(arm64): CmpXX8");
+      }
+   }
+
    ppIRExpr(e);
    vpanic("iselCondCode");
 }
@@ -3833,6 +3863,57 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
       break;
    }
 
+   /* --------- ACAS --------- */
+   case Ist_CAS: {
+      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
+         /* "normal" singleton CAS */
+         UChar  sz;
+         IRCAS* cas = stmt->Ist.CAS.details;
+         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
+         switch (ty) { 
+            case Ity_I64: sz = 8; break;
+            case Ity_I32: sz = 4; break;
+            case Ity_I16: sz = 2; break;
+            case Ity_I8:  sz = 1; break; 
+            default: goto unhandled_cas;
+         }
+         HReg rAddr = iselIntExpr_R(env, cas->addr);
+         HReg rExpd = iselIntExpr_R(env, cas->expdLo);
+         HReg rData = iselIntExpr_R(env, cas->dataLo);
+         vassert(cas->expdHi == NULL);
+         vassert(cas->dataHi == NULL);
+         addInstr(env, ARM64Instr_MovI(hregARM64_X3(), rAddr));
+         addInstr(env, ARM64Instr_MovI(hregARM64_X5(), rExpd));
+         addInstr(env, ARM64Instr_MovI(hregARM64_X7(), rData));
+         addInstr(env, ARM64Instr_CAS(sz));
+         /* Now the lowest szB bytes of x1 are either equal to the
+            lowest szB bytes of x5, indicating success, or they
+            aren't, indicating failure.  The IR semantics actually
+            require us to return the old value at the location,
+            regardless of success or failure, but in the case of
+            failure it's not clear how to do this, since
+            ARM64Instr_CAS can't provide that.  Instead we'll just
+            return the relevant bits of x1, since those are at least
+            guaranteed to differ from the lowest bits of x5 on
+            failure. */
+         HReg rResult = hregARM64_X1();
+         switch (sz) {
+            case 8:  break;
+            case 4:  rResult = widen_z_32_to_64(env, rResult); break;
+            case 2:  rResult = widen_z_16_to_64(env, rResult); break;
+            case 1:  rResult = widen_z_8_to_64(env, rResult); break;
+            default: vassert(0);
+         }
+         // "old" in this case is interpreted somewhat liberally, per
+         // the previous comment.
+         HReg rOld = lookupIRTemp(env, cas->oldLo);
+         addInstr(env, ARM64Instr_MovI(rOld, rResult));
+         return;
+      }
+      unhandled_cas:
+      break;
+   }
+
    /* --------- MEM FENCE --------- */
    case Ist_MBE:
       switch (stmt->Ist.MBE.event) {
diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c
index 8c4845e..26e9880 100644
--- a/VEX/priv/main_main.c
+++ b/VEX/priv/main_main.c
@@ -1556,6 +1556,7 @@ void LibVEX_default_VexAbiInfo ( /*OUT*/VexAbiInfo* vbi )
    vbi->guest_amd64_assume_gs_is_const = False;
    vbi->guest_ppc_zap_RZ_at_blr        = False;
    vbi->guest_ppc_zap_RZ_at_bl         = NULL;
+   vbi->guest__use_fallback_LLSC       = False;
    vbi->host_ppc_calls_use_fndescrs    = False;
 }
 
diff --git a/VEX/pub/libvex.h b/VEX/pub/libvex.h
index 8ac3d9f..cbbb1ad 100644
--- a/VEX/pub/libvex.h
+++ b/VEX/pub/libvex.h
@@ -369,6 +369,11 @@ void LibVEX_default_VexArchInfo ( /*OUT*/VexArchInfo* vai );
       guest is ppc32-linux                ==> const False
       guest is other                      ==> inapplicable
 
+   guest__use_fallback_LLSC
+      guest is mips32                     ==> applicable, default True
+      guest is mips64                     ==> applicable, default True
+      guest is arm64                      ==> applicable, default False
+
    host_ppc_calls_use_fndescrs:
       host is ppc32-linux                 ==> False
       host is ppc64-linux                 ==> True
@@ -401,11 +406,17 @@ typedef
          is assumed equivalent to a fn which always returns False. */
       Bool (*guest_ppc_zap_RZ_at_bl)(Addr);
 
+      /* Potentially for all guests that use LL/SC: use the fallback
+         (synthesised) implementation rather than passing LL/SC on to
+         the host? */
+      Bool guest__use_fallback_LLSC;
+
       /* PPC32/PPC64 HOSTS only: does '&f' give us a pointer to a
          function descriptor on the host, or to the function code
          itself?  True => descriptor, False => code. */
       Bool host_ppc_calls_use_fndescrs;
 
+      /* ??? Description ??? */
       Bool guest_mips_fp_mode64;
    }
    VexAbiInfo;
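
A VEX client requests the fallback scheme through VexAbiInfo; in
Valgrind itself this is driven by --sim-hints=fallback-llsc.  A minimal
sketch, assuming the usual setup sequence:

    #include "libvex.h"

    static void setup_abiinfo ( VexAbiInfo* vbi ) {
       LibVEX_default_VexAbiInfo(vbi);
       /* Default is False on arm64 (True on mips32/64, per above). */
       vbi->guest__use_fallback_LLSC = True;
       /* ... vbi is then passed to LibVEX_Translate along with the
          other translation parameters ... */
    }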
diff --git a/VEX/pub/libvex_guest_arm64.h b/VEX/pub/libvex_guest_arm64.h
index c438c1e..8b62cdd 100644
--- a/VEX/pub/libvex_guest_arm64.h
+++ b/VEX/pub/libvex_guest_arm64.h
@@ -159,9 +159,14 @@ typedef
          note of bits 23 and 22. */
       UInt  guest_FPCR;
 
+      /* Fallback LL/SC support.  See bugs 344524 and 369459. */
+      ULong guest_LLSC_SIZE; // 0==no current transaction, else 1,2,4 or 8.
+      ULong guest_LLSC_ADDR; // Address of transaction.
+      ULong guest_LLSC_DATA; // Original value at _ADDR, zero-extended.
+
       /* Padding to make it have a 16-aligned size */
       /* UInt  pad_end_0; */
-      /* ULong pad_end_1; */
+      ULong pad_end_1;
    }
    VexGuestARM64State;
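
Note that this patch covers only the VEX side.  The "when thread
scheduled" step from the guest_arm64_toIR.c comment, clearing
guest_LLSC_SIZE whenever a thread is scheduled, has to live in
coregrind's run_thread_for_a_while().  A sketch of the required
behaviour only; the exact field path is assumed, since that file is not
part of this patch:

    /* In coregrind/m_scheduler/scheduler.c, run_thread_for_a_while():
       invalidate any pending LL/SC transaction on every schedule-in. */
    #if defined(VGA_arm64)
       VG_(threads)[tid].arch.vex.guest_LLSC_SIZE = 0; /* no transaction */
    #endif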