e94d64
Only arm64. Removed the MIPS part.
e94d64
e94d64
commit 6b72dc54b722af5f6a87ebe258d3da6bcba059b7
e94d64
Author: Julian Seward <jseward@acm.org>
e94d64
Date:   Mon Apr 24 09:23:43 2017 +0000
e94d64
e94d64
    Bug 369459 - valgrind on arm64 violates the ARMv8 spec (ldxr/stxr)
e94d64
    
e94d64
    This implements a fallback LL/SC implementation as described in bug 344524.
e94d64
    
e94d64
    The fallback implementation is not enabled by default, and there is no
e94d64
    auto-detection for when it should be used.  To use it, run with the
e94d64
    flag --sim-hints=fallback-llsc.  This commit also allows the existing
e94d64
    MIPS fallback implementation to be enabled with that flag.
e94d64
    
e94d64
    VEX side changes:
e94d64
    
e94d64
    * priv/main_main.c, pub/libvex.h
e94d64
    
e94d64
      Adds new field guest__use_fallback_LLSC to VexAbiInfo
e94d64
    
e94d64
    * pub/libvex_guest_arm64.h priv/guest_arm64_toIR.c
e94d64
    
e94d64
      add front end support, new guest state fields
e94d64
      guest_LLSC_{SIZE,ADDR,DATA}, also documentation of the scheme
e94d64
    
e94d64
    * priv/guest_mips_toIR.c
e94d64
    
e94d64
      allow manual selection of fallback implementation via
e94d64
      --sim-hints=fallback-llsc
e94d64
    
e94d64
    * priv/host_arm64_defs.c priv/host_arm64_defs.h priv/host_arm64_isel.c
e94d64
    
e94d64
      Add support for generating CAS on arm64, as needed by the front end changes
e94d64
    
e94d64
    
e94d64
    
e94d64
    git-svn-id: svn://svn.valgrind.org/vex/trunk@3352
e94d64
e94d64
diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
e94d64
index 088af55..421db37 100644
e94d64
--- a/VEX/priv/guest_arm64_toIR.c
e94d64
+++ b/VEX/priv/guest_arm64_toIR.c
e94d64
@@ -1147,6 +1147,10 @@ static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
e94d64
 #define OFFB_CMSTART  offsetof(VexGuestARM64State,guest_CMSTART)
e94d64
 #define OFFB_CMLEN    offsetof(VexGuestARM64State,guest_CMLEN)
e94d64
 
e94d64
+#define OFFB_LLSC_SIZE offsetof(VexGuestARM64State,guest_LLSC_SIZE)
e94d64
+#define OFFB_LLSC_ADDR offsetof(VexGuestARM64State,guest_LLSC_ADDR)
e94d64
+#define OFFB_LLSC_DATA offsetof(VexGuestARM64State,guest_LLSC_DATA)
e94d64
+
e94d64
 
e94d64
 /* ---------------- Integer registers ---------------- */
e94d64
 
e94d64
@@ -4702,7 +4706,9 @@ const HChar* nameArr_Q_SZ ( UInt bitQ, UInt size )
e94d64
 
e94d64
 
e94d64
 static
e94d64
-Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn)
e94d64
+Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn,
e94d64
+                          const VexAbiInfo* abiinfo
e94d64
+)
e94d64
 {
e94d64
 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
e94d64
 
e94d64
@@ -6457,6 +6463,32 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn)
e94d64
       sz 001000 000 s     0 11111 n t   STX{R,RH,RB}  Ws, Rt, [Xn|SP]
e94d64
       sz 001000 000 s     1 11111 n t   STLX{R,RH,RB} Ws, Rt, [Xn|SP]
e94d64
    */
e94d64
+   /* For the "standard" implementation we pass through the LL and SC to
e94d64
+      the host.  For the "fallback" implementation, for details see
e94d64
+        https://bugs.kde.org/show_bug.cgi?id=344524 and
e94d64
+        https://bugs.kde.org/show_bug.cgi?id=369459,
e94d64
+      but in short:
e94d64
+
e94d64
+      LoadLinked(addr)
e94d64
+        gs.LLsize = load_size // 1, 2, 4 or 8
e94d64
+        gs.LLaddr = addr
e94d64
+        gs.LLdata = zeroExtend(*addr)
e94d64
+
e94d64
+      StoreCond(addr, data)
e94d64
+        tmp_LLsize = gs.LLsize
e94d64
+        gs.LLsize = 0 // "no transaction"
e94d64
+        if tmp_LLsize != store_size        -> fail
e94d64
+        if addr != gs.LLaddr               -> fail
e94d64
+        if zeroExtend(*addr) != gs.LLdata  -> fail
e94d64
+        cas_ok = CAS(store_size, addr, gs.LLdata -> data)
e94d64
+        if !cas_ok                         -> fail
e94d64
+        succeed
e94d64
+
e94d64
+      When thread scheduled
e94d64
+        gs.LLsize = 0 // "no transaction"
e94d64
+        (coregrind/m_scheduler/scheduler.c, run_thread_for_a_while()
e94d64
+         has to do this bit)
e94d64
+   */   
e94d64
    if (INSN(29,23) == BITS7(0,0,1,0,0,0,0)
e94d64
        && (INSN(23,21) & BITS3(1,0,1)) == BITS3(0,0,0)
e94d64
        && INSN(14,10) == BITS5(1,1,1,1,1)) {
e94d64
@@ -6478,29 +6510,99 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn)
e94d64
 
e94d64
       if (isLD && ss == BITS5(1,1,1,1,1)) {
e94d64
          IRTemp res = newTemp(ty);
e94d64
-         stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
e94d64
-         putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
e94d64
+         if (abiinfo->guest__use_fallback_LLSC) {
e94d64
+            // Do the load first so we don't update any guest state
e94d64
+            // if it faults.
e94d64
+            IRTemp loaded_data64 = newTemp(Ity_I64);
e94d64
+            assign(loaded_data64, widenUto64(ty, loadLE(ty, mkexpr(ea))));
e94d64
+            stmt( IRStmt_Put( OFFB_LLSC_DATA, mkexpr(loaded_data64) ));
e94d64
+            stmt( IRStmt_Put( OFFB_LLSC_ADDR, mkexpr(ea) ));
e94d64
+            stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(szB) ));
e94d64
+            putIReg64orZR(tt, mkexpr(loaded_data64));
e94d64
+         } else {
e94d64
+            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), NULL/*LL*/));
e94d64
+            putIReg64orZR(tt, widenUto64(ty, mkexpr(res)));
e94d64
+         }
e94d64
          if (isAcqOrRel) {
e94d64
             stmt(IRStmt_MBE(Imbe_Fence));
e94d64
          }
e94d64
-         DIP("ld%sx%s %s, [%s]\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
e94d64
-             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
e94d64
+         DIP("ld%sx%s %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
e94d64
+             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
e94d64
+             abiinfo->guest__use_fallback_LLSC
e94d64
+                ? "(fallback implementation)" : "");
e94d64
          return True;
e94d64
       }
e94d64
       if (!isLD) {
e94d64
          if (isAcqOrRel) {
e94d64
             stmt(IRStmt_MBE(Imbe_Fence));
e94d64
          }
e94d64
-         IRTemp  res  = newTemp(Ity_I1);
e94d64
          IRExpr* data = narrowFrom64(ty, getIReg64orZR(tt));
e94d64
-         stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
e94d64
-         /* IR semantics: res is 1 if store succeeds, 0 if it fails.
e94d64
-            Need to set rS to 1 on failure, 0 on success. */
e94d64
-         putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
e94d64
-                                            mkU64(1)));
e94d64
-         DIP("st%sx%s %s, %s, [%s]\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
e94d64
+         if (abiinfo->guest__use_fallback_LLSC) {
e94d64
+            // This is really ugly, since we don't have any way to do
e94d64
+            // proper if-then-else.  First, set up as if the SC failed,
e94d64
+            // and jump forwards if it really has failed.
e94d64
+
e94d64
+            // Continuation address
e94d64
+            IRConst* nia = IRConst_U64(guest_PC_curr_instr + 4);
e94d64
+
e94d64
+            // "the SC failed".  Any non-zero value means failure.
e94d64
+            putIReg64orZR(ss, mkU64(1));
e94d64
+          
e94d64
+            IRTemp tmp_LLsize = newTemp(Ity_I64);
e94d64
+            assign(tmp_LLsize, IRExpr_Get(OFFB_LLSC_SIZE, Ity_I64));
e94d64
+            stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) // "no transaction"
e94d64
+            ));
e94d64
+            // Fail if no or wrong-size transaction
e94d64
+            vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
e94d64
+            stmt( IRStmt_Exit(
e94d64
+                     binop(Iop_CmpNE64, mkexpr(tmp_LLsize), mkU64(szB)),
e94d64
+                     Ijk_Boring, nia, OFFB_PC
e94d64
+            ));
e94d64
+            // Fail if the address doesn't match the LL address
e94d64
+            stmt( IRStmt_Exit(
e94d64
+                      binop(Iop_CmpNE64, mkexpr(ea),
e94d64
+                                         IRExpr_Get(OFFB_LLSC_ADDR, Ity_I64)),
e94d64
+                      Ijk_Boring, nia, OFFB_PC
e94d64
+            ));
e94d64
+            // Fail if the data doesn't match the LL data
e94d64
+            IRTemp llsc_data64 = newTemp(Ity_I64);
e94d64
+            assign(llsc_data64, IRExpr_Get(OFFB_LLSC_DATA, Ity_I64));
e94d64
+            stmt( IRStmt_Exit(
e94d64
+                      binop(Iop_CmpNE64, widenUto64(ty, loadLE(ty, mkexpr(ea))),
e94d64
+                                         mkexpr(llsc_data64)),
e94d64
+                      Ijk_Boring, nia, OFFB_PC
e94d64
+            ));
e94d64
+            // Try to CAS the new value in.
e94d64
+            IRTemp old = newTemp(ty);
e94d64
+            IRTemp expd = newTemp(ty);
e94d64
+            assign(expd, narrowFrom64(ty, mkexpr(llsc_data64)));
e94d64
+            stmt( IRStmt_CAS(mkIRCAS(/*oldHi*/IRTemp_INVALID, old,
e94d64
+                                     Iend_LE, mkexpr(ea),
e94d64
+                                     /*expdHi*/NULL, mkexpr(expd),
e94d64
+                                     /*dataHi*/NULL, data
e94d64
+            )));
e94d64
+            // Fail if the CAS failed (viz, old != expd)
e94d64
+            stmt( IRStmt_Exit(
e94d64
+                      binop(Iop_CmpNE64,
e94d64
+                            widenUto64(ty, mkexpr(old)),
e94d64
+                            widenUto64(ty, mkexpr(expd))),
e94d64
+                      Ijk_Boring, nia, OFFB_PC
e94d64
+            ));
e94d64
+            // Otherwise we succeeded (!)
e94d64
+            putIReg64orZR(ss, mkU64(0));
e94d64
+         } else {
e94d64
+            IRTemp res = newTemp(Ity_I1);
e94d64
+            stmt(IRStmt_LLSC(Iend_LE, res, mkexpr(ea), data));
e94d64
+            /* IR semantics: res is 1 if store succeeds, 0 if it fails.
e94d64
+               Need to set rS to 1 on failure, 0 on success. */
e94d64
+            putIReg64orZR(ss, binop(Iop_Xor64, unop(Iop_1Uto64, mkexpr(res)),
e94d64
+                                               mkU64(1)));
e94d64
+         }
e94d64
+         DIP("st%sx%s %s, %s, [%s] %s\n", isAcqOrRel ? "a" : "", suffix[szBlg2],
e94d64
              nameIRegOrZR(False, ss),
e94d64
-             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn));
e94d64
+             nameIRegOrZR(szB == 8, tt), nameIReg64orSP(nn),
e94d64
+             abiinfo->guest__use_fallback_LLSC
e94d64
+                ? "(fallback implementation)" : "");
e94d64
          return True;
e94d64
       }
e94d64
       /* else fall through */
e94d64
@@ -6589,7 +6691,8 @@ Bool dis_ARM64_load_store(/*MB_OUT*/DisResult* dres, UInt insn)
e94d64
 
e94d64
 static
e94d64
 Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
e94d64
-                          const VexArchInfo* archinfo)
e94d64
+                          const VexArchInfo* archinfo,
e94d64
+                          const VexAbiInfo* abiinfo)
e94d64
 {
e94d64
 #  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))
e94d64
 
e94d64
@@ -7048,7 +7151,11 @@ Bool dis_ARM64_branch_etc(/*MB_OUT*/DisResult* dres, UInt insn,
e94d64
       /* AFAICS, this simply cancels a (all?) reservations made by a
e94d64
          (any?) preceding LDREX(es).  Arrange to hand it through to
e94d64
          the back end. */
e94d64
-      stmt( IRStmt_MBE(Imbe_CancelReservation) );
e94d64
+      if (abiinfo->guest__use_fallback_LLSC) {
e94d64
+         stmt( IRStmt_Put( OFFB_LLSC_SIZE, mkU64(0) )); // "no transaction"
e94d64
+      } else {
e94d64
+         stmt( IRStmt_MBE(Imbe_CancelReservation) );
e94d64
+      }
e94d64
       DIP("clrex #%u\n", mm);
e94d64
       return True;
e94d64
    }
e94d64
@@ -14411,12 +14518,12 @@ Bool disInstr_ARM64_WRK (
e94d64
          break;
e94d64
       case BITS4(1,0,1,0): case BITS4(1,0,1,1):
e94d64
          // Branch, exception generation and system instructions
e94d64
-         ok = dis_ARM64_branch_etc(dres, insn, archinfo);
e94d64
+         ok = dis_ARM64_branch_etc(dres, insn, archinfo, abiinfo);
e94d64
          break;
e94d64
       case BITS4(0,1,0,0): case BITS4(0,1,1,0):
e94d64
       case BITS4(1,1,0,0): case BITS4(1,1,1,0):
e94d64
          // Loads and stores
e94d64
-         ok = dis_ARM64_load_store(dres, insn);
e94d64
+         ok = dis_ARM64_load_store(dres, insn, abiinfo);
e94d64
          break;
e94d64
       case BITS4(0,1,0,1): case BITS4(1,1,0,1):
e94d64
          // Data processing - register
e94d64
diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c
e94d64
index cc7c832..c9affbd 100644
e94d64
--- a/VEX/priv/host_arm64_defs.c
e94d64
+++ b/VEX/priv/host_arm64_defs.c
e94d64
@@ -1005,6 +1005,13 @@ ARM64Instr* ARM64Instr_StrEX ( Int szB ) {
e94d64
    vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
e94d64
    return i;
e94d64
 }
e94d64
+ARM64Instr* ARM64Instr_CAS ( Int szB ) {
e94d64
+   ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
e94d64
+   i->tag             = ARM64in_CAS;
e94d64
+   i->ARM64in.CAS.szB = szB;
e94d64
+   vassert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
e94d64
+   return i;
e94d64
+}
e94d64
 ARM64Instr* ARM64Instr_MFence ( void ) {
e94d64
    ARM64Instr* i = LibVEX_Alloc_inline(sizeof(ARM64Instr));
e94d64
    i->tag        = ARM64in_MFence;
e94d64
@@ -1569,6 +1576,10 @@ void ppARM64Instr ( const ARM64Instr* i ) {
e94d64
                     sz, i->ARM64in.StrEX.szB == 8 ? 'x' : 'w');
e94d64
          return;
e94d64
       }
e94d64
+      case ARM64in_CAS: {
e94d64
+         vex_printf("x1 = cas(%dbit)(x3, x5 -> x7)", 8 * i->ARM64in.CAS.szB);
e94d64
+         return;
e94d64
+      }
e94d64
       case ARM64in_MFence:
e94d64
          vex_printf("(mfence) dsb sy; dmb sy; isb");
e94d64
          return;
e94d64
@@ -2064,6 +2075,14 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 )
e94d64
          addHRegUse(u, HRmWrite, hregARM64_X0());
e94d64
          addHRegUse(u, HRmRead, hregARM64_X2());
e94d64
          return;
e94d64
+      case ARM64in_CAS:
e94d64
+         addHRegUse(u, HRmRead, hregARM64_X3());
e94d64
+         addHRegUse(u, HRmRead, hregARM64_X5());
e94d64
+         addHRegUse(u, HRmRead, hregARM64_X7());
e94d64
+         addHRegUse(u, HRmWrite, hregARM64_X1());
e94d64
+         /* Pointless to state this since X8 is not available to RA. */
e94d64
+         addHRegUse(u, HRmWrite, hregARM64_X8());
e94d64
+         break;
e94d64
       case ARM64in_MFence:
e94d64
          return;
e94d64
       case ARM64in_ClrEX:
e94d64
@@ -2326,6 +2345,8 @@ void mapRegs_ARM64Instr ( HRegRemap* m, ARM64Instr* i, Bool mode64 )
e94d64
          return;
e94d64
       case ARM64in_StrEX:
e94d64
          return;
e94d64
+      case ARM64in_CAS:
e94d64
+         return;
e94d64
       case ARM64in_MFence:
e94d64
          return;
e94d64
       case ARM64in_ClrEX:
e94d64
@@ -3803,6 +3824,61 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
e94d64
          }
e94d64
          goto bad;
e94d64
       }
e94d64
+      case ARM64in_CAS: {
e94d64
+         /* This isn't simple.  For an explanation see the comment in
e94d64
+            host_arm64_defs.h on the definition of ARM64Instr case
e94d64
+            CAS. */
e94d64
+         /* Generate:
e94d64
+              -- one of:
e94d64
+              mov     x8, x5                 // AA0503E8
e94d64
+              and     x8, x5, #0xFFFFFFFF    // 92407CA8
e94d64
+              and     x8, x5, #0xFFFF        // 92403CA8
e94d64
+              and     x8, x5, #0xFF          // 92401CA8
e94d64
+
e94d64
+              -- one of:
e94d64
+              ldxr    x1, [x3]               // C85F7C61
e94d64
+              ldxr    w1, [x3]               // 885F7C61
e94d64
+              ldxrh   w1, [x3]               // 485F7C61 
e94d64
+              ldxrb   w1, [x3]               // 085F7C61
e94d64
+
e94d64
+              -- always:
e94d64
+              cmp     x1, x8                 // EB08003F
e94d64
+              bne     out                    // 54000061
e94d64
+
e94d64
+              -- one of:
e94d64
+              stxr    w1, x7, [x3]           // C8017C67
e94d64
+              stxr    w1, w7, [x3]           // 88017C67
e94d64
+              stxrh   w1, w7, [x3]           // 48017C67
e94d64
+              stxrb   w1, w7, [x3]           // 08017C67
e94d64
+
e94d64
+              -- always:
e94d64
+              eor     x1, x5, x1             // CA0100A1
e94d64
+            out:
e94d64
+         */
e94d64
+         switch (i->ARM64in.CAS.szB) {
e94d64
+            case 8:  *p++ = 0xAA0503E8; break;
e94d64
+            case 4:  *p++ = 0x92407CA8; break;
e94d64
+            case 2:  *p++ = 0x92403CA8; break;
e94d64
+            case 1:  *p++ = 0x92401CA8; break;
e94d64
+            default: vassert(0);
e94d64
+         }
e94d64
+         switch (i->ARM64in.CAS.szB) {
e94d64
+            case 8:  *p++ = 0xC85F7C61; break;
e94d64
+            case 4:  *p++ = 0x885F7C61; break;
e94d64
+            case 2:  *p++ = 0x485F7C61; break;
e94d64
+            case 1:  *p++ = 0x085F7C61; break;
e94d64
+         }
e94d64
+         *p++ = 0xEB08003F;
e94d64
+         *p++ = 0x54000061;
e94d64
+         switch (i->ARM64in.CAS.szB) {
e94d64
+            case 8:  *p++ = 0xC8017C67; break;
e94d64
+            case 4:  *p++ = 0x88017C67; break;
e94d64
+            case 2:  *p++ = 0x48017C67; break;
e94d64
+            case 1:  *p++ = 0x08017C67; break;
e94d64
+         }
e94d64
+         *p++ = 0xCA0100A1;
e94d64
+         goto done;
e94d64
+      }
e94d64
       case ARM64in_MFence: {
e94d64
          *p++ = 0xD5033F9F; /* DSB sy */
e94d64
          *p++ = 0xD5033FBF; /* DMB sy */
e94d64
diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h
e94d64
index 62b25fd..92d247e 100644
e94d64
--- a/VEX/priv/host_arm64_defs.h
e94d64
+++ b/VEX/priv/host_arm64_defs.h
e94d64
@@ -481,6 +481,7 @@ typedef
e94d64
       ARM64in_Mul,
e94d64
       ARM64in_LdrEX,
e94d64
       ARM64in_StrEX,
e94d64
+      ARM64in_CAS,
e94d64
       ARM64in_MFence,
e94d64
       ARM64in_ClrEX,
e94d64
       /* ARM64in_V*: scalar ops involving vector registers */
e94d64
@@ -668,6 +669,32 @@ typedef
e94d64
          struct {
e94d64
             Int  szB; /* 1, 2, 4 or 8 */
e94d64
          } StrEX;
e94d64
+         /* x1 = CAS(x3(addr), x5(expected) -> x7(new)),
e94d64
+            where x1[8*szB-1 : 0] == x5[8*szB-1 : 0] indicates success,
e94d64
+                  x1[8*szB-1 : 0] != x5[8*szB-1 : 0] indicates failure.
e94d64
+            Uses x8 as scratch (but that's not allocatable).
e94d64
+            Hence: RD x3, x5, x7; WR x1
e94d64
+
e94d64
+            (szB=8)  mov  x8, x5
e94d64
+            (szB=4)  and  x8, x5, #0xFFFFFFFF
e94d64
+            (szB=2)  and  x8, x5, #0xFFFF
e94d64
+            (szB=1)  and  x8, x5, #0xFF
e94d64
+            -- x8 is correctly zero-extended expected value
e94d64
+            ldxr    x1, [x3]
e94d64
+            -- x1 is correctly zero-extended actual value
e94d64
+            cmp     x1, x8
e94d64
+            bne     after
e94d64
+            -- if branch taken, failure; x1[8*szB-1 : 0] holds old value
e94d64
+            -- attempt to store
e94d64
+            stxr    w1, x7, [x3]
e94d64
+            -- if store successful, x1==0, so the eor is "x1 := x5"
e94d64
+            -- if store failed,     x1==1, so the eor makes x1 != x5
e94d64
+            eor     x1, x5, x1
e94d64
+           after:
e94d64
+         */
e94d64
+         struct {
e94d64
+            Int szB; /* 1, 2, 4 or 8 */
e94d64
+         } CAS;
e94d64
          /* Mem fence.  An insn which fences all loads and stores as
e94d64
             much as possible before continuing.  On ARM64 we emit the
e94d64
             sequence "dsb sy ; dmb sy ; isb sy", which is probably
e94d64
@@ -912,6 +939,7 @@ extern ARM64Instr* ARM64Instr_Mul     ( HReg dst, HReg argL, HReg argR,
e94d64
                                         ARM64MulOp op );
e94d64
 extern ARM64Instr* ARM64Instr_LdrEX   ( Int szB );
e94d64
 extern ARM64Instr* ARM64Instr_StrEX   ( Int szB );
e94d64
+extern ARM64Instr* ARM64Instr_CAS     ( Int szB );
e94d64
 extern ARM64Instr* ARM64Instr_MFence  ( void );
e94d64
 extern ARM64Instr* ARM64Instr_ClrEX   ( void );
e94d64
 extern ARM64Instr* ARM64Instr_VLdStH  ( Bool isLoad, HReg sD, HReg rN,
e94d64
diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c
e94d64
index 42748e4..07ce87a 100644
e94d64
--- a/VEX/priv/host_arm64_isel.c
e94d64
+++ b/VEX/priv/host_arm64_isel.c
e94d64
@@ -1383,12 +1383,13 @@ static ARM64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
e94d64
            || e->Iex.Binop.op == Iop_CmpLT64S
e94d64
            || e->Iex.Binop.op == Iop_CmpLT64U
e94d64
            || e->Iex.Binop.op == Iop_CmpLE64S
e94d64
-           || e->Iex.Binop.op == Iop_CmpLE64U)) {
e94d64
+           || e->Iex.Binop.op == Iop_CmpLE64U
e94d64
+           || e->Iex.Binop.op == Iop_CasCmpEQ64)) {
e94d64
       HReg      argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
e94d64
       ARM64RIA* argR = iselIntExpr_RIA(env, e->Iex.Binop.arg2);
e94d64
       addInstr(env, ARM64Instr_Cmp(argL, argR, True/*is64*/));
e94d64
       switch (e->Iex.Binop.op) {
e94d64
-         case Iop_CmpEQ64:  return ARM64cc_EQ;
e94d64
+         case Iop_CmpEQ64: case Iop_CasCmpEQ64: return ARM64cc_EQ;
e94d64
          case Iop_CmpNE64:  return ARM64cc_NE;
e94d64
          case Iop_CmpLT64S: return ARM64cc_LT;
e94d64
          case Iop_CmpLT64U: return ARM64cc_CC;
e94d64
@@ -1405,12 +1406,13 @@ static ARM64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
e94d64
            || e->Iex.Binop.op == Iop_CmpLT32S
e94d64
            || e->Iex.Binop.op == Iop_CmpLT32U
e94d64
            || e->Iex.Binop.op == Iop_CmpLE32S
e94d64
-           || e->Iex.Binop.op == Iop_CmpLE32U)) {
e94d64
+           || e->Iex.Binop.op == Iop_CmpLE32U
e94d64
+           || e->Iex.Binop.op == Iop_CasCmpEQ32)) {
e94d64
       HReg      argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
e94d64
       ARM64RIA* argR = iselIntExpr_RIA(env, e->Iex.Binop.arg2);
e94d64
       addInstr(env, ARM64Instr_Cmp(argL, argR, False/*!is64*/));
e94d64
       switch (e->Iex.Binop.op) {
e94d64
-         case Iop_CmpEQ32:  return ARM64cc_EQ;
e94d64
+         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return ARM64cc_EQ;
e94d64
          case Iop_CmpNE32:  return ARM64cc_NE;
e94d64
          case Iop_CmpLT32S: return ARM64cc_LT;
e94d64
          case Iop_CmpLT32U: return ARM64cc_CC;
e94d64
@@ -1420,6 +1422,34 @@ static ARM64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
e94d64
       }
e94d64
    }
e94d64
 
e94d64
+   /* --- Cmp*16*(x,y) --- */
e94d64
+   if (e->tag == Iex_Binop
e94d64
+       && (e->Iex.Binop.op == Iop_CasCmpEQ16)) {
e94d64
+      HReg argL  = iselIntExpr_R(env, e->Iex.Binop.arg1);
e94d64
+      HReg argR  = iselIntExpr_R(env, e->Iex.Binop.arg2);
e94d64
+      HReg argL2 = widen_z_16_to_64(env, argL);
e94d64
+      HReg argR2 = widen_z_16_to_64(env, argR);
e94d64
+      addInstr(env, ARM64Instr_Cmp(argL2, ARM64RIA_R(argR2), True/*is64*/));
e94d64
+      switch (e->Iex.Binop.op) {
e94d64
+         case Iop_CasCmpEQ16: return ARM64cc_EQ;
e94d64
+         default: vpanic("iselCondCode(arm64): CmpXX16");
e94d64
+      }
e94d64
+   }
e94d64
+
e94d64
+   /* --- Cmp*8*(x,y) --- */
e94d64
+   if (e->tag == Iex_Binop
e94d64
+       && (e->Iex.Binop.op == Iop_CasCmpEQ8)) {
e94d64
+      HReg argL  = iselIntExpr_R(env, e->Iex.Binop.arg1);
e94d64
+      HReg argR  = iselIntExpr_R(env, e->Iex.Binop.arg2);
e94d64
+      HReg argL2 = widen_z_8_to_64(env, argL);
e94d64
+      HReg argR2 = widen_z_8_to_64(env, argR);
e94d64
+      addInstr(env, ARM64Instr_Cmp(argL2, ARM64RIA_R(argR2), True/*is64*/));
e94d64
+      switch (e->Iex.Binop.op) {
e94d64
+         case Iop_CasCmpEQ8: return ARM64cc_EQ;
e94d64
+         default: vpanic("iselCondCode(arm64): CmpXX8");
e94d64
+      }
e94d64
+   }
e94d64
+
e94d64
    ppIRExpr(e);
e94d64
    vpanic("iselCondCode");
e94d64
 }
e94d64
@@ -3833,6 +3863,57 @@ static void iselStmt ( ISelEnv* env, IRStmt* stmt )
e94d64
       break;
e94d64
    }
e94d64
 
e94d64
+   /* --------- ACAS --------- */
e94d64
+   case Ist_CAS: {
e94d64
+      if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
e94d64
+         /* "normal" singleton CAS */
e94d64
+         UChar  sz;
e94d64
+         IRCAS* cas = stmt->Ist.CAS.details;
e94d64
+         IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
e94d64
+         switch (ty) { 
e94d64
+            case Ity_I64: sz = 8; break;
e94d64
+            case Ity_I32: sz = 4; break;
e94d64
+            case Ity_I16: sz = 2; break;
e94d64
+            case Ity_I8:  sz = 1; break; 
e94d64
+            default: goto unhandled_cas;
e94d64
+         }
e94d64
+         HReg rAddr = iselIntExpr_R(env, cas->addr);
e94d64
+         HReg rExpd = iselIntExpr_R(env, cas->expdLo);
e94d64
+         HReg rData = iselIntExpr_R(env, cas->dataLo);
e94d64
+         vassert(cas->expdHi == NULL);
e94d64
+         vassert(cas->dataHi == NULL);
e94d64
+         addInstr(env, ARM64Instr_MovI(hregARM64_X3(), rAddr));
e94d64
+         addInstr(env, ARM64Instr_MovI(hregARM64_X5(), rExpd));
e94d64
+         addInstr(env, ARM64Instr_MovI(hregARM64_X7(), rData));
e94d64
+         addInstr(env, ARM64Instr_CAS(sz));
e94d64
+         /* Now we have the lowest szB bytes of x1 are either equal to
e94d64
+            the lowest szB bytes of x5, indicating success, or they
e94d64
+            aren't, indicating failure.  The IR semantics actually
e94d64
+            require us to return the old value at the location,
e94d64
+            regardless of success or failure, but in the case of
e94d64
+            failure it's not clear how to do this, since
e94d64
+            ARM64Instr_CAS can't provide that.  Instead we'll just
e94d64
+            return the relevant bit of x1, since that's at least
e94d64
+            guaranteed to be different from the lowest bits of x5 on
e94d64
+            failure. */
e94d64
+         HReg rResult = hregARM64_X1();
e94d64
+         switch (sz) {
e94d64
+            case 8:  break;
e94d64
+            case 4:  rResult = widen_z_32_to_64(env, rResult); break;
e94d64
+            case 2:  rResult = widen_z_16_to_64(env, rResult); break;
e94d64
+            case 1:  rResult = widen_z_8_to_64(env, rResult); break;
e94d64
+            default: vassert(0);
e94d64
+         }
e94d64
+         // "old" in this case is interpreted somewhat liberally, per
e94d64
+         // the previous comment.
e94d64
+         HReg rOld = lookupIRTemp(env, cas->oldLo);
e94d64
+         addInstr(env, ARM64Instr_MovI(rOld, rResult));
e94d64
+         return;
e94d64
+      }
e94d64
+      unhandled_cas:
e94d64
+      break;
e94d64
+   }
e94d64
+
e94d64
    /* --------- MEM FENCE --------- */
e94d64
    case Ist_MBE:
e94d64
       switch (stmt->Ist.MBE.event) {
e94d64
diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c
e94d64
index 8c4845e..26e9880 100644
e94d64
--- a/VEX/priv/main_main.c
e94d64
+++ b/VEX/priv/main_main.c
e94d64
@@ -1556,6 +1556,7 @@ void LibVEX_default_VexAbiInfo ( /*OUT*/VexAbiInfo* vbi )
e94d64
    vbi->guest_amd64_assume_gs_is_const = False;
e94d64
    vbi->guest_ppc_zap_RZ_at_blr        = False;
e94d64
    vbi->guest_ppc_zap_RZ_at_bl         = NULL;
e94d64
+   vbi->guest__use_fallback_LLSC       = False;
e94d64
    vbi->host_ppc_calls_use_fndescrs    = False;
e94d64
 }
e94d64
 
e94d64
diff --git a/VEX/pub/libvex.h b/VEX/pub/libvex.h
e94d64
index 8ac3d9f..cbbb1ad 100644
e94d64
--- a/VEX/pub/libvex.h
e94d64
+++ b/VEX/pub/libvex.h
e94d64
@@ -369,6 +369,11 @@ void LibVEX_default_VexArchInfo ( /*OUT*/VexArchInfo* vai );
e94d64
       guest is ppc32-linux                ==> const False
e94d64
       guest is other                      ==> inapplicable
e94d64
 
e94d64
+   guest__use_fallback_LLSC
e94d64
+      guest is mips32                     ==> applicable, default True
e94d64
+      guest is mips64                     ==> applicable, default True
e94d64
+      guest is arm64                      ==> applicable, default False
e94d64
+
e94d64
    host_ppc_calls_use_fndescrs:
e94d64
       host is ppc32-linux                 ==> False
e94d64
       host is ppc64-linux                 ==> True
e94d64
@@ -401,11 +406,17 @@ typedef
e94d64
          is assumed equivalent to a fn which always returns False. */
e94d64
       Bool (*guest_ppc_zap_RZ_at_bl)(Addr);
e94d64
 
e94d64
+      /* Potentially for all guests that use LL/SC: use the fallback
e94d64
+         (synthesised) implementation rather than passing LL/SC on to
e94d64
+         the host? */
e94d64
+      Bool guest__use_fallback_LLSC;
e94d64
+
e94d64
       /* PPC32/PPC64 HOSTS only: does '&f' give us a pointer to a
e94d64
          function descriptor on the host, or to the function code
e94d64
          itself?  True => descriptor, False => code. */
e94d64
       Bool host_ppc_calls_use_fndescrs;
e94d64
 
e94d64
+      /* MIPS GUESTS only: presumably, does the guest use the 64-bit
+         FP register model (FR=1)?  NOTE(review): placeholder comment in
+         the original -- confirm exact semantics against the MIPS front
+         end before relying on this description. */
e94d64
       Bool guest_mips_fp_mode64;
e94d64
    }
e94d64
    VexAbiInfo;
e94d64
diff --git a/VEX/pub/libvex_guest_arm64.h b/VEX/pub/libvex_guest_arm64.h
e94d64
index c438c1e..8b62cdd 100644
e94d64
--- a/VEX/pub/libvex_guest_arm64.h
e94d64
+++ b/VEX/pub/libvex_guest_arm64.h
e94d64
@@ -159,9 +159,14 @@ typedef
e94d64
          note of bits 23 and 22. */
e94d64
       UInt  guest_FPCR;
e94d64
 
e94d64
+      /* Fallback LL/SC support.  See bugs 344524 and 369459. */
e94d64
+      ULong guest_LLSC_SIZE; // 0==no current transaction, else 1,2,4 or 8.
e94d64
+      ULong guest_LLSC_ADDR; // Address of transaction.
e94d64
+      ULong guest_LLSC_DATA; // Original value at _ADDR, zero-extended.
e94d64
+
e94d64
       /* Padding to make it have an 16-aligned size */
e94d64
       /* UInt  pad_end_0; */
e94d64
-      /* ULong pad_end_1; */
e94d64
+      ULong pad_end_1;
e94d64
    }
e94d64
    VexGuestARM64State;
e94d64