Blame SOURCES/valgrind-3.18.1-arm64-doubleword-cas.patch

4db4a6
commit 7dbe2fed72886874f2eaf57dc07929542ae55b58
4db4a6
Author: Julian Seward <jseward@acm.org>
4db4a6
Date:   Fri Nov 12 10:40:48 2021 +0100
4db4a6
4db4a6
    Bug 445354 - arm64 backend: incorrect code emitted for doubleword CAS.
4db4a6
    
4db4a6
    The sequence of instructions emitted by the arm64 backend for doubleword
4db4a6
    compare-and-swap is incorrect.  This could lead to incorrect simulation of the
4db4a6
    ARMv8.1 atomic instructions (CASP, at least).  It also causes failures in the
4db4a6
    upcoming fix for v8.0 support for LD{,A}XP/ST{,L}XP in bug 444399, at least
4db4a6
    when running with the fallback LL/SC implementation
4db4a6
    (`--sim-hints=fallback-llsc`, or as autoselected at startup).  In the worst
4db4a6
    case it can cause segfaulting in the generated code, because it could jump
4db4a6
    backwards unexpectedly far.
4db4a6
    
4db4a6
    The problem is the sequence emitted for ARM64in_CASP:
4db4a6
    
4db4a6
    * the jump offsets are incorrect, both for `bne out` (x 2) and `cbnz w1, loop`.
4db4a6
    
4db4a6
    * using w1 to hold the success indication of the stxp instruction trashes the
4db4a6
      previous value in x1.  But the value in x1 is an output of ARM64in_CASP,
4db4a6
      hence one of the two output registers is corrupted.  That confuses any code
4db4a6
      downstream that wants to inspect those values to find out whether or not the
4db4a6
      transaction succeeded.
4db4a6
    
4db4a6
    The fixes are to
4db4a6
    
4db4a6
    * fix the branch offsets
4db4a6
    
4db4a6
    * use a different register to hold the stxp success indication.  w3 is a
4db4a6
      convenient choice.
4db4a6
4db4a6
diff --git a/VEX/priv/host_arm64_defs.c b/VEX/priv/host_arm64_defs.c
4db4a6
index 5dccc0495..5657bcab9 100644
4db4a6
--- a/VEX/priv/host_arm64_defs.c
4db4a6
+++ b/VEX/priv/host_arm64_defs.c
4db4a6
@@ -2271,6 +2271,7 @@ void getRegUsage_ARM64Instr ( HRegUsage* u, const ARM64Instr* i, Bool mode64 )
4db4a6
          addHRegUse(u, HRmWrite, hregARM64_X1());
4db4a6
          addHRegUse(u, HRmWrite, hregARM64_X9());
4db4a6
          addHRegUse(u, HRmWrite, hregARM64_X8());
4db4a6
+         addHRegUse(u, HRmWrite, hregARM64_X3());
4db4a6
          break;
4db4a6
       case ARM64in_MFence:
4db4a6
          return;
4db4a6
@@ -4254,16 +4255,16 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
4db4a6
 
4db4a6
               -- always:
4db4a6
               cmp     x0, x8                 // EB08001F
4db4a6
-              bne     out                    // 540000E1 (b.ne #28 <out>)
4db4a6
+              bne     out                    // 540000A1
4db4a6
               cmp     x1, x9                 // EB09003F
4db4a6
-              bne     out                    // 540000A1 (b.ne #20 <out>)
4db4a6
+              bne     out                    // 54000061
4db4a6
 
4db4a6
               -- one of:
4db4a6
-              stxp    w1, x6, x7, [x2]       // C8211C46
4db4a6
-              stxp    w1, w6, w7, [x2]       // 88211C46
4db4a6
+              stxp    w3, x6, x7, [x2]       // C8231C46
4db4a6
+              stxp    w3, w6, w7, [x2]       // 88231C46
4db4a6
 
4db4a6
               -- always:
4db4a6
-              cbnz    w1, loop               // 35FFFE81 (cbnz w1, #-48 <loop>)
4db4a6
+              cbnz    w3, loop               // 35FFFF03
4db4a6
             out:
4db4a6
          */
4db4a6
          switch (i->ARM64in.CASP.szB) {
4db4a6
@@ -4277,15 +4278,15 @@ Int emit_ARM64Instr ( /*MB_MOD*/Bool* is_profInc,
4db4a6
             default: vassert(0);
4db4a6
          }
4db4a6
          *p++ = 0xEB08001F;
4db4a6
-         *p++ = 0x540000E1;
4db4a6
-         *p++ = 0xEB09003F;
4db4a6
          *p++ = 0x540000A1;
4db4a6
+         *p++ = 0xEB09003F;
4db4a6
+         *p++ = 0x54000061;
4db4a6
          switch (i->ARM64in.CASP.szB) {
4db4a6
-            case 8:  *p++ = 0xC8211C46; break;
4db4a6
-            case 4:  *p++ = 0x88211C46; break;
4db4a6
+            case 8:  *p++ = 0xC8231C46; break;
4db4a6
+            case 4:  *p++ = 0x88231C46; break;
4db4a6
             default: vassert(0);
4db4a6
          }
4db4a6
-         *p++ = 0x35FFFE81;
4db4a6
+         *p++ = 0x35FFFF03;
4db4a6
          goto done;
4db4a6
       }
4db4a6
       case ARM64in_MFence: {
4db4a6
diff --git a/VEX/priv/host_arm64_defs.h b/VEX/priv/host_arm64_defs.h
4db4a6
index f0737f2c6..01fb5708e 100644
4db4a6
--- a/VEX/priv/host_arm64_defs.h
4db4a6
+++ b/VEX/priv/host_arm64_defs.h
4db4a6
@@ -720,6 +720,7 @@ typedef
4db4a6
             Int  szB; /* 1, 2, 4 or 8 */
4db4a6
          } StrEX;
4db4a6
          /* x1 = CAS(x3(addr), x5(expected) -> x7(new)),
4db4a6
+            and trashes x8
4db4a6
             where x1[8*szB-1 : 0] == x5[8*szB-1 : 0] indicates success,
4db4a6
                   x1[8*szB-1 : 0] != x5[8*szB-1 : 0] indicates failure.
4db4a6
             Uses x8 as scratch (but that's not allocatable).
4db4a6
@@ -738,7 +739,7 @@ typedef
4db4a6
             -- if branch taken, failure; x1[[8*szB-1 : 0] holds old value
4db4a6
             -- attempt to store
4db4a6
             stxr    w8, x7, [x3]
4db4a6
-            -- if store successful, x1==0, so the eor is "x1 := x5"
4db4a6
+            -- if store successful, x8==0
4db4a6
             -- if store failed,     branch back and try again.
4db4a6
             cbne    w8, loop
4db4a6
            after:
4db4a6
@@ -746,6 +747,12 @@ typedef
4db4a6
          struct {
4db4a6
             Int szB; /* 1, 2, 4 or 8 */
4db4a6
          } CAS;
4db4a6
+         /* Doubleword CAS, 2 x 32 bit or 2 x 64 bit
4db4a6
+            x0(oldLSW),x1(oldMSW)
4db4a6
+               = DCAS(x2(addr), x4(expectedLSW),x5(expectedMSW)
4db4a6
+                                -> x6(newLSW),x7(newMSW))
4db4a6
+            and trashes x8, x9 and x3
4db4a6
+         */
4db4a6
          struct {
4db4a6
             Int szB; /* 4 or 8 */
4db4a6
          } CASP;