Mark Wielaard d3173f
commit a4b7b67db47021c424c18a5729f250016d34df27
Mark Wielaard d3173f
Author: mjw <mjw@8f6e269a-dfd6-0310-a8e1-e2731360e62c>
Mark Wielaard d3173f
Date:   Tue Aug 27 10:19:03 2013 +0000
Mark Wielaard d3173f
Mark Wielaard d3173f
    Support mmxext (integer sse) subset on i386 (athlon).
Mark Wielaard d3173f
    
Mark Wielaard d3173f
    Some processors like the AMD Athlon "Classic" support mmxext,
Mark Wielaard d3173f
    an sse1 subset. This subset is not properly detected by VEX.
Mark Wielaard d3173f
    The subset uses the same encoding as the sse1 instructions.
Mark Wielaard d3173f
    
Mark Wielaard d3173f
    The subset is described at:
Mark Wielaard d3173f
      http://support.amd.com/us/Embedded_TechDocs/22466.pdf
Mark Wielaard d3173f
      https://en.wikipedia.org/wiki/3DNow!#3DNow.21_extensions
Mark Wielaard d3173f
    
Mark Wielaard d3173f
    This introduces a new VEX_HWCAPS_X86_MMXEXT that sits between
Mark Wielaard d3173f
    the baseline (0) and VEX_HWCAPS_X86_SSE1. There is also a new
Mark Wielaard d3173f
    x86g_dirtyhelper_CPUID_mmxext that mimics an Athlon "Classic"
Mark Wielaard d3173f
    (Model 2, K75 "Pluto/Orion").
Mark Wielaard d3173f
    
Mark Wielaard d3173f
    Groups all mmxext instructions together in one block.
Mark Wielaard d3173f
    
Mark Wielaard d3173f
    git-svn-id: svn://svn.valgrind.org/vex/trunk@2745 8f6e269a-dfd6-0310-a8e1-e2731360e62c
Mark Wielaard d3173f
Mark Wielaard d3173f
diff --git a/VEX/priv/guest_x86_defs.h b/VEX/priv/guest_x86_defs.h
Mark Wielaard d3173f
index 389e6bb..1a16a0b 100644
Mark Wielaard d3173f
--- a/VEX/priv/guest_x86_defs.h
Mark Wielaard d3173f
+++ b/VEX/priv/guest_x86_defs.h
Mark Wielaard d3173f
@@ -144,6 +144,7 @@ extern ULong x86g_dirtyhelper_loadF80le  ( UInt );
Mark Wielaard d3173f
 extern void  x86g_dirtyhelper_storeF80le ( UInt, ULong );
Mark Wielaard d3173f
 
Mark Wielaard d3173f
 extern void  x86g_dirtyhelper_CPUID_sse0 ( VexGuestX86State* );
Mark Wielaard d3173f
+extern void  x86g_dirtyhelper_CPUID_mmxext ( VexGuestX86State* );
Mark Wielaard d3173f
 extern void  x86g_dirtyhelper_CPUID_sse1 ( VexGuestX86State* );
Mark Wielaard d3173f
 extern void  x86g_dirtyhelper_CPUID_sse2 ( VexGuestX86State* );
Mark Wielaard d3173f
 
Mark Wielaard d3173f
diff --git a/VEX/priv/guest_x86_helpers.c b/VEX/priv/guest_x86_helpers.c
Mark Wielaard d3173f
index 9c26794..e87e89f 100644
Mark Wielaard d3173f
--- a/VEX/priv/guest_x86_helpers.c
Mark Wielaard d3173f
+++ b/VEX/priv/guest_x86_helpers.c
Mark Wielaard d3173f
@@ -2207,6 +2207,63 @@ void x86g_dirtyhelper_CPUID_sse0 ( VexGuestX86State* st )
Mark Wielaard d3173f
 
Mark Wielaard d3173f
 /* CALLED FROM GENERATED CODE */
Mark Wielaard d3173f
 /* DIRTY HELPER (modifies guest state) */
Mark Wielaard d3173f
+/* Claim to be an Athlon "Classic" (Model 2, K75 "Pluto/Orion"), */
Mark Wielaard d3173f
+/* but without 3DNow support (weird, but we really don't support it). */
Mark Wielaard d3173f
+void x86g_dirtyhelper_CPUID_mmxext ( VexGuestX86State* st )
Mark Wielaard d3173f
+{
Mark Wielaard d3173f
+   switch (st->guest_EAX) { /* guest_EAX selects the leaf; guest EAX..EDX are overwritten */
Mark Wielaard d3173f
+      /* Leaf 0: vendor ID; EBX,EDX,ECX spell "AuthenticAMD" */
Mark Wielaard d3173f
+      case 0:
Mark Wielaard d3173f
+         st->guest_EAX = 0x1; /* highest standard leaf reported */
Mark Wielaard d3173f
+         st->guest_EBX = 0x68747541; /* "Auth" */
Mark Wielaard d3173f
+         st->guest_ECX = 0x444d4163; /* "cAMD" */
Mark Wielaard d3173f
+         st->guest_EDX = 0x69746e65; /* "enti" */
Mark Wielaard d3173f
+         break;
Mark Wielaard d3173f
+      /* Leaf 1: processor signature and feature bits */
Mark Wielaard d3173f
+      case 1:
Mark Wielaard d3173f
+         st->guest_EAX = 0x621; /* family 6, model 2, stepping 1 */
Mark Wielaard d3173f
+         st->guest_EBX = 0x0;
Mark Wielaard d3173f
+         st->guest_ECX = 0x0;
Mark Wielaard d3173f
+         st->guest_EDX = 0x183f9ff; /* MMX (bit 23), FXSR (bit 24); SSE (bit 25) clear */
Mark Wielaard d3173f
+         break;
Mark Wielaard d3173f
+      /* Highest Extended Function Supported (0x80000004 brand string) */
Mark Wielaard d3173f
+      case 0x80000000:
Mark Wielaard d3173f
+         st->guest_EAX = 0x80000004;
Mark Wielaard d3173f
+         st->guest_EBX = 0x68747541; /* "Auth" */
Mark Wielaard d3173f
+         st->guest_ECX = 0x444d4163; /* "cAMD" */
Mark Wielaard d3173f
+         st->guest_EDX = 0x69746e65; /* "enti" */
Mark Wielaard d3173f
+         break;
Mark Wielaard d3173f
+      /* Extended Processor Info and Feature Bits */
Mark Wielaard d3173f
+      case 0x80000001:
Mark Wielaard d3173f
+         st->guest_EAX = 0x721;
Mark Wielaard d3173f
+         st->guest_EBX = 0x0;
Mark Wielaard d3173f
+         st->guest_ECX = 0x0;
Mark Wielaard d3173f
+         st->guest_EDX = 0x1c3f9ff; /* MMXEXT (bit 22) set. Note no 3DNow (bits 30/31 clear). */
Mark Wielaard d3173f
+         break;
Mark Wielaard d3173f
+      /* Processor Brand String "AMD Athlon(tm) Processor" */
Mark Wielaard d3173f
+      case 0x80000002:
Mark Wielaard d3173f
+         st->guest_EAX = 0x20444d41; /* "AMD " */
Mark Wielaard d3173f
+         st->guest_EBX = 0x6c687441; /* "Athl" */
Mark Wielaard d3173f
+         st->guest_ECX = 0x74286e6f; /* "on(t" */
Mark Wielaard d3173f
+         st->guest_EDX = 0x5020296d; /* "m) P" */
Mark Wielaard d3173f
+         break;
Mark Wielaard d3173f
+      case 0x80000003: /* brand string, continued */
Mark Wielaard d3173f
+         st->guest_EAX = 0x65636f72; /* "roce" */
Mark Wielaard d3173f
+         st->guest_EBX = 0x726f7373; /* "ssor" */
Mark Wielaard d3173f
+         st->guest_ECX = 0x0;
Mark Wielaard d3173f
+         st->guest_EDX = 0x0;
Mark Wielaard d3173f
+         break;
Mark Wielaard d3173f
+      default: /* any other leaf (including 0x80000004) reads as all zeroes */
Mark Wielaard d3173f
+         st->guest_EAX = 0x0;
Mark Wielaard d3173f
+         st->guest_EBX = 0x0;
Mark Wielaard d3173f
+         st->guest_ECX = 0x0;
Mark Wielaard d3173f
+         st->guest_EDX = 0x0;
Mark Wielaard d3173f
+         break;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+}
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+/* CALLED FROM GENERATED CODE */
Mark Wielaard d3173f
+/* DIRTY HELPER (modifies guest state) */
Mark Wielaard d3173f
 /* Claim to be the following SSE1-capable CPU:
Mark Wielaard d3173f
    vendor_id       : GenuineIntel
Mark Wielaard d3173f
    cpu family      : 6
Mark Wielaard d3173f
diff --git a/VEX/priv/guest_x86_toIR.c b/VEX/priv/guest_x86_toIR.c
Mark Wielaard d3173f
index 90499b0..e98f19c 100644
Mark Wielaard d3173f
--- a/VEX/priv/guest_x86_toIR.c
Mark Wielaard d3173f
+++ b/VEX/priv/guest_x86_toIR.c
Mark Wielaard d3173f
@@ -8318,7 +8318,18 @@ DisResult disInstr_X86_WRK (
Mark Wielaard d3173f
       guest subarchitecture. */
Mark Wielaard d3173f
    if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
Mark Wielaard d3173f
       goto after_sse_decoders;
Mark Wielaard d3173f
-   
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* With mmxext only some extended MMX instructions are recognized.
Mark Wielaard d3173f
+      The mmxext instructions are MASKMOVQ MOVNTQ PAVGB PAVGW PMAXSW
Mark Wielaard d3173f
+      PMAXUB PMINSW PMINUB PMULHUW PSADBW PSHUFW PEXTRW PINSRW PMOVMSKB
Mark Wielaard d3173f
+      PREFETCHNTA PREFETCHT0 PREFETCHT1 PREFETCHT2 SFENCE
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+      http://support.amd.com/us/Embedded_TechDocs/22466.pdf
Mark Wielaard d3173f
+      https://en.wikipedia.org/wiki/3DNow!#3DNow.21_extensions */
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
Mark Wielaard d3173f
+      goto mmxext;
Mark Wielaard d3173f
+
Mark Wielaard d3173f
    /* Otherwise we must be doing sse1 or sse2, so we can at least try
Mark Wielaard d3173f
       for SSE1 here. */
Mark Wielaard d3173f
 
Mark Wielaard d3173f
@@ -8627,6 +8638,11 @@ DisResult disInstr_X86_WRK (
Mark Wielaard d3173f
       goto decode_success;
Mark Wielaard d3173f
    }
Mark Wielaard d3173f
 
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* mmxext sse1 subset starts here. mmxext only arches will parse
Mark Wielaard d3173f
+      only this subset of the sse1 instructions. */
Mark Wielaard d3173f
+  mmxext:
Mark Wielaard d3173f
+
Mark Wielaard d3173f
    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
Mark Wielaard d3173f
    /* 0F F7 = MASKMOVQ -- 8x8 masked store */
Mark Wielaard d3173f
    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
Mark Wielaard d3173f
@@ -8637,203 +8653,6 @@ DisResult disInstr_X86_WRK (
Mark Wielaard d3173f
       goto decode_success;
Mark Wielaard d3173f
    }
Mark Wielaard d3173f
 
Mark Wielaard d3173f
-   /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
Mark Wielaard d3173f
-   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
Mark Wielaard d3173f
-      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
Mark Wielaard d3173f
-   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
Mark Wielaard d3173f
-      vassert(sz == 4);
Mark Wielaard d3173f
-      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
Mark Wielaard d3173f
-   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
Mark Wielaard d3173f
-      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
Mark Wielaard d3173f
-   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
Mark Wielaard d3173f
-      vassert(sz == 4);
Mark Wielaard d3173f
-      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
Mark Wielaard d3173f
-   /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
Mark Wielaard d3173f
-   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
Mark Wielaard d3173f
-      modrm = getIByte(delta+2);
Mark Wielaard d3173f
-      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
-         putXMMReg( gregOfRM(modrm), 
Mark Wielaard d3173f
-                    getXMMReg( eregOfRM(modrm) ));
Mark Wielaard d3173f
-         DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
Mark Wielaard d3173f
-                                  nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
-         delta += 2+1;
Mark Wielaard d3173f
-      } else {
Mark Wielaard d3173f
-         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
Mark Wielaard d3173f
-         if (insn[1] == 0x28/*movaps*/)
Mark Wielaard d3173f
-            gen_SEGV_if_not_16_aligned( addr );
Mark Wielaard d3173f
-         putXMMReg( gregOfRM(modrm), 
Mark Wielaard d3173f
-                    loadLE(Ity_V128, mkexpr(addr)) );
Mark Wielaard d3173f
-         DIP("mov[ua]ps %s,%s\n", dis_buf,
Mark Wielaard d3173f
-                                  nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
-         delta += 2+alen;
Mark Wielaard d3173f
-      }
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
Mark Wielaard d3173f
-   /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
Mark Wielaard d3173f
-   if (sz == 4 && insn[0] == 0x0F 
Mark Wielaard d3173f
-       && (insn[1] == 0x29 || insn[1] == 0x11)) {
Mark Wielaard d3173f
-      modrm = getIByte(delta+2);
Mark Wielaard d3173f
-      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
-         /* fall through; awaiting test case */
Mark Wielaard d3173f
-      } else {
Mark Wielaard d3173f
-         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
Mark Wielaard d3173f
-         if (insn[1] == 0x29/*movaps*/)
Mark Wielaard d3173f
-            gen_SEGV_if_not_16_aligned( addr );
Mark Wielaard d3173f
-         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
Mark Wielaard d3173f
-         DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
Mark Wielaard d3173f
-                                  dis_buf );
Mark Wielaard d3173f
-         delta += 2+alen;
Mark Wielaard d3173f
-         goto decode_success;
Mark Wielaard d3173f
-      }
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
Mark Wielaard d3173f
-   /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
Mark Wielaard d3173f
-   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
Mark Wielaard d3173f
-      modrm = getIByte(delta+2);
Mark Wielaard d3173f
-      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
-         delta += 2+1;
Mark Wielaard d3173f
-         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
Mark Wielaard d3173f
-                          getXMMRegLane64( eregOfRM(modrm), 0 ) );
Mark Wielaard d3173f
-         DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)), 
Mark Wielaard d3173f
-                               nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
-      } else {
Mark Wielaard d3173f
-         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
Mark Wielaard d3173f
-         delta += 2+alen;
Mark Wielaard d3173f
-         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
Mark Wielaard d3173f
-                          loadLE(Ity_I64, mkexpr(addr)) );
Mark Wielaard d3173f
-         DIP("movhps %s,%s\n", dis_buf, 
Mark Wielaard d3173f
-                               nameXMMReg( gregOfRM(modrm) ));
Mark Wielaard d3173f
-      }
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
Mark Wielaard d3173f
-   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
Mark Wielaard d3173f
-      if (!epartIsReg(insn[2])) {
Mark Wielaard d3173f
-         delta += 2;
Mark Wielaard d3173f
-         addr = disAMode ( &alen, sorb, delta, dis_buf );
Mark Wielaard d3173f
-         delta += alen;
Mark Wielaard d3173f
-         storeLE( mkexpr(addr), 
Mark Wielaard d3173f
-                  getXMMRegLane64( gregOfRM(insn[2]),
Mark Wielaard d3173f
-                                   1/*upper lane*/ ) );
Mark Wielaard d3173f
-         DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
Mark Wielaard d3173f
-                               dis_buf);
Mark Wielaard d3173f
-         goto decode_success;
Mark Wielaard d3173f
-      }
Mark Wielaard d3173f
-      /* else fall through */
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
Mark Wielaard d3173f
-   /* OF 12 = MOVHLPS -- from from hi half to lo half of XMM. */
Mark Wielaard d3173f
-   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
Mark Wielaard d3173f
-      modrm = getIByte(delta+2);
Mark Wielaard d3173f
-      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
-         delta += 2+1;
Mark Wielaard d3173f
-         putXMMRegLane64( gregOfRM(modrm),  
Mark Wielaard d3173f
-                          0/*lower lane*/,
Mark Wielaard d3173f
-                          getXMMRegLane64( eregOfRM(modrm), 1 ));
Mark Wielaard d3173f
-         DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)), 
Mark Wielaard d3173f
-                                 nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
-      } else {
Mark Wielaard d3173f
-         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
Mark Wielaard d3173f
-         delta += 2+alen;
Mark Wielaard d3173f
-         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
Mark Wielaard d3173f
-                          loadLE(Ity_I64, mkexpr(addr)) );
Mark Wielaard d3173f
-         DIP("movlps %s, %s\n", 
Mark Wielaard d3173f
-             dis_buf, nameXMMReg( gregOfRM(modrm) ));
Mark Wielaard d3173f
-      }
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
Mark Wielaard d3173f
-   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
Mark Wielaard d3173f
-      if (!epartIsReg(insn[2])) {
Mark Wielaard d3173f
-         delta += 2;
Mark Wielaard d3173f
-         addr = disAMode ( &alen, sorb, delta, dis_buf );
Mark Wielaard d3173f
-         delta += alen;
Mark Wielaard d3173f
-         storeLE( mkexpr(addr), 
Mark Wielaard d3173f
-                  getXMMRegLane64( gregOfRM(insn[2]), 
Mark Wielaard d3173f
-                                   0/*lower lane*/ ) );
Mark Wielaard d3173f
-         DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
Mark Wielaard d3173f
-                                dis_buf);
Mark Wielaard d3173f
-         goto decode_success;
Mark Wielaard d3173f
-      }
Mark Wielaard d3173f
-      /* else fall through */
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
Mark Wielaard d3173f
-      to 4 lowest bits of ireg(G) */
Mark Wielaard d3173f
-   if (insn[0] == 0x0F && insn[1] == 0x50) {
Mark Wielaard d3173f
-      modrm = getIByte(delta+2);
Mark Wielaard d3173f
-      if (sz == 4 && epartIsReg(modrm)) {
Mark Wielaard d3173f
-         Int src;
Mark Wielaard d3173f
-         t0 = newTemp(Ity_I32);
Mark Wielaard d3173f
-         t1 = newTemp(Ity_I32);
Mark Wielaard d3173f
-         t2 = newTemp(Ity_I32);
Mark Wielaard d3173f
-         t3 = newTemp(Ity_I32);
Mark Wielaard d3173f
-         delta += 2+1;
Mark Wielaard d3173f
-         src = eregOfRM(modrm);
Mark Wielaard d3173f
-         assign( t0, binop( Iop_And32,
Mark Wielaard d3173f
-                            binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
Mark Wielaard d3173f
-                            mkU32(1) ));
Mark Wielaard d3173f
-         assign( t1, binop( Iop_And32,
Mark Wielaard d3173f
-                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
Mark Wielaard d3173f
-                            mkU32(2) ));
Mark Wielaard d3173f
-         assign( t2, binop( Iop_And32,
Mark Wielaard d3173f
-                            binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
Mark Wielaard d3173f
-                            mkU32(4) ));
Mark Wielaard d3173f
-         assign( t3, binop( Iop_And32,
Mark Wielaard d3173f
-                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
Mark Wielaard d3173f
-                            mkU32(8) ));
Mark Wielaard d3173f
-         putIReg(4, gregOfRM(modrm),
Mark Wielaard d3173f
-                    binop(Iop_Or32,
Mark Wielaard d3173f
-                          binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
Mark Wielaard d3173f
-                          binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
Mark Wielaard d3173f
-                         )
Mark Wielaard d3173f
-                 );
Mark Wielaard d3173f
-         DIP("movmskps %s,%s\n", nameXMMReg(src), 
Mark Wielaard d3173f
-                                 nameIReg(4, gregOfRM(modrm)));
Mark Wielaard d3173f
-         goto decode_success;
Mark Wielaard d3173f
-      }
Mark Wielaard d3173f
-      /* else fall through */
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
Mark Wielaard d3173f
-   /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
Mark Wielaard d3173f
-   if (insn[0] == 0x0F && insn[1] == 0x2B) {
Mark Wielaard d3173f
-      modrm = getIByte(delta+2);
Mark Wielaard d3173f
-      if (!epartIsReg(modrm)) {
Mark Wielaard d3173f
-         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
Mark Wielaard d3173f
-         gen_SEGV_if_not_16_aligned( addr );
Mark Wielaard d3173f
-         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
Mark Wielaard d3173f
-         DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
Mark Wielaard d3173f
-                                 dis_buf,
Mark Wielaard d3173f
-                                 nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
-         delta += 2+alen;
Mark Wielaard d3173f
-         goto decode_success;
Mark Wielaard d3173f
-      }
Mark Wielaard d3173f
-      /* else fall through */
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
Mark Wielaard d3173f
    /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
Mark Wielaard d3173f
       Intel manual does not say anything about the usual business of
Mark Wielaard d3173f
@@ -8854,70 +8673,6 @@ DisResult disInstr_X86_WRK (
Mark Wielaard d3173f
       /* else fall through */
Mark Wielaard d3173f
    }
Mark Wielaard d3173f
 
Mark Wielaard d3173f
-   /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
Mark Wielaard d3173f
-      (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
Mark Wielaard d3173f
-   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
Mark Wielaard d3173f
-      vassert(sz == 4);
Mark Wielaard d3173f
-      modrm = getIByte(delta+3);
Mark Wielaard d3173f
-      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
-         putXMMRegLane32( gregOfRM(modrm), 0,
Mark Wielaard d3173f
-                          getXMMRegLane32( eregOfRM(modrm), 0 ));
Mark Wielaard d3173f
-         DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
Mark Wielaard d3173f
-                              nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
-         delta += 3+1;
Mark Wielaard d3173f
-      } else {
Mark Wielaard d3173f
-         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
Mark Wielaard d3173f
-         /* zero bits 127:64 */
Mark Wielaard d3173f
-         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) ); 
Mark Wielaard d3173f
-         /* zero bits 63:32 */
Mark Wielaard d3173f
-         putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) ); 
Mark Wielaard d3173f
-         /* write bits 31:0 */
Mark Wielaard d3173f
-         putXMMRegLane32( gregOfRM(modrm), 0,
Mark Wielaard d3173f
-                          loadLE(Ity_I32, mkexpr(addr)) );
Mark Wielaard d3173f
-         DIP("movss %s,%s\n", dis_buf,
Mark Wielaard d3173f
-                              nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
-         delta += 3+alen;
Mark Wielaard d3173f
-      }
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
Mark Wielaard d3173f
-      or lo 1/4 xmm). */
Mark Wielaard d3173f
-   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
Mark Wielaard d3173f
-      vassert(sz == 4);
Mark Wielaard d3173f
-      modrm = getIByte(delta+3);
Mark Wielaard d3173f
-      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
-         /* fall through, we don't yet have a test case */
Mark Wielaard d3173f
-      } else {
Mark Wielaard d3173f
-         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
Mark Wielaard d3173f
-         storeLE( mkexpr(addr),
Mark Wielaard d3173f
-                  getXMMRegLane32(gregOfRM(modrm), 0) );
Mark Wielaard d3173f
-         DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
Mark Wielaard d3173f
-                              dis_buf);
Mark Wielaard d3173f
-         delta += 3+alen;
Mark Wielaard d3173f
-         goto decode_success;
Mark Wielaard d3173f
-      }
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
Mark Wielaard d3173f
-   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
Mark Wielaard d3173f
-      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
Mark Wielaard d3173f
-   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
Mark Wielaard d3173f
-      vassert(sz == 4);
Mark Wielaard d3173f
-      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
-   /* 0F 56 = ORPS -- G = G and E */
Mark Wielaard d3173f
-   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
Mark Wielaard d3173f
-      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
    /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
Mark Wielaard d3173f
    /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
Mark Wielaard d3173f
    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
Mark Wielaard d3173f
@@ -9173,6 +8928,284 @@ DisResult disInstr_X86_WRK (
Mark Wielaard d3173f
       goto decode_success;
Mark Wielaard d3173f
    }
Mark Wielaard d3173f
 
Mark Wielaard d3173f
+   /* 0F AE /7 = SFENCE -- flush pending operations to memory */
Mark Wielaard d3173f
+   if (insn[0] == 0x0F && insn[1] == 0xAE
Mark Wielaard d3173f
+       && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
Mark Wielaard d3173f
+      vassert(sz == 4);
Mark Wielaard d3173f
+      delta += 3;
Mark Wielaard d3173f
+      /* Insert a memory fence.  It's sometimes important that these
Mark Wielaard d3173f
+         are carried through to the generated code. */
Mark Wielaard d3173f
+      stmt( IRStmt_MBE(Imbe_Fence) );
Mark Wielaard d3173f
+      DIP("sfence\n");
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* End of mmxext sse1 subset. No more sse parsing for mmxext only arches. */
Mark Wielaard d3173f
+   if (archinfo->hwcaps == VEX_HWCAPS_X86_MMXEXT/*integer only sse1 subset*/)
Mark Wielaard d3173f
+      goto after_sse_decoders;
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
Mark Wielaard d3173f
+   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
Mark Wielaard d3173f
+      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
Mark Wielaard d3173f
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
Mark Wielaard d3173f
+      vassert(sz == 4);
Mark Wielaard d3173f
+      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
Mark Wielaard d3173f
+   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
Mark Wielaard d3173f
+      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
Mark Wielaard d3173f
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
Mark Wielaard d3173f
+      vassert(sz == 4);
Mark Wielaard d3173f
+      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
Mark Wielaard d3173f
+   /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
Mark Wielaard d3173f
+   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
Mark Wielaard d3173f
+      modrm = getIByte(delta+2);
Mark Wielaard d3173f
+      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
+         putXMMReg( gregOfRM(modrm), 
Mark Wielaard d3173f
+                    getXMMReg( eregOfRM(modrm) ));
Mark Wielaard d3173f
+         DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
Mark Wielaard d3173f
+                                  nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
+         delta += 2+1;
Mark Wielaard d3173f
+      } else {
Mark Wielaard d3173f
+         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
Mark Wielaard d3173f
+         if (insn[1] == 0x28/*movaps*/)
Mark Wielaard d3173f
+            gen_SEGV_if_not_16_aligned( addr );
Mark Wielaard d3173f
+         putXMMReg( gregOfRM(modrm), 
Mark Wielaard d3173f
+                    loadLE(Ity_V128, mkexpr(addr)) );
Mark Wielaard d3173f
+         DIP("mov[ua]ps %s,%s\n", dis_buf,
Mark Wielaard d3173f
+                                  nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
+         delta += 2+alen;
Mark Wielaard d3173f
+      }
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
Mark Wielaard d3173f
+   /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
Mark Wielaard d3173f
+   if (sz == 4 && insn[0] == 0x0F 
Mark Wielaard d3173f
+       && (insn[1] == 0x29 || insn[1] == 0x11)) {
Mark Wielaard d3173f
+      modrm = getIByte(delta+2);
Mark Wielaard d3173f
+      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
+         /* fall through; awaiting test case */
Mark Wielaard d3173f
+      } else {
Mark Wielaard d3173f
+         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
Mark Wielaard d3173f
+         if (insn[1] == 0x29/*movaps*/)
Mark Wielaard d3173f
+            gen_SEGV_if_not_16_aligned( addr );
Mark Wielaard d3173f
+         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
Mark Wielaard d3173f
+         DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
Mark Wielaard d3173f
+                                  dis_buf );
Mark Wielaard d3173f
+         delta += 2+alen;
Mark Wielaard d3173f
+         goto decode_success;
Mark Wielaard d3173f
+      }
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
Mark Wielaard d3173f
+   /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
Mark Wielaard d3173f
+   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
Mark Wielaard d3173f
+      modrm = getIByte(delta+2);
Mark Wielaard d3173f
+      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
+         delta += 2+1;
Mark Wielaard d3173f
+         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
Mark Wielaard d3173f
+                          getXMMRegLane64( eregOfRM(modrm), 0 ) );
Mark Wielaard d3173f
+         DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)), 
Mark Wielaard d3173f
+                               nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
+      } else {
Mark Wielaard d3173f
+         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
Mark Wielaard d3173f
+         delta += 2+alen;
Mark Wielaard d3173f
+         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
Mark Wielaard d3173f
+                          loadLE(Ity_I64, mkexpr(addr)) );
Mark Wielaard d3173f
+         DIP("movhps %s,%s\n", dis_buf, 
Mark Wielaard d3173f
+                               nameXMMReg( gregOfRM(modrm) ));
Mark Wielaard d3173f
+      }
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
Mark Wielaard d3173f
+   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
Mark Wielaard d3173f
+      if (!epartIsReg(insn[2])) {
Mark Wielaard d3173f
+         delta += 2;
Mark Wielaard d3173f
+         addr = disAMode ( &alen, sorb, delta, dis_buf );
Mark Wielaard d3173f
+         delta += alen;
Mark Wielaard d3173f
+         storeLE( mkexpr(addr), 
Mark Wielaard d3173f
+                  getXMMRegLane64( gregOfRM(insn[2]),
Mark Wielaard d3173f
+                                   1/*upper lane*/ ) );
Mark Wielaard d3173f
+         DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
Mark Wielaard d3173f
+                               dis_buf);
Mark Wielaard d3173f
+         goto decode_success;
Mark Wielaard d3173f
+      }
Mark Wielaard d3173f
+      /* else fall through */
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
Mark Wielaard d3173f
+   /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
Mark Wielaard d3173f
+   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
Mark Wielaard d3173f
+      modrm = getIByte(delta+2);
Mark Wielaard d3173f
+      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
+         delta += 2+1;
Mark Wielaard d3173f
+         putXMMRegLane64( gregOfRM(modrm),  
Mark Wielaard d3173f
+                          0/*lower lane*/,
Mark Wielaard d3173f
+                          getXMMRegLane64( eregOfRM(modrm), 1 ));
Mark Wielaard d3173f
+         DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)), 
Mark Wielaard d3173f
+                                 nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
+      } else {
Mark Wielaard d3173f
+         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
Mark Wielaard d3173f
+         delta += 2+alen;
Mark Wielaard d3173f
+         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
Mark Wielaard d3173f
+                          loadLE(Ity_I64, mkexpr(addr)) );
Mark Wielaard d3173f
+         DIP("movlps %s, %s\n", 
Mark Wielaard d3173f
+             dis_buf, nameXMMReg( gregOfRM(modrm) ));
Mark Wielaard d3173f
+      }
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
Mark Wielaard d3173f
+   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
Mark Wielaard d3173f
+      if (!epartIsReg(insn[2])) {
Mark Wielaard d3173f
+         delta += 2;
Mark Wielaard d3173f
+         addr = disAMode ( &alen, sorb, delta, dis_buf );
Mark Wielaard d3173f
+         delta += alen;
Mark Wielaard d3173f
+         storeLE( mkexpr(addr), 
Mark Wielaard d3173f
+                  getXMMRegLane64( gregOfRM(insn[2]), 
Mark Wielaard d3173f
+                                   0/*lower lane*/ ) );
Mark Wielaard d3173f
+         DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
Mark Wielaard d3173f
+                                dis_buf);
Mark Wielaard d3173f
+         goto decode_success;
Mark Wielaard d3173f
+      }
Mark Wielaard d3173f
+      /* else fall through */
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
Mark Wielaard d3173f
+      to 4 lowest bits of ireg(G) */
Mark Wielaard d3173f
+   if (insn[0] == 0x0F && insn[1] == 0x50) {
Mark Wielaard d3173f
+      modrm = getIByte(delta+2);
Mark Wielaard d3173f
+      if (sz == 4 && epartIsReg(modrm)) {
Mark Wielaard d3173f
+         Int src;
Mark Wielaard d3173f
+         t0 = newTemp(Ity_I32);
Mark Wielaard d3173f
+         t1 = newTemp(Ity_I32);
Mark Wielaard d3173f
+         t2 = newTemp(Ity_I32);
Mark Wielaard d3173f
+         t3 = newTemp(Ity_I32);
Mark Wielaard d3173f
+         delta += 2+1;
Mark Wielaard d3173f
+         src = eregOfRM(modrm);
Mark Wielaard d3173f
+         assign( t0, binop( Iop_And32,
Mark Wielaard d3173f
+                            binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
Mark Wielaard d3173f
+                            mkU32(1) ));
Mark Wielaard d3173f
+         assign( t1, binop( Iop_And32,
Mark Wielaard d3173f
+                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
Mark Wielaard d3173f
+                            mkU32(2) ));
Mark Wielaard d3173f
+         assign( t2, binop( Iop_And32,
Mark Wielaard d3173f
+                            binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
Mark Wielaard d3173f
+                            mkU32(4) ));
Mark Wielaard d3173f
+         assign( t3, binop( Iop_And32,
Mark Wielaard d3173f
+                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
Mark Wielaard d3173f
+                            mkU32(8) ));
Mark Wielaard d3173f
+         putIReg(4, gregOfRM(modrm),
Mark Wielaard d3173f
+                    binop(Iop_Or32,
Mark Wielaard d3173f
+                          binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
Mark Wielaard d3173f
+                          binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
Mark Wielaard d3173f
+                         )
Mark Wielaard d3173f
+                 );
Mark Wielaard d3173f
+         DIP("movmskps %s,%s\n", nameXMMReg(src), 
Mark Wielaard d3173f
+                                 nameIReg(4, gregOfRM(modrm)));
Mark Wielaard d3173f
+         goto decode_success;
Mark Wielaard d3173f
+      }
Mark Wielaard d3173f
+      /* else fall through */
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
Mark Wielaard d3173f
+   /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
Mark Wielaard d3173f
+   if (insn[0] == 0x0F && insn[1] == 0x2B) {
Mark Wielaard d3173f
+      modrm = getIByte(delta+2);
Mark Wielaard d3173f
+      if (!epartIsReg(modrm)) {
Mark Wielaard d3173f
+         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
Mark Wielaard d3173f
+         gen_SEGV_if_not_16_aligned( addr );
Mark Wielaard d3173f
+         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
Mark Wielaard d3173f
+         DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
Mark Wielaard d3173f
+                                 dis_buf,
Mark Wielaard d3173f
+                                 nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
+         delta += 2+alen;
Mark Wielaard d3173f
+         goto decode_success;
Mark Wielaard d3173f
+      }
Mark Wielaard d3173f
+      /* else fall through */
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
Mark Wielaard d3173f
+      (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
Mark Wielaard d3173f
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
Mark Wielaard d3173f
+      vassert(sz == 4);
Mark Wielaard d3173f
+      modrm = getIByte(delta+3);
Mark Wielaard d3173f
+      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
+         putXMMRegLane32( gregOfRM(modrm), 0,
Mark Wielaard d3173f
+                          getXMMRegLane32( eregOfRM(modrm), 0 ));
Mark Wielaard d3173f
+         DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
Mark Wielaard d3173f
+                              nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
+         delta += 3+1;
Mark Wielaard d3173f
+      } else {
Mark Wielaard d3173f
+         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
Mark Wielaard d3173f
+         /* zero bits 127:64 */
Mark Wielaard d3173f
+         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) ); 
Mark Wielaard d3173f
+         /* zero bits 63:32 */
Mark Wielaard d3173f
+         putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) ); 
Mark Wielaard d3173f
+         /* write bits 31:0 */
Mark Wielaard d3173f
+         putXMMRegLane32( gregOfRM(modrm), 0,
Mark Wielaard d3173f
+                          loadLE(Ity_I32, mkexpr(addr)) );
Mark Wielaard d3173f
+         DIP("movss %s,%s\n", dis_buf,
Mark Wielaard d3173f
+                              nameXMMReg(gregOfRM(modrm)));
Mark Wielaard d3173f
+         delta += 3+alen;
Mark Wielaard d3173f
+      }
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
Mark Wielaard d3173f
+      or lo 1/4 xmm). */
Mark Wielaard d3173f
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
Mark Wielaard d3173f
+      vassert(sz == 4);
Mark Wielaard d3173f
+      modrm = getIByte(delta+3);
Mark Wielaard d3173f
+      if (epartIsReg(modrm)) {
Mark Wielaard d3173f
+         /* fall through, we don't yet have a test case */
Mark Wielaard d3173f
+      } else {
Mark Wielaard d3173f
+         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
Mark Wielaard d3173f
+         storeLE( mkexpr(addr),
Mark Wielaard d3173f
+                  getXMMRegLane32(gregOfRM(modrm), 0) );
Mark Wielaard d3173f
+         DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
Mark Wielaard d3173f
+                              dis_buf);
Mark Wielaard d3173f
+         delta += 3+alen;
Mark Wielaard d3173f
+         goto decode_success;
Mark Wielaard d3173f
+      }
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
Mark Wielaard d3173f
+   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
Mark Wielaard d3173f
+      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
Mark Wielaard d3173f
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
Mark Wielaard d3173f
+      vassert(sz == 4);
Mark Wielaard d3173f
+      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+   /* 0F 56 = ORPS -- G = G or E */
Mark Wielaard d3173f
+   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
Mark Wielaard d3173f
+      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
Mark Wielaard d3173f
+      goto decode_success;
Mark Wielaard d3173f
+   }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
    /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
Mark Wielaard d3173f
    if (insn[0] == 0x0F && insn[1] == 0x53) {
Mark Wielaard d3173f
       vassert(sz == 4);
Mark Wielaard d3173f
@@ -9205,18 +9238,6 @@ DisResult disInstr_X86_WRK (
Mark Wielaard d3173f
       goto decode_success;
Mark Wielaard d3173f
    }
Mark Wielaard d3173f
 
Mark Wielaard d3173f
-   /* 0F AE /7 = SFENCE -- flush pending operations to memory */
Mark Wielaard d3173f
-   if (insn[0] == 0x0F && insn[1] == 0xAE
Mark Wielaard d3173f
-       && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
Mark Wielaard d3173f
-      vassert(sz == 4);
Mark Wielaard d3173f
-      delta += 3;
Mark Wielaard d3173f
-      /* Insert a memory fence.  It's sometimes important that these
Mark Wielaard d3173f
-         are carried through to the generated code. */
Mark Wielaard d3173f
-      stmt( IRStmt_MBE(Imbe_Fence) );
Mark Wielaard d3173f
-      DIP("sfence\n");
Mark Wielaard d3173f
-      goto decode_success;
Mark Wielaard d3173f
-   }
Mark Wielaard d3173f
-
Mark Wielaard d3173f
    /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
Mark Wielaard d3173f
    if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
Mark Wielaard d3173f
       Int    select;
Mark Wielaard d3173f
@@ -14674,6 +14695,11 @@ DisResult disInstr_X86_WRK (
Mark Wielaard d3173f
             fAddr = &x86g_dirtyhelper_CPUID_sse1; 
Mark Wielaard d3173f
          } 
Mark Wielaard d3173f
          else
Mark Wielaard d3173f
+         if (archinfo->hwcaps & VEX_HWCAPS_X86_MMXEXT) {
Mark Wielaard d3173f
+            fName = "x86g_dirtyhelper_CPUID_mmxext";
Mark Wielaard d3173f
+            fAddr = &x86g_dirtyhelper_CPUID_mmxext;
Mark Wielaard d3173f
+         }
Mark Wielaard d3173f
+         else
Mark Wielaard d3173f
          if (archinfo->hwcaps == 0/*no SSE*/) {
Mark Wielaard d3173f
             fName = "x86g_dirtyhelper_CPUID_sse0";
Mark Wielaard d3173f
             fAddr = &x86g_dirtyhelper_CPUID_sse0; 
Mark Wielaard d3173f
diff --git a/VEX/priv/host_x86_defs.c b/VEX/priv/host_x86_defs.c
Mark Wielaard d3173f
index 21a05a9..693eaa2 100644
Mark Wielaard d3173f
--- a/VEX/priv/host_x86_defs.c
Mark Wielaard d3173f
+++ b/VEX/priv/host_x86_defs.c
Mark Wielaard d3173f
@@ -727,7 +727,8 @@ X86Instr* X86Instr_MFence ( UInt hwcaps ) {
Mark Wielaard d3173f
    X86Instr* i          = LibVEX_Alloc(sizeof(X86Instr));
Mark Wielaard d3173f
    i->tag               = Xin_MFence;
Mark Wielaard d3173f
    i->Xin.MFence.hwcaps = hwcaps;
Mark Wielaard d3173f
-   vassert(0 == (hwcaps & ~(VEX_HWCAPS_X86_SSE1
Mark Wielaard d3173f
+   vassert(0 == (hwcaps & ~(VEX_HWCAPS_X86_MMXEXT
Mark Wielaard d3173f
+                            |VEX_HWCAPS_X86_SSE1
Mark Wielaard d3173f
                             |VEX_HWCAPS_X86_SSE2
Mark Wielaard d3173f
                             |VEX_HWCAPS_X86_SSE3
Mark Wielaard d3173f
                             |VEX_HWCAPS_X86_LZCNT)));
Mark Wielaard d3173f
@@ -2695,7 +2696,7 @@ Int emit_X86Instr ( /*MB_MOD*/Bool* is_profInc,
Mark Wielaard d3173f
          *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
Mark Wielaard d3173f
          goto done;
Mark Wielaard d3173f
       }
Mark Wielaard d3173f
-      if (i->Xin.MFence.hwcaps & VEX_HWCAPS_X86_SSE1) {
Mark Wielaard d3173f
+      if (i->Xin.MFence.hwcaps & VEX_HWCAPS_X86_MMXEXT) {
Mark Wielaard d3173f
          /* sfence */
Mark Wielaard d3173f
          *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF8;
Mark Wielaard d3173f
          /* lock addl $0,0(%esp) */
Mark Wielaard d3173f
diff --git a/VEX/priv/host_x86_defs.h b/VEX/priv/host_x86_defs.h
Mark Wielaard d3173f
index f810ab4..e03becf 100644
Mark Wielaard d3173f
--- a/VEX/priv/host_x86_defs.h
Mark Wielaard d3173f
+++ b/VEX/priv/host_x86_defs.h
Mark Wielaard d3173f
@@ -360,7 +360,7 @@ typedef
Mark Wielaard d3173f
       Xin_Store,     /* store 16/8 bit value in memory */
Mark Wielaard d3173f
       Xin_Set32,     /* convert condition code to 32-bit value */
Mark Wielaard d3173f
       Xin_Bsfr32,    /* 32-bit bsf/bsr */
Mark Wielaard d3173f
-      Xin_MFence,    /* mem fence (not just sse2, but sse0 and 1 too) */
Mark Wielaard d3173f
+      Xin_MFence,    /* mem fence (not just sse2, but sse0 and 1/mmxext too) */
Mark Wielaard d3173f
       Xin_ACAS,      /* 8/16/32-bit lock;cmpxchg */
Mark Wielaard d3173f
       Xin_DACAS,     /* lock;cmpxchg8b (doubleword ACAS, 2 x 32-bit only) */
Mark Wielaard d3173f
 
Mark Wielaard d3173f
@@ -508,13 +508,13 @@ typedef
Mark Wielaard d3173f
             HReg src;
Mark Wielaard d3173f
             HReg dst;
Mark Wielaard d3173f
          } Bsfr32;
Mark Wielaard d3173f
-         /* Mem fence (not just sse2, but sse0 and 1 too).  In short,
Mark Wielaard d3173f
-            an insn which flushes all preceding loads and stores as
Mark Wielaard d3173f
-            much as possible before continuing.  On SSE2 we emit a
Mark Wielaard d3173f
-            real "mfence", on SSE1 "sfence ; lock addl $0,0(%esp)" and
Mark Wielaard d3173f
-            on SSE0 "lock addl $0,0(%esp)".  This insn therefore
Mark Wielaard d3173f
-            carries the host's hwcaps so the assembler knows what to
Mark Wielaard d3173f
-            emit. */
Mark Wielaard d3173f
+         /* Mem fence (not just sse2, but sse0 and sse1/mmxext too).
Mark Wielaard d3173f
+            In short, an insn which flushes all preceding loads and
Mark Wielaard d3173f
+            stores as much as possible before continuing.  On SSE2
Mark Wielaard d3173f
+            we emit a real "mfence", on SSE1 or the MMXEXT subset
Mark Wielaard d3173f
+            "sfence ; lock addl $0,0(%esp)" and on SSE0
Mark Wielaard d3173f
+            "lock addl $0,0(%esp)".  This insn therefore carries the
Mark Wielaard d3173f
+            host's hwcaps so the assembler knows what to emit. */
Mark Wielaard d3173f
          struct {
Mark Wielaard d3173f
             UInt hwcaps;
Mark Wielaard d3173f
          } MFence;
Mark Wielaard d3173f
diff --git a/VEX/priv/host_x86_isel.c b/VEX/priv/host_x86_isel.c
Mark Wielaard d3173f
index 086aefc..90bc563 100644
Mark Wielaard d3173f
--- a/VEX/priv/host_x86_isel.c
Mark Wielaard d3173f
+++ b/VEX/priv/host_x86_isel.c
Mark Wielaard d3173f
@@ -3251,7 +3251,8 @@ static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
Mark Wielaard d3173f
 {
Mark Wielaard d3173f
 
Mark Wielaard d3173f
 #  define REQUIRE_SSE1                                    \
Mark Wielaard d3173f
-      do { if (env->hwcaps == 0/*baseline, no sse*/)      \
Mark Wielaard d3173f
+      do { if (env->hwcaps == 0/*baseline, no sse*/       \
Mark Wielaard d3173f
+               ||  env->hwcaps == VEX_HWCAPS_X86_MMXEXT /*Integer SSE*/) \
Mark Wielaard d3173f
               goto vec_fail;                              \
Mark Wielaard d3173f
       } while (0)
Mark Wielaard d3173f
 
Mark Wielaard d3173f
@@ -4388,7 +4389,8 @@ HInstrArray* iselSB_X86 ( IRSB* bb,
Mark Wielaard d3173f
    /* sanity ... */
Mark Wielaard d3173f
    vassert(arch_host == VexArchX86);
Mark Wielaard d3173f
    vassert(0 == (hwcaps_host
Mark Wielaard d3173f
-                 & ~(VEX_HWCAPS_X86_SSE1
Mark Wielaard d3173f
+                 & ~(VEX_HWCAPS_X86_MMXEXT
Mark Wielaard d3173f
+                     | VEX_HWCAPS_X86_SSE1
Mark Wielaard d3173f
                      | VEX_HWCAPS_X86_SSE2
Mark Wielaard d3173f
                      | VEX_HWCAPS_X86_SSE3
Mark Wielaard d3173f
                      | VEX_HWCAPS_X86_LZCNT)));
Mark Wielaard d3173f
diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c
Mark Wielaard d3173f
index e425950..5bb762f 100644
Mark Wielaard d3173f
--- a/VEX/priv/main_main.c
Mark Wielaard d3173f
+++ b/VEX/priv/main_main.c
Mark Wielaard d3173f
@@ -1086,23 +1086,25 @@
Mark Wielaard d3173f
 
Mark Wielaard d3173f
 static HChar* show_hwcaps_x86 ( UInt hwcaps ) 
Mark Wielaard d3173f
 {
Mark Wielaard d3173f
-   /* Monotonic, SSE3 > SSE2 > SSE1 > baseline. */
Mark Wielaard d3173f
+   /* Monotonic, LZCNT > SSE3 > SSE2 > SSE1 > MMXEXT > baseline. */
Mark Wielaard d3173f
    switch (hwcaps) {
Mark Wielaard d3173f
       case 0:
Mark Wielaard d3173f
          return "x86-sse0";
Mark Wielaard d3173f
-      case VEX_HWCAPS_X86_SSE1:
Mark Wielaard d3173f
-         return "x86-sse1";
Mark Wielaard d3173f
-      case VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2:
Mark Wielaard d3173f
-         return "x86-sse1-sse2";
Mark Wielaard d3173f
-      case VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2
Mark Wielaard d3173f
+      case VEX_HWCAPS_X86_MMXEXT:
Mark Wielaard d3173f
+         return "x86-mmxext";
Mark Wielaard d3173f
+      case VEX_HWCAPS_X86_MMXEXT | VEX_HWCAPS_X86_SSE1:
Mark Wielaard d3173f
+         return "x86-mmxext-sse1";
Mark Wielaard d3173f
+      case VEX_HWCAPS_X86_MMXEXT | VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2:
Mark Wielaard d3173f
+         return "x86-mmxext-sse1-sse2";
Mark Wielaard d3173f
+      case VEX_HWCAPS_X86_MMXEXT | VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2
Mark Wielaard d3173f
            | VEX_HWCAPS_X86_LZCNT:
Mark Wielaard d3173f
-         return "x86-sse1-sse2-lzcnt";
Mark Wielaard d3173f
-      case VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2
Mark Wielaard d3173f
+         return "x86-mmxext-sse1-sse2-lzcnt";
Mark Wielaard d3173f
+      case VEX_HWCAPS_X86_MMXEXT | VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2
Mark Wielaard d3173f
            | VEX_HWCAPS_X86_SSE3:
Mark Wielaard d3173f
-         return "x86-sse1-sse2-sse3";
Mark Wielaard d3173f
-      case VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2
Mark Wielaard d3173f
+         return "x86-mmxext-sse1-sse2-sse3";
Mark Wielaard d3173f
+      case VEX_HWCAPS_X86_MMXEXT | VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2
Mark Wielaard d3173f
            | VEX_HWCAPS_X86_SSE3 | VEX_HWCAPS_X86_LZCNT:
Mark Wielaard d3173f
-         return "x86-sse1-sse2-sse3-lzcnt";
Mark Wielaard d3173f
+         return "x86-mmxext-sse1-sse2-sse3-lzcnt";
Mark Wielaard d3173f
       default:
Mark Wielaard d3173f
          return NULL;
Mark Wielaard d3173f
    }
Mark Wielaard d3173f
diff --git a/VEX/pub/libvex.h b/VEX/pub/libvex.h
Mark Wielaard d3173f
index 4b36727..c8b5892 100644
Mark Wielaard d3173f
--- a/VEX/pub/libvex.h
Mark Wielaard d3173f
+++ b/VEX/pub/libvex.h
Mark Wielaard d3173f
@@ -71,11 +71,12 @@ typedef
Mark Wielaard d3173f
    combinations. */
Mark Wielaard d3173f
 
Mark Wielaard d3173f
 /* x86: baseline capability is Pentium-1 (FPU, MMX, but no SSE), with
Mark Wielaard d3173f
-   cmpxchg8b. */
Mark Wielaard d3173f
-#define VEX_HWCAPS_X86_SSE1    (1<<1)  /* SSE1 support (Pentium III) */
Mark Wielaard d3173f
-#define VEX_HWCAPS_X86_SSE2    (1<<2)  /* SSE2 support (Pentium 4) */
Mark Wielaard d3173f
-#define VEX_HWCAPS_X86_SSE3    (1<<3)  /* SSE3 support (>= Prescott) */
Mark Wielaard d3173f
-#define VEX_HWCAPS_X86_LZCNT   (1<<4)  /* SSE4a LZCNT insn */
Mark Wielaard d3173f
+   cmpxchg8b. MMXEXT is a special AMD only subset of SSE1 (Integer SSE). */
Mark Wielaard d3173f
+#define VEX_HWCAPS_X86_MMXEXT  (1<<1)  /* A subset of SSE1 on early AMD */
Mark Wielaard d3173f
+#define VEX_HWCAPS_X86_SSE1    (1<<2)  /* SSE1 support (Pentium III) */
Mark Wielaard d3173f
+#define VEX_HWCAPS_X86_SSE2    (1<<3)  /* SSE2 support (Pentium 4) */
Mark Wielaard d3173f
+#define VEX_HWCAPS_X86_SSE3    (1<<4)  /* SSE3 support (>= Prescott) */
Mark Wielaard d3173f
+#define VEX_HWCAPS_X86_LZCNT   (1<<5)  /* SSE4a LZCNT insn */
Mark Wielaard d3173f
 
Mark Wielaard d3173f
 /* amd64: baseline capability is SSE2, with cmpxchg8b but not
Mark Wielaard d3173f
    cmpxchg16b. */
Mark Wielaard d3173f
commit 4c6f0638553e69b7f70c17a64a8f60114d6f6230
Mark Wielaard d3173f
Author: mjw <mjw@a5019735-40e9-0310-863c-91ae7b9d1cf9>
Mark Wielaard d3173f
Date:   Tue Aug 27 10:23:23 2013 +0000
Mark Wielaard d3173f
Mark Wielaard d3173f
    Support mmxext (integer sse) subset on i386 (athlon). Bug #323713
Mark Wielaard d3173f
    
Mark Wielaard d3173f
    Some processors like the AMD Athlon "Classic" support mmxext,
Mark Wielaard d3173f
    a sse1 subset. This subset is not properly detected by VEX.
Mark Wielaard d3173f
    The subset uses the same encoding as the sse1 instructions.
Mark Wielaard d3173f
    
Mark Wielaard d3173f
    The subset is described at:
Mark Wielaard d3173f
      http://support.amd.com/us/Embedded_TechDocs/22466.pdf
Mark Wielaard d3173f
      https://en.wikipedia.org/wiki/3DNow!#3DNow.21_extensions
Mark Wielaard d3173f
    
Mark Wielaard d3173f
    Detects mmxext subset from cpuid information (and enables it
Mark Wielaard d3173f
    when full sse1 is found). Also fixes the prereq of
Mark Wielaard d3173f
    none/tests/x86/insn_mmxext.vgtest so that it also runs when
Mark Wielaard d3173f
    full sse1 (and not just the mmxext subset) is found.
Mark Wielaard d3173f
    It already passed on such configurations. With the VEX patch
Mark Wielaard d3173f
    (r2745) it also passes with just the mmxext subset.
Mark Wielaard d3173f
    
Mark Wielaard d3173f
    git-svn-id: svn://svn.valgrind.org/valgrind/trunk@13515 a5019735-40e9-0310-863c-91ae7b9d1cf9
Mark Wielaard d3173f
Mark Wielaard d3173f
diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c
Mark Wielaard d3173f
index 353c05b..2fd5f07 100644
Mark Wielaard d3173f
--- a/coregrind/m_machine.c
Mark Wielaard d3173f
+++ b/coregrind/m_machine.c
Mark Wielaard d3173f
@@ -685,7 +685,7 @@
Mark Wielaard d3173f
    LibVEX_default_VexArchInfo(&vai);
Mark Wielaard d3173f
 
Mark Wielaard d3173f
 #if defined(VGA_x86)
Mark Wielaard d3173f
-   { Bool have_sse1, have_sse2, have_cx8, have_lzcnt;
Mark Wielaard d3173f
+   { Bool have_sse1, have_sse2, have_cx8, have_lzcnt, have_mmxext;
Mark Wielaard d3173f
      UInt eax, ebx, ecx, edx, max_extended;
Mark Wielaard d3173f
      UChar vstr[13];
Mark Wielaard d3173f
      vstr[0] = 0;
Mark Wielaard d3173f
@@ -722,17 +722,27 @@
Mark Wielaard d3173f
      if (!have_cx8)
Mark Wielaard d3173f
         return False;
Mark Wielaard d3173f
 
Mark Wielaard d3173f
-     /* Figure out if this is an AMD that can do LZCNT. */
Mark Wielaard d3173f
+     /* Figure out if this is an AMD that can do mmxext and/or LZCNT. */
Mark Wielaard d3173f
+     have_mmxext = False;
Mark Wielaard d3173f
      have_lzcnt = False;
Mark Wielaard d3173f
      if (0 == VG_(strcmp)(vstr, "AuthenticAMD")
Mark Wielaard d3173f
          && max_extended >= 0x80000001) {
Mark Wielaard d3173f
         VG_(cpuid)(0x80000001, 0, &eax, &ebx, &ecx, &edx);
Mark Wielaard d3173f
         have_lzcnt = (ecx & (1<<5)) != 0; /* True => have LZCNT */
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+        /* Some older AMD processors support a sse1 subset (Integer SSE). */
Mark Wielaard d3173f
+        have_mmxext = !have_sse1 && ((edx & (1<<22)) != 0);
Mark Wielaard d3173f
      }
Mark Wielaard d3173f
 
Mark Wielaard d3173f
-     if (have_sse2 && have_sse1) {
Mark Wielaard d3173f
+     /* Intel processors don't define the mmxext extension, but since it
Mark Wielaard d3173f
+        is just a sse1 subset always define it when we have sse1. */
Mark Wielaard d3173f
+     if (have_sse1)
Mark Wielaard d3173f
+        have_mmxext = True;
Mark Wielaard d3173f
+
Mark Wielaard d3173f
+     if (have_sse2 && have_sse1 && have_mmxext) {
Mark Wielaard d3173f
         va          = VexArchX86;
Mark Wielaard d3173f
-        vai.hwcaps  = VEX_HWCAPS_X86_SSE1;
Mark Wielaard d3173f
+        vai.hwcaps  = VEX_HWCAPS_X86_MMXEXT;
Mark Wielaard d3173f
+        vai.hwcaps |= VEX_HWCAPS_X86_SSE1;
Mark Wielaard d3173f
         vai.hwcaps |= VEX_HWCAPS_X86_SSE2;
Mark Wielaard d3173f
         if (have_lzcnt)
Mark Wielaard d3173f
            vai.hwcaps |= VEX_HWCAPS_X86_LZCNT;
Mark Wielaard d3173f
@@ -740,13 +750,21 @@
Mark Wielaard d3173f
         return True;
Mark Wielaard d3173f
      }
Mark Wielaard d3173f
 
Mark Wielaard d3173f
-     if (have_sse1) {
Mark Wielaard d3173f
+     if (have_sse1 && have_mmxext) {
Mark Wielaard d3173f
         va          = VexArchX86;
Mark Wielaard d3173f
-        vai.hwcaps  = VEX_HWCAPS_X86_SSE1;
Mark Wielaard d3173f
+        vai.hwcaps  = VEX_HWCAPS_X86_MMXEXT;
Mark Wielaard d3173f
+        vai.hwcaps |= VEX_HWCAPS_X86_SSE1;
Mark Wielaard d3173f
         VG_(machine_x86_have_mxcsr) = 1;
Mark Wielaard d3173f
         return True;
Mark Wielaard d3173f
      }
Mark Wielaard d3173f
 
Mark Wielaard d3173f
+     if (have_mmxext) {
Mark Wielaard d3173f
+        va          = VexArchX86;
Mark Wielaard d3173f
+        vai.hwcaps  = VEX_HWCAPS_X86_MMXEXT;
Mark Wielaard d3173f
+        VG_(machine_x86_have_mxcsr) = 0;
Mark Wielaard d3173f
+        return True;
Mark Wielaard d3173f
+     }
Mark Wielaard d3173f
+
Mark Wielaard d3173f
      va         = VexArchX86;
Mark Wielaard d3173f
      vai.hwcaps = 0; /*baseline - no sse at all*/
Mark Wielaard d3173f
      VG_(machine_x86_have_mxcsr) = 0;
Mark Wielaard d3173f
diff --git a/none/tests/x86/insn_mmxext.vgtest b/none/tests/x86/insn_mmxext.vgtest
Mark Wielaard d3173f
index ad48b6e..e3627d6 100644
Mark Wielaard d3173f
--- a/none/tests/x86/insn_mmxext.vgtest
Mark Wielaard d3173f
+++ b/none/tests/x86/insn_mmxext.vgtest
Mark Wielaard d3173f
@@ -1,3 +1,4 @@
Mark Wielaard d3173f
 prog: ../../../none/tests/x86/insn_mmxext
Mark Wielaard d3173f
-prereq: ../../../tests/x86_amd64_features x86-mmxext
Mark Wielaard d3173f
+# mmxext is an old AMD subset of sse1, so either will do.
Mark Wielaard d3173f
+prereq: ../../../tests/x86_amd64_features x86-mmxext || ../../../tests/x86_amd64_features x86-sse
Mark Wielaard d3173f
 vgopts: -q