diff --git a/valgrind-3.12.0-clone-spawn.patch b/valgrind-3.12.0-clone-spawn.patch
new file mode 100644
index 0000000..f30e7aa
--- /dev/null
+++ b/valgrind-3.12.0-clone-spawn.patch
@@ -0,0 +1,3325 @@
+commit e31d72da6cb415d0856ad53dac78e307548cd831
+Author: philippe <philippe@a5019735-40e9-0310-863c-91ae7b9d1cf9>
+Date:   Sun Dec 11 21:39:23 2016 +0000
+
+    Fix 342040 Valgrind mishandles clone with CLONE_VFORK | CLONE_VM that clones to a different stack
+    Fix 373192 Calling posix_spawn in glibc 2.24 completely broken
+    
+    Functionally, this patch just does the following 2 changes to the
+    fork clone handling:
+    * It no longer masks CLONE_VFORK:
+      The only effect of this flag is to suspend the parent, waiting for
+      the child to either exit or execve.
+      If some applications depend on this synchronisation, it is better to
+      keep it, as it does no harm to suspend the parent valgrind while
+      waiting for the child valgrind to exit or execve.
+    * In case the guest calls the clone syscall providing a non zero client stack,
+      set the child guest SP after the syscall, before executing guest instructions.
+      Not setting the guest stack ptr was the source of the problem reported
+      in the bugs.
+    
+    This also adds a test case  none/tests/linux/clonev.
+    Before this patch, test gives a SEGV, which is fixed by the patch.
+    
+    The patch is however a lot bigger: this fix touched some (mostly
+    identical/duplicated) code in all the linux platforms.
+    So, the clone/fork code has been factorised as much as possible.
+    This removes about 1700 lines of code.
+    
+    This has been tested on:
+    * amd64
+    * x86
+    * ppc64 be and le
+    * ppc32
+    * arm64
+    
+    This has been compiled but *not really tested* on:
+    * mips64 (not too clear how to properly build and run valgrind on gcc22)
+    
+    It has *not* been compiled and *not* tested on:
+    * arm
+    * mips32
+    * tilegx
+    * darwin   (normally, no impact)
+    * solaris  (normally, no impact)
+    
+    The changes are relatively mechanical, so it is not impossible that
+    it will compile and work out of the box on these platforms.
+    Otherwise, questions welcome.
+    
+    A few points of interest:
+    * Some platforms did have a typedef void vki_modify_ldt_t,
+      and some platforms had no definition for this type at all.
+      To make it easier to factorise, for such platforms, the following has
+      been used:
+         typedef char vki_modify_ldt_t;
+      When sizeof(vki_modify_ldt_t) is > 1, the syscall arg is checked.
+      This is somewhat of a hack, but it simplified the factorisation.
+    
+    * For mips32/mips64 and tilegx, there is a strange unconditional assignment
+      of 0 to a register (guest_r2 on mips, guest_r0 on tilegx).
+      It is unclear what this is for, in particular because it is assigned
+      regardless of the result of the syscall (success or not).
+    
+    
+    
+    
+    git-svn-id: svn://svn.valgrind.org/valgrind/trunk@16186 a5019735-40e9-0310-863c-91ae7b9d1cf9
+
+diff --git a/coregrind/m_syswrap/priv_syswrap-linux.h b/coregrind/m_syswrap/priv_syswrap-linux.h
+index 38fcd7b..06ea7cd 100644
+--- a/coregrind/m_syswrap/priv_syswrap-linux.h
++++ b/coregrind/m_syswrap/priv_syswrap-linux.h
+@@ -39,12 +39,10 @@ extern Word ML_(start_thread_NORETURN) ( void* arg );
+ extern Addr ML_(allocstack)            ( ThreadId tid );
+ extern void ML_(call_on_new_stack_0_1) ( Addr stack, Addr retaddr,
+ 			                 void (*f)(Word), Word arg1 );
+-extern SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
+-                                   Int* parent_tidptr, Int* child_tidptr );
+-
+ 
+ // Linux-specific (but non-arch-specific) syscalls
+ 
++DECL_TEMPLATE(linux, sys_clone)
+ DECL_TEMPLATE(linux, sys_mount);
+ DECL_TEMPLATE(linux, sys_oldumount);
+ DECL_TEMPLATE(linux, sys_umount);
+@@ -61,6 +59,10 @@ DECL_TEMPLATE(linux, sys_vmsplice);
+ DECL_TEMPLATE(linux, sys_readahead);
+ DECL_TEMPLATE(linux, sys_move_pages);
+ 
++// clone is similar enough between linux variants to have a generic
++// version, but which will call an extern defined in syswrap-<platform>-linux.c
++DECL_TEMPLATE(linux, sys_clone);
++
+ // POSIX, but various sub-cases differ between Linux and Darwin.
+ DECL_TEMPLATE(linux, sys_fcntl);
+ DECL_TEMPLATE(linux, sys_fcntl64);
+@@ -368,7 +370,83 @@ DECL_TEMPLATE(linux, sys_getpeername);
+ DECL_TEMPLATE(linux, sys_socketpair);
+ DECL_TEMPLATE(linux, sys_kcmp);
+ 
+-#endif   // __PRIV_SYSWRAP_LINUX_H
++// Some arch specific functions called from syswrap-linux.c
++extern Int do_syscall_clone_x86_linux ( Word (*fn)(void *), 
++                                        void* stack, 
++                                        Int   flags, 
++                                        void* arg,
++                                        Int*  child_tid, 
++                                        Int*  parent_tid, 
++                                        void* tls_ptr);
++extern SysRes ML_(x86_sys_set_thread_area) ( ThreadId tid,
++                                             vki_modify_ldt_t* info );
++extern void ML_(x86_setup_LDT_GDT) ( /*OUT*/ ThreadArchState *child, 
++                                     /*IN*/  ThreadArchState *parent );
++
++extern Long do_syscall_clone_amd64_linux ( Word (*fn)(void *), 
++                                           void* stack, 
++                                           Long  flags, 
++                                           void* arg,
++                                           Int* child_tid, 
++                                           Int* parent_tid, 
++                                           void* tls_ptr);
++extern ULong do_syscall_clone_ppc32_linux ( Word (*fn)(void *), 
++                                            void* stack, 
++                                            Int   flags, 
++                                            void* arg,
++                                            Int*  child_tid, 
++                                            Int*  parent_tid, 
++                                            void* tls_ptr);
++extern ULong do_syscall_clone_ppc64_linux ( Word (*fn)(void *), 
++                                            void* stack, 
++                                            Int   flags, 
++                                            void* arg,
++                                            Int*  child_tid, 
++                                            Int*  parent_tid, 
++                                            void* tls_ptr );
++extern ULong do_syscall_clone_s390x_linux ( void  *stack,
++                                            ULong flags,
++                                            Int   *parent_tid,
++                                            Int   *child_tid,
++                                            void*  tls_ptr,
++                                            Word (*fn)(void *),
++                                            void  *arg);
++extern Long do_syscall_clone_arm64_linux ( Word (*fn)(void *), 
++                                           void* stack, 
++                                           Long  flags, 
++                                           void* arg,
++                                           Int*  child_tid,
++                                           Int*  parent_tid,
++                                           void* tls_ptr );
++extern ULong do_syscall_clone_arm_linux   ( Word (*fn)(void *), 
++                                            void* stack, 
++                                            Int   flags, 
++                                            void* arg,
++                                            Int*  child_tid,
++                                            Int*  parent_tid,
++                                            void* tls_ptr );
++extern ULong do_syscall_clone_mips64_linux ( Word (*fn) (void *),  /* a0 - 4 */
++                                             void* stack,          /* a1 - 5 */
++                                             Int   flags,          /* a2 - 6 */
++                                             void* arg,            /* a3 - 7 */
++                                             Int*  parent_tid,     /* a4 - 8 */
++                                             void* tls_ptr,        /* a5 - 9 */
++                                             Int*  child_tid );    /* a6 - 10 */
++extern UInt do_syscall_clone_mips_linux ( Word (*fn) (void *), //a0     0    32
++                                          void* stack,         //a1     4    36
++                                          Int   flags,         //a2     8    40
++                                          void* arg,           //a3     12   44
++                                          Int*  child_tid,     //stack  16   48
++                                          Int*  parent_tid,    //stack  20   52
++                                          void* tls_ptr);      //stack  24   56
++extern Long do_syscall_clone_tilegx_linux ( Word (*fn) (void *),  //r0
++                                            void* stack,          //r1
++                                            Long  flags,          //r2
++                                            void* arg,            //r3
++                                            Long* child_tid,      //r4
++                                            Long* parent_tid,     //r5
++                                            void* tls_ptr );      //r6
++ #endif   // __PRIV_SYSWRAP_LINUX_H
+ 
+ /*--------------------------------------------------------------------*/
+ /*--- end                                                          ---*/
+diff --git a/coregrind/m_syswrap/syswrap-amd64-linux.c b/coregrind/m_syswrap/syswrap-amd64-linux.c
+index 08e9a93..3fe9938 100644
+--- a/coregrind/m_syswrap/syswrap-amd64-linux.c
++++ b/coregrind/m_syswrap/syswrap-amd64-linux.c
+@@ -130,14 +130,7 @@ asm(
+ #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
+ #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
+ 
+-extern
+-Long do_syscall_clone_amd64_linux ( Word (*fn)(void *), 
+-                                    void* stack, 
+-                                    Long  flags, 
+-                                    void* arg,
+-                                    Long* child_tid, 
+-                                    Long* parent_tid, 
+-                                    vki_modify_ldt_t * );
++// See priv_syswrap-linux.h for arg profile.
+ asm(
+ ".text\n"
+ ".globl do_syscall_clone_amd64_linux\n"
+@@ -183,126 +176,6 @@ asm(
+ #undef __NR_EXIT
+ 
+ 
+-// forward declaration
+-static void setup_child ( ThreadArchState*, ThreadArchState* );
+-
+-/* 
+-   When a client clones, we need to keep track of the new thread.  This means:
+-   1. allocate a ThreadId+ThreadState+stack for the thread
+-
+-   2. initialize the thread's new VCPU state
+-
+-   3. create the thread using the same args as the client requested,
+-   but using the scheduler entrypoint for EIP, and a separate stack
+-   for ESP.
+- */
+-static SysRes do_clone ( ThreadId ptid, 
+-                         ULong flags, Addr rsp, 
+-                         Long* parent_tidptr, 
+-                         Long* child_tidptr, 
+-                         Addr tlsaddr )
+-{
+-   static const Bool debug = False;
+-
+-   ThreadId     ctid = VG_(alloc_ThreadState)();
+-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
+-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
+-   UWord*       stack;
+-   SysRes       res;
+-   Long         rax;
+-   vki_sigset_t blockall, savedmask;
+-
+-   VG_(sigfillset)(&blockall);
+-
+-   vg_assert(VG_(is_running_thread)(ptid));
+-   vg_assert(VG_(is_valid_tid)(ctid));
+-
+-   stack = (UWord*)ML_(allocstack)(ctid);
+-   if (stack == NULL) {
+-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
+-      goto out;
+-   }
+-
+-   /* Copy register state
+-
+-      Both parent and child return to the same place, and the code
+-      following the clone syscall works out which is which, so we
+-      don't need to worry about it.
+-
+-      The parent gets the child's new tid returned from clone, but the
+-      child gets 0.
+-
+-      If the clone call specifies a NULL rsp for the new thread, then
+-      it actually gets a copy of the parent's rsp.
+-   */
+-   setup_child( &ctst->arch, &ptst->arch );
+-
+-   /* Make sys_clone appear to have returned Success(0) in the
+-      child. */
+-   ctst->arch.vex.guest_RAX = 0;
+-
+-   if (rsp != 0)
+-      ctst->arch.vex.guest_RSP = rsp;
+-
+-   ctst->os_state.parent = ptid;
+-
+-   /* inherit signal mask */
+-   ctst->sig_mask = ptst->sig_mask;
+-   ctst->tmp_sig_mask = ptst->sig_mask;
+-
+-   /* Start the child with its threadgroup being the same as the
+-      parent's.  This is so that any exit_group calls that happen
+-      after the child is created but before it sets its
+-      os_state.threadgroup field for real (in thread_wrapper in
+-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
+-      a race condition in which the thread is unkillable (via
+-      exit_group) because its threadgroup is not set.  The race window
+-      is probably only a few hundred or a few thousand cycles long.
+-      See #226116. */
+-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
+-
+-   ML_(guess_and_register_stack) (rsp, ctst);
+-
+-   /* Assume the clone will succeed, and tell any tool that wants to
+-      know that this thread has come into existence.  If the clone
+-      fails, we'll send out a ll_exit notification for it at the out:
+-      label below, to clean up. */
+-   vg_assert(VG_(owns_BigLock_LL)(ptid));
+-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
+-
+-   if (flags & VKI_CLONE_SETTLS) {
+-      if (debug)
+-	 VG_(printf)("clone child has SETTLS: tls at %#lx\n", tlsaddr);
+-      ctst->arch.vex.guest_FS_CONST = tlsaddr;
+-   }
+-
+-   flags &= ~VKI_CLONE_SETTLS;
+-
+-   /* start the thread with everything blocked */
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
+-
+-   /* Create the new thread */
+-   rax = do_syscall_clone_amd64_linux(
+-            ML_(start_thread_NORETURN), stack, flags, &VG_(threads)[ctid],
+-            child_tidptr, parent_tidptr, NULL
+-         );
+-   res = VG_(mk_SysRes_amd64_linux)( rax );
+-
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
+-
+-  out:
+-   if (sr_isError(res)) {
+-      /* clone failed */
+-      VG_(cleanup_thread)(&ctst->arch);
+-      ctst->status = VgTs_Empty;
+-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
+-      VG_TRACK( pre_thread_ll_exit, ctid );
+-   }
+-
+-   return res;
+-}
+-
+-
+ /* ---------------------------------------------------------------------
+    More thread stuff
+    ------------------------------------------------------------------ */
+@@ -311,16 +184,6 @@ void VG_(cleanup_thread) ( ThreadArchState *arch )
+ {  
+ }  
+ 
+-void setup_child ( /*OUT*/ ThreadArchState *child, 
+-                   /*IN*/  ThreadArchState *parent )
+-{  
+-   /* We inherit our parent's guest state. */
+-   child->vex = parent->vex;
+-   child->vex_shadow1 = parent->vex_shadow1;
+-   child->vex_shadow2 = parent->vex_shadow2;
+-}  
+-
+-
+ /* ---------------------------------------------------------------------
+    PRE/POST wrappers for AMD64/Linux-specific syscalls
+    ------------------------------------------------------------------ */
+@@ -333,7 +196,6 @@ void setup_child ( /*OUT*/ ThreadArchState *child,
+    the right thing to do is to make these wrappers 'static' since they
+    aren't visible outside this file, but that requires even more macro
+    magic. */
+-DECL_TEMPLATE(amd64_linux, sys_clone);
+ DECL_TEMPLATE(amd64_linux, sys_rt_sigreturn);
+ DECL_TEMPLATE(amd64_linux, sys_arch_prctl);
+ DECL_TEMPLATE(amd64_linux, sys_ptrace);
+@@ -342,108 +204,6 @@ DECL_TEMPLATE(amd64_linux, sys_mmap);
+ DECL_TEMPLATE(amd64_linux, sys_syscall184);
+ 
+ 
+-PRE(sys_clone)
+-{
+-   ULong cloneflags;
+-
+-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
+-   PRE_REG_READ2(int, "clone",
+-                 unsigned long, flags,
+-                 void *, child_stack);
+-
+-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
+-      if (VG_(tdict).track_pre_reg_read) {
+-         PRA3("clone", int *, parent_tidptr);
+-      }
+-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-   if (ARG1 & VKI_CLONE_SETTLS) {
+-      if (VG_(tdict).track_pre_reg_read) {
+-         PRA4("clone", vki_modify_ldt_t *, tlsinfo);
+-      }
+-      PRE_MEM_READ("clone(tlsinfo)", ARG4, sizeof(vki_modify_ldt_t));
+-      if (!VG_(am_is_valid_for_client)(ARG4, sizeof(vki_modify_ldt_t), 
+-                                             VKI_PROT_READ)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
+-      if (VG_(tdict).track_pre_reg_read) {
+-         PRA5("clone", int *, child_tidptr);
+-      }
+-      PRE_MEM_WRITE("clone(child_tidptr)", ARG4, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG4, sizeof(Int), VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-
+-   cloneflags = ARG1;
+-
+-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
+-      SET_STATUS_Failure( VKI_EINVAL );
+-      return;
+-   }
+-
+-   /* Only look at the flags we really care about */
+-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
+-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
+-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
+-      /* thread creation */
+-      SET_STATUS_from_SysRes(
+-         do_clone(tid,
+-                  ARG1,          /* flags */
+-                  (Addr)ARG2,    /* child ESP */
+-                  (Long *)ARG3,  /* parent_tidptr */
+-                  (Long *)ARG4,  /* child_tidptr */
+-                  (Addr)ARG5));  /* set_tls */
+-      break;
+-
+-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
+-      /* FALLTHROUGH - assume vfork == fork */
+-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
+-
+-   case 0: /* plain fork */
+-      SET_STATUS_from_SysRes(
+-         ML_(do_fork_clone)(tid,
+-                       cloneflags,      /* flags */
+-                       (Int *)ARG3,     /* parent_tidptr */
+-                       (Int *)ARG4));   /* child_tidptr */
+-      break;
+-
+-   default:
+-      /* should we just ENOSYS? */
+-      VG_(message)(Vg_UserMsg,
+-                   "Unsupported clone() flags: 0x%lx\n", ARG1);
+-      VG_(message)(Vg_UserMsg,
+-                   "\n");
+-      VG_(message)(Vg_UserMsg,
+-                   "The only supported clone() uses are:\n");
+-      VG_(message)(Vg_UserMsg,
+-                   " - via a threads library (LinuxThreads or NPTL)\n");
+-      VG_(message)(Vg_UserMsg,
+-                   " - via the implementation of fork or vfork\n");
+-      VG_(unimplemented)
+-         ("Valgrind does not support general clone().");
+-   }
+-
+-   if (SUCCESS) {
+-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
+-         POST_MEM_WRITE(ARG3, sizeof(Int));
+-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
+-         POST_MEM_WRITE(ARG4, sizeof(Int));
+-
+-      /* Thread creation was successful; let the child have the chance
+-         to run */
+-      *flags |= SfYieldAfter;
+-   }
+-}
+-
+ PRE(sys_rt_sigreturn)
+ {
+    /* This isn't really a syscall at all - it's a misuse of the
+@@ -761,7 +521,7 @@ static SyscallTableEntry syscall_table[] = {
+    LINX_(__NR_setsockopt,        sys_setsockopt),     // 54
+ 
+    LINXY(__NR_getsockopt,        sys_getsockopt),     // 55 
+-   PLAX_(__NR_clone,             sys_clone),          // 56 
++   LINX_(__NR_clone,             sys_clone),          // 56 
+    GENX_(__NR_fork,              sys_fork),           // 57 
+    GENX_(__NR_vfork,             sys_fork),           // 58 treat as fork
+    GENX_(__NR_execve,            sys_execve),         // 59 
+diff --git a/coregrind/m_syswrap/syswrap-arm-linux.c b/coregrind/m_syswrap/syswrap-arm-linux.c
+index 3bbd109..b417428 100644
+--- a/coregrind/m_syswrap/syswrap-arm-linux.c
++++ b/coregrind/m_syswrap/syswrap-arm-linux.c
+@@ -102,14 +102,7 @@ asm(
+ #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
+ #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
+ 
+-extern
+-ULong do_syscall_clone_arm_linux   ( Word (*fn)(void *), 
+-                                     void* stack, 
+-                                     Int   flags, 
+-                                     void* arg,
+-                                     Int*  child_tid,
+-                                     Int*  parent_tid,
+-                                     void* tls );
++// See priv_syswrap-linux.h for arg profile.
+ asm(
+ ".text\n"
+ ".globl do_syscall_clone_arm_linux\n"
+@@ -148,104 +141,8 @@ asm(
+ #undef __NR_EXIT
+ 
+ // forward declarations
+-static void setup_child ( ThreadArchState*, ThreadArchState* );
+-static void assign_guest_tls(ThreadId ctid, Addr tlsptr);
+ static SysRes sys_set_tls ( ThreadId tid, Addr tlsptr );
+             
+-/* 
+-   When a client clones, we need to keep track of the new thread.  This means:
+-   1. allocate a ThreadId+ThreadState+stack for the thread
+-
+-   2. initialize the thread's new VCPU state
+-
+-   3. create the thread using the same args as the client requested,
+-   but using the scheduler entrypoint for IP, and a separate stack
+-   for SP.
+- */
+-static SysRes do_clone ( ThreadId ptid, 
+-                         UInt flags, Addr sp, 
+-                         Int *parent_tidptr, 
+-                         Int *child_tidptr, 
+-                         Addr child_tls)
+-{
+-   ThreadId ctid = VG_(alloc_ThreadState)();
+-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
+-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
+-   UInt r0;
+-   UWord *stack;
+-   SysRes res;
+-   vki_sigset_t blockall, savedmask;
+-
+-   VG_(sigfillset)(&blockall);
+-
+-   vg_assert(VG_(is_running_thread)(ptid));
+-   vg_assert(VG_(is_valid_tid)(ctid));
+-
+-   stack = (UWord*)ML_(allocstack)(ctid);
+-
+-   if(stack == NULL) {
+-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
+-      goto out;
+-   }
+-
+-   setup_child( &ctst->arch, &ptst->arch );
+-
+-   ctst->arch.vex.guest_R0 = 0;
+-   if(sp != 0)
+-      ctst->arch.vex.guest_R13 = sp;
+-
+-   ctst->os_state.parent = ptid;
+-
+-   ctst->sig_mask = ptst->sig_mask;
+-   ctst->tmp_sig_mask = ptst->sig_mask;
+-
+-   /* Start the child with its threadgroup being the same as the
+-      parent's.  This is so that any exit_group calls that happen
+-      after the child is created but before it sets its
+-      os_state.threadgroup field for real (in thread_wrapper in
+-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
+-      a race condition in which the thread is unkillable (via
+-      exit_group) because its threadgroup is not set.  The race window
+-      is probably only a few hundred or a few thousand cycles long.
+-      See #226116. */
+-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
+-
+-   ML_(guess_and_register_stack) (sp, ctst);
+-
+-   vg_assert(VG_(owns_BigLock_LL)(ptid));
+-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
+-
+-   if (flags & VKI_CLONE_SETTLS) {
+-      /* Just assign the tls pointer in the guest TPIDRURO. */
+-      assign_guest_tls(ctid, child_tls);
+-   }
+-    
+-   flags &= ~VKI_CLONE_SETTLS;
+-
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
+-
+-   r0 = do_syscall_clone_arm_linux(
+-      ML_(start_thread_NORETURN), stack, flags, &VG_(threads)[ctid],
+-      child_tidptr, parent_tidptr, NULL
+-   );
+-   //VG_(printf)("AFTER SYSCALL, %x and %x  CHILD: %d PARENT: %d\n",child_tidptr, parent_tidptr,*child_tidptr,*parent_tidptr);
+-    
+-   res = VG_(mk_SysRes_arm_linux)( r0 );
+-
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
+-
+-out:
+-   if (sr_isError(res)) {
+-      VG_(cleanup_thread)(&ctst->arch);
+-      ctst->status = VgTs_Empty;
+-      VG_TRACK( pre_thread_ll_exit, ctid );
+-   }
+-
+-   return res;
+-}
+-
+-
+-
+ /* ---------------------------------------------------------------------
+    More thread stuff
+    ------------------------------------------------------------------ */
+@@ -256,26 +153,13 @@ void VG_(cleanup_thread) ( ThreadArchState* arch )
+ {
+ }  
+ 
+-void setup_child ( /*OUT*/ ThreadArchState *child,
+-                   /*IN*/  ThreadArchState *parent )
+-{
+-   child->vex = parent->vex;
+-   child->vex_shadow1 = parent->vex_shadow1;
+-   child->vex_shadow2 = parent->vex_shadow2;
+-}
+-
+-static void assign_guest_tls(ThreadId tid, Addr tlsptr)
+-{
+-   VG_(threads)[tid].arch.vex.guest_TPIDRURO = tlsptr;
+-}
+-
+ /* Assigns tlsptr to the guest TPIDRURO.
+    If needed for the specific hardware, really executes
+    the set_tls syscall.
+ */
+ static SysRes sys_set_tls ( ThreadId tid, Addr tlsptr )
+ {
+-   assign_guest_tls(tid, tlsptr);
++   VG_(threads)[tid].arch.vex.guest_TPIDRURO = tlsptr;
+ 
+    if (KernelVariantiS(KernelVariant_android_no_hw_tls,
+                        VG_(clo_kernel_variant))) {
+@@ -333,7 +217,6 @@ DECL_TEMPLATE(arm_linux, sys_stat64);
+ DECL_TEMPLATE(arm_linux, sys_lstat64);
+ DECL_TEMPLATE(arm_linux, sys_fstatat64);
+ DECL_TEMPLATE(arm_linux, sys_fstat64);
+-DECL_TEMPLATE(arm_linux, sys_clone);
+ DECL_TEMPLATE(arm_linux, sys_sigreturn);
+ DECL_TEMPLATE(arm_linux, sys_rt_sigreturn);
+ DECL_TEMPLATE(arm_linux, sys_sigsuspend);
+@@ -424,100 +307,6 @@ POST(sys_fstat64)
+    POST_MEM_WRITE( ARG2, sizeof(struct vki_stat64) );
+ }
+ 
+-PRE(sys_clone)
+-{
+-    UInt cloneflags;
+-
+-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
+-   PRE_REG_READ5(int, "clone",
+-                 unsigned long, flags,
+-                 void *, child_stack,
+-                 int *, parent_tidptr,
+-                 void *, child_tls,
+-                 int *, child_tidptr);
+-
+-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
+-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
+-                                             VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
+-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof(Int), 
+-                                             VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-   if (ARG1 & VKI_CLONE_SETTLS) {
+-      PRE_MEM_READ("clone(tls_user_desc)", ARG4, sizeof(vki_modify_ldt_t));
+-      if (!VG_(am_is_valid_for_client)(ARG4, sizeof(vki_modify_ldt_t), 
+-                                             VKI_PROT_READ)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-
+-   cloneflags = ARG1;
+-
+-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
+-      SET_STATUS_Failure( VKI_EINVAL );
+-      return;
+-   }
+-
+-   /* Only look at the flags we really care about */
+-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
+-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
+-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
+-      /* thread creation */
+-      SET_STATUS_from_SysRes(
+-         do_clone(tid,
+-                  ARG1,         /* flags */
+-                  (Addr)ARG2,   /* child ESP */
+-                  (Int *)ARG3,  /* parent_tidptr */
+-                  (Int *)ARG5,  /* child_tidptr */
+-                  (Addr)ARG4)); /* set_tls */
+-      break;
+-
+-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
+-      /* FALLTHROUGH - assume vfork == fork */
+-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
+-
+-   case 0: /* plain fork */
+-      SET_STATUS_from_SysRes(
+-         ML_(do_fork_clone)(tid,
+-                       cloneflags,      /* flags */
+-                       (Int *)ARG3,     /* parent_tidptr */
+-                       (Int *)ARG5));   /* child_tidptr */
+-      break;
+-
+-   default:
+-      /* should we just ENOSYS? */
+-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
+-      VG_(message)(Vg_UserMsg, "\n");
+-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
+-      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
+-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
+-      VG_(message)(Vg_UserMsg, " - for the Quadrics Elan3 user-space driver\n");
+-      VG_(unimplemented)
+-         ("Valgrind does not support general clone().");
+-   }
+-
+-   if (SUCCESS) {
+-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
+-         POST_MEM_WRITE(ARG3, sizeof(Int));
+-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
+-         POST_MEM_WRITE(ARG5, sizeof(Int));
+-
+-      /* Thread creation was successful; let the child have the chance
+-         to run */
+-      *flags |= SfYieldAfter;
+-   }
+-}
+-
+ PRE(sys_sigreturn)
+ {
+    /* See comments on PRE(sys_rt_sigreturn) in syswrap-amd64-linux.c for
+@@ -901,7 +690,7 @@ static SyscallTableEntry syscall_main_table[] = {
+    GENX_(__NR_fsync,             sys_fsync),          // 118
+    PLAX_(__NR_sigreturn,         sys_sigreturn),      // 119 ?/Linux
+ 
+-   PLAX_(__NR_clone,             sys_clone),          // 120
++   LINX_(__NR_clone,             sys_clone),          // 120
+ //zz    //   (__NR_setdomainname,     sys_setdomainname),  // 121 */*(?)
+    GENXY(__NR_uname,             sys_newuname),       // 122
+ //   PLAX_(__NR_modify_ldt,        sys_modify_ldt),     // 123
+diff --git a/coregrind/m_syswrap/syswrap-arm64-linux.c b/coregrind/m_syswrap/syswrap-arm64-linux.c
+index 6b579e8..1be6629 100644
+--- a/coregrind/m_syswrap/syswrap-arm64-linux.c
++++ b/coregrind/m_syswrap/syswrap-arm64-linux.c
+@@ -138,14 +138,7 @@ asm(
+ #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
+ #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
+ 
+-extern
+-Long do_syscall_clone_arm64_linux ( Word (*fn)(void *), 
+-                                    void* child_stack, 
+-                                    Long  flags, 
+-                                    void* arg,
+-                                    Int*  child_tid,
+-                                    Int*  parent_tid,
+-                                    void* tls );
++// See priv_syswrap-linux.h for arg profile.
+ asm(
+ ".text\n"
+ ".globl do_syscall_clone_arm64_linux\n"
+@@ -196,121 +189,6 @@ static void setup_child ( ThreadArchState*, ThreadArchState* );
+ static void assign_guest_tls(ThreadId ctid, Addr tlsptr);
+ //ZZ static SysRes sys_set_tls ( ThreadId tid, Addr tlsptr );
+             
+-/* 
+-   When a client clones, we need to keep track of the new thread.  This means:
+-   1. allocate a ThreadId+ThreadState+stack for the thread
+-
+-   2. initialize the thread's new VCPU state
+-
+-   3. create the thread using the same args as the client requested,
+-   but using the scheduler entrypoint for IP, and a separate stack
+-   for SP.
+- */
+-static SysRes do_clone ( ThreadId ptid, 
+-                         ULong flags,
+-                         Addr  child_xsp, 
+-                         Int*  parent_tidptr, 
+-                         Int*  child_tidptr, 
+-                         Addr  child_tls )
+-{
+-   ThreadId     ctid = VG_(alloc_ThreadState)();
+-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
+-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
+-   UWord*       stack;
+-   SysRes       res;
+-   ULong        x0;
+-   vki_sigset_t blockall, savedmask;
+-
+-   VG_(sigfillset)(&blockall);
+-
+-   vg_assert(VG_(is_running_thread)(ptid));
+-   vg_assert(VG_(is_valid_tid)(ctid));
+-
+-   stack = (UWord*)ML_(allocstack)(ctid);
+-   if (stack == NULL) {
+-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
+-      goto out;
+-   }
+-
+-   /* Copy register state
+-
+-      Both parent and child return to the same place, and the code
+-      following the clone syscall works out which is which, so we
+-      don't need to worry about it.
+-
+-      The parent gets the child's new tid returned from clone, but the
+-      child gets 0.
+-
+-      If the clone call specifies a NULL xsp for the new thread, then
+-      it actually gets a copy of the parent's xsp.
+-   */
+-   setup_child( &ctst->arch, &ptst->arch );
+-
+-   /* Make sys_clone appear to have returned Success(0) in the
+-      child. */
+-   ctst->arch.vex.guest_X0 = 0;
+-
+-   if (child_xsp != 0)
+-      ctst->arch.vex.guest_XSP = child_xsp;
+-
+-   ctst->os_state.parent = ptid;
+-
+-   /* inherit signal mask */
+-   ctst->sig_mask = ptst->sig_mask;
+-   ctst->tmp_sig_mask = ptst->sig_mask;
+-
+-   /* Start the child with its threadgroup being the same as the
+-      parent's.  This is so that any exit_group calls that happen
+-      after the child is created but before it sets its
+-      os_state.threadgroup field for real (in thread_wrapper in
+-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
+-      a race condition in which the thread is unkillable (via
+-      exit_group) because its threadgroup is not set.  The race window
+-      is probably only a few hundred or a few thousand cycles long.
+-      See #226116. */
+-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
+-
+-   ML_(guess_and_register_stack)(child_xsp, ctst);
+-
+-   /* Assume the clone will succeed, and tell any tool that wants to
+-      know that this thread has come into existence.  If the clone
+-      fails, we'll send out a ll_exit notification for it at the out:
+-      label below, to clean up. */
+-   vg_assert(VG_(owns_BigLock_LL)(ptid));
+-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
+-
+-   if (flags & VKI_CLONE_SETTLS) {
+-      /* Just assign the tls pointer in the guest TPIDR_EL0. */
+-      assign_guest_tls(ctid, child_tls);
+-   }
+-    
+-   flags &= ~VKI_CLONE_SETTLS;
+-
+-   /* start the thread with everything blocked */
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
+-
+-   x0 = do_syscall_clone_arm64_linux(
+-      ML_(start_thread_NORETURN), stack, flags, &VG_(threads)[ctid],
+-      child_tidptr, parent_tidptr, NULL
+-   );
+-    
+-   res = VG_(mk_SysRes_arm64_linux)( x0 );
+-
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
+-
+-  out:
+-   if (sr_isError(res)) {
+-      /* clone failed */
+-      VG_(cleanup_thread)(&ctst->arch);
+-      ctst->status = VgTs_Empty;
+-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
+-      VG_TRACK( pre_thread_ll_exit, ctid );
+-   }
+-
+-   return res;
+-}
+-
+-
+ /* ---------------------------------------------------------------------
+    More thread stuff
+    ------------------------------------------------------------------ */
+@@ -397,7 +275,6 @@ DECL_TEMPLATE(arm64_linux, sys_mmap);
+ //ZZ DECL_TEMPLATE(arm_linux, sys_lstat64);
+ //ZZ DECL_TEMPLATE(arm_linux, sys_fstatat64);
+ //ZZ DECL_TEMPLATE(arm_linux, sys_fstat64);
+-DECL_TEMPLATE(arm64_linux, sys_clone);
+ //ZZ DECL_TEMPLATE(arm_linux, sys_sigreturn);
+ DECL_TEMPLATE(arm64_linux, sys_rt_sigreturn);
+ //ZZ DECL_TEMPLATE(arm_linux, sys_sigsuspend);
+@@ -512,110 +389,6 @@ PRE(sys_mmap)
+ //ZZ    POST_MEM_WRITE( ARG2, sizeof(struct vki_stat64) );
+ //ZZ }
+ 
+-/* Aarch64 seems to use CONFIG_CLONE_BACKWARDS in the kernel.  See:
+-      http://dev.gentoo.org/~vapier/aarch64/linux-3.12.6.config
+-      http://people.redhat.com/wcohen/aarch64/aarch64_config
+-   from linux-3.10.5/kernel/fork.c 
+-    #ifdef CONFIG_CLONE_BACKWARDS
+-    SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
+-                     int __user *, parent_tidptr,
+-                     int, tls_val,
+-                     int __user *, child_tidptr)
+-*/
+-PRE(sys_clone)
+-{
+-   UInt cloneflags;
+-
+-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
+-   PRE_REG_READ5(int, "clone",
+-                 unsigned long, flags,
+-                 void *, child_stack,
+-                 int *, parent_tidptr,
+-                 void *, child_tls,
+-                 int *, child_tidptr);
+-
+-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
+-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
+-                                             VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-//ZZ    if (ARG1 & VKI_CLONE_SETTLS) {
+-//ZZ       PRE_MEM_READ("clone(tls_user_desc)", ARG4, sizeof(vki_modify_ldt_t));
+-//ZZ       if (!VG_(am_is_valid_for_client)(ARG4, sizeof(vki_modify_ldt_t), 
+-//ZZ                                              VKI_PROT_READ)) {
+-//ZZ          SET_STATUS_Failure( VKI_EFAULT );
+-//ZZ          return;
+-//ZZ       }
+-//ZZ    }
+-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
+-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof(Int), 
+-                                             VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-
+-   cloneflags = ARG1;
+-
+-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
+-      SET_STATUS_Failure( VKI_EINVAL );
+-      return;
+-   }
+-
+-   /* Only look at the flags we really care about */
+-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
+-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
+-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
+-      /* thread creation */
+-      SET_STATUS_from_SysRes(
+-         do_clone(tid,
+-                  ARG1,         /* flags */
+-                  (Addr)ARG2,   /* child SP */
+-                  (Int*)ARG3,   /* parent_tidptr */
+-                  (Int*)ARG5,   /* child_tidptr */
+-                  (Addr)ARG4)); /* tls_val */
+-      break;
+-
+-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
+-      /* FALLTHROUGH - assume vfork == fork */
+-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
+-
+-   case 0: /* plain fork */
+-      SET_STATUS_from_SysRes(
+-         ML_(do_fork_clone)(tid,
+-                       cloneflags,     /* flags */
+-                       (Int*)ARG3,     /* parent_tidptr */
+-                       (Int*)ARG5));   /* child_tidptr */
+-      break;
+-
+-   default:
+-      /* should we just ENOSYS? */
+-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
+-      VG_(message)(Vg_UserMsg, "\n");
+-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
+-      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
+-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
+-      VG_(message)(Vg_UserMsg, " - for the Quadrics Elan3 user-space driver\n");
+-      VG_(unimplemented)
+-         ("Valgrind does not support general clone().");
+-   }
+-
+-   if (SUCCESS) {
+-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
+-         POST_MEM_WRITE(ARG3, sizeof(Int));
+-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
+-         POST_MEM_WRITE(ARG5, sizeof(Int));
+-
+-      /* Thread creation was successful; let the child have the chance
+-         to run */
+-      *flags |= SfYieldAfter;
+-   }
+-}
+-
+ //ZZ PRE(sys_sigreturn)
+ //ZZ {
+ //ZZ    /* See comments on PRE(sys_rt_sigreturn) in syswrap-amd64-linux.c for
+@@ -1072,7 +845,7 @@ static SyscallTableEntry syscall_main_table[] = {
+    LINX_(__NR_add_key,           sys_add_key),           // 217
+ 
+    LINXY(__NR_keyctl,            sys_keyctl),            // 219
+-   PLAX_(__NR_clone,             sys_clone),             // 220
++   LINX_(__NR_clone,             sys_clone),             // 220
+    GENX_(__NR_execve,            sys_execve),            // 221
+    PLAX_(__NR_mmap,              sys_mmap),              // 222
+    PLAX_(__NR_fadvise64,         sys_fadvise64),         // 223
+diff --git a/coregrind/m_syswrap/syswrap-linux.c b/coregrind/m_syswrap/syswrap-linux.c
+index b3ffdb1..aa00a5f 100644
+--- a/coregrind/m_syswrap/syswrap-linux.c
++++ b/coregrind/m_syswrap/syswrap-linux.c
+@@ -93,9 +93,8 @@ static VgSchedReturnCode thread_wrapper(Word /*ThreadId*/ tidW)
+    VG_TRACK(pre_thread_first_insn, tid);
+ 
+    tst->os_state.lwpid = VG_(gettid)();
+-   /* Set the threadgroup for real.  This overwrites the provisional
+-      value set in do_clone() syswrap-*-linux.c.  See comments in
+-      do_clone for background, also #226116. */
++   /* Set the threadgroup for real.  This overwrites the provisional value set
++      in do_clone().  See comments in do_clone for background, also #226116. */
+    tst->os_state.threadgroup = VG_(getpid)();
+ 
+    /* Thread created with all signals blocked; scheduler will set the
+@@ -430,17 +429,327 @@ void VG_(main_thread_wrapper_NORETURN)(ThreadId tid)
+    vg_assert(0);
+ }
+ 
++/* Clone a new thread. Note that in the clone syscalls, we hard-code
++   tlsaddr argument as NULL : the guest TLS is emulated via guest
++   registers, and Valgrind itself has no thread local storage. */
++static SysRes clone_new_thread ( Word (*fn)(void *), 
++                                 void* stack, 
++                                 Word  flags, 
++                                 ThreadState* ctst,
++                                 Int* child_tidptr, 
++                                 Int* parent_tidptr)
++{
++   SysRes res;
++   /* Note that in all the below, we make sys_clone appear to have returned
++      Success(0) in the child, by assigning the relevant child guest
++      register(s) just before the clone syscall. */
++#if defined(VGP_x86_linux)
++   Int          eax;
++   ctst->arch.vex.guest_EAX = 0;
++   eax = do_syscall_clone_x86_linux
++      (ML_(start_thread_NORETURN), stack, flags, ctst,
++       child_tidptr, parent_tidptr, NULL);
++   res = VG_(mk_SysRes_x86_linux)( eax );
++#elif defined(VGP_amd64_linux)
++   Long         rax;
++   ctst->arch.vex.guest_RAX = 0;
++   rax = do_syscall_clone_amd64_linux
++      (ML_(start_thread_NORETURN), stack, flags, ctst,
++       child_tidptr, parent_tidptr, NULL);
++   res = VG_(mk_SysRes_amd64_linux)( rax );
++#elif defined(VGP_ppc32_linux)
++   ULong        word64;
++   UInt old_cr = LibVEX_GuestPPC32_get_CR( &ctst->arch.vex );
++   /* %r3 = 0 */
++   ctst->arch.vex.guest_GPR3 = 0;
++   /* %cr0.so = 0 */
++   LibVEX_GuestPPC32_put_CR( old_cr & ~(1<<28), &ctst->arch.vex );
++   word64 = do_syscall_clone_ppc32_linux
++      (ML_(start_thread_NORETURN), stack, flags, ctst,
++       child_tidptr, parent_tidptr, NULL);
++   /* High half word64 is syscall return value.  Low half is
++      the entire CR, from which we need to extract CR0.SO. */
++   /* VG_(printf)("word64 = 0x%llx\n", word64); */
++   res = VG_(mk_SysRes_ppc32_linux)(/*val*/(UInt)(word64 >> 32), 
++                                    /*errflag*/ (((UInt)word64) >> 28) & 1);
++#elif defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
++   ULong        word64;
++   UInt old_cr = LibVEX_GuestPPC64_get_CR( &ctst->arch.vex );
++   /* %r3 = 0 */
++   ctst->arch.vex.guest_GPR3 = 0;
++   /* %cr0.so = 0 */
++   LibVEX_GuestPPC64_put_CR( old_cr & ~(1<<28), &ctst->arch.vex );
++   word64 = do_syscall_clone_ppc64_linux
++      (ML_(start_thread_NORETURN), stack, flags, ctst,
++       child_tidptr, parent_tidptr, NULL);
++   /* Low half word64 is syscall return value.  Hi half is
++      the entire CR, from which we need to extract CR0.SO. */
++   /* VG_(printf)("word64 = 0x%llx\n", word64); */
++   res = VG_(mk_SysRes_ppc64_linux)
++      (/*val*/(UInt)(word64 & 0xFFFFFFFFULL), 
++       /*errflag*/ (UInt)((word64 >> (32+28)) & 1));
++#elif defined(VGP_s390x_linux)
++   ULong        r2;
++   ctst->arch.vex.guest_r2 = 0;
++   r2 = do_syscall_clone_s390x_linux
++      (stack, flags, parent_tidptr, child_tidptr, NULL,
++       ML_(start_thread_NORETURN), ctst);
++   res = VG_(mk_SysRes_s390x_linux)( r2 );
++#elif defined(VGP_arm64_linux)
++   ULong        x0;
++   ctst->arch.vex.guest_X0 = 0;
++   x0 = do_syscall_clone_arm64_linux
++      (ML_(start_thread_NORETURN), stack, flags, ctst,
++       child_tidptr, parent_tidptr, NULL);
++   res = VG_(mk_SysRes_arm64_linux)( x0 );
++#elif defined(VGP_arm_linux)
++   UInt r0;
++   ctst->arch.vex.guest_R0 = 0;
++   r0 = do_syscall_clone_arm_linux
++      (ML_(start_thread_NORETURN), stack, flags, ctst,
++       child_tidptr, parent_tidptr, NULL);
++   res = VG_(mk_SysRes_arm_linux)( r0 );
++#elif defined(VGP_mips64_linux)
++   UInt ret = 0;
++   ctst->arch.vex.guest_r2 = 0;
++   ctst->arch.vex.guest_r7 = 0;
++   ret = do_syscall_clone_mips64_linux
++      (ML_(start_thread_NORETURN), stack, flags, ctst,
++       parent_tidptr, NULL, child_tidptr);
++   res = VG_(mk_SysRes_mips64_linux)( /* val */ ret, 0, /* errflag */ 0);
++#elif defined(VGP_mips32_linux)
++   UInt ret = 0;
++   ctst->arch.vex.guest_r2 = 0;
++   ctst->arch.vex.guest_r7 = 0;
++   ret = do_syscall_clone_mips_linux
++      (ML_(start_thread_NORETURN), stack, flags, ctst,
++       child_tidptr, parent_tidptr, NULL);
++   /* ret is the raw syscall return value; the SysRes is built from it
++      with a zero second value and a zero error flag. */
++   res = VG_ (mk_SysRes_mips32_linux) (/*val */ ret, 0, /*errflag */ 0);
++#elif defined(VGP_tilegx_linux)
++   Long ret = 0;
++   ctst->arch.vex.guest_r0 = 0;
++   ctst->arch.vex.guest_r3 = 0;
++   ret = do_syscall_clone_tilegx_linux
++      (ML_ (start_thread_NORETURN), stack, flags, ctst,
++       child_tidptr, parent_tidptr, NULL);
++   /* ret is the syscall return value. */
++   res = VG_(mk_SysRes_tilegx_linux) (/*val */ ret);
++#else
++# error Unknown platform
++#endif
++   return res;
++}
++
++static void setup_child ( /*OUT*/ ThreadArchState *child, 
++                          /*IN*/  ThreadArchState *parent )
++{  
++   /* We inherit our parent's guest state. */
++   child->vex = parent->vex;
++   child->vex_shadow1 = parent->vex_shadow1;
++   child->vex_shadow2 = parent->vex_shadow2;
+ 
+-/* Do a clone which is really a fork() */
+-SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
+-                            Int* parent_tidptr, Int* child_tidptr )
++#if defined(VGP_x86_linux)
++   extern void ML_(x86_setup_LDT_GDT) ( /*OUT*/ ThreadArchState *child, 
++                                        /*IN*/  ThreadArchState *parent );
++   ML_(x86_setup_LDT_GDT)(child, parent);
++#endif
++}
++
++static SysRes setup_child_tls (ThreadId ctid, Addr tlsaddr)
++{
++   static const Bool debug = False;
++   ThreadState* ctst = VG_(get_ThreadState)(ctid);
++   // res is successful by default, overridden if a real syscall is needed/done.
++   SysRes res = VG_(mk_SysRes_Success)(0);
++
++   if (debug)
++      VG_(printf)("clone child has SETTLS: tls at %#lx\n", tlsaddr);
++
++#if defined(VGP_x86_linux)
++   vki_modify_ldt_t* tlsinfo = (vki_modify_ldt_t*)tlsaddr;
++   if (debug)
++      VG_(printf)("clone child has SETTLS: tls info at %p: idx=%u "
++                  "base=%#lx limit=%x; esp=%#x fs=%x gs=%x\n",
++                  tlsinfo, tlsinfo->entry_number, 
++                  tlsinfo->base_addr, tlsinfo->limit,
++                  ctst->arch.vex.guest_ESP,
++                  ctst->arch.vex.guest_FS, ctst->arch.vex.guest_GS);
++   res = ML_(x86_sys_set_thread_area)(ctid, tlsinfo);
++#elif defined(VGP_amd64_linux)
++   ctst->arch.vex.guest_FS_CONST = tlsaddr;
++#elif defined(VGP_ppc32_linux)
++   ctst->arch.vex.guest_GPR2 = tlsaddr;
++#elif defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
++   ctst->arch.vex.guest_GPR13 = tlsaddr;
++#elif defined(VGP_s390x_linux)
++   ctst->arch.vex.guest_a0 = (UInt) (tlsaddr >> 32);
++   ctst->arch.vex.guest_a1 = (UInt) tlsaddr;
++#elif defined(VGP_arm64_linux)
++   /* Just assign the tls pointer in the guest TPIDR_EL0. */
++   ctst->arch.vex.guest_TPIDR_EL0 = tlsaddr;
++#elif defined(VGP_arm_linux)
++   /* Just assign the tls pointer in the guest TPIDRURO. */
++   ctst->arch.vex.guest_TPIDRURO = tlsaddr;
++#elif defined(VGP_mips64_linux)
++   ctst->arch.vex.guest_ULR = tlsaddr;
++   ctst->arch.vex.guest_r27 = tlsaddr;
++#elif defined(VGP_mips32_linux)
++   ctst->arch.vex.guest_ULR = tlsaddr;
++   ctst->arch.vex.guest_r27 = tlsaddr;
++#elif defined(VGP_tilegx_linux)
++   ctst->arch.vex.guest_r53 = tlsaddr;
++#else
++# error Unknown platform
++#endif
++   return res;
++} 
++
++/* 
++   When a client clones, we need to keep track of the new thread.  This means:
++   1. allocate a ThreadId+ThreadState+stack for the thread
++
++   2. initialize the thread's new VCPU state
++
++   3. create the thread using the same args as the client requested,
++   but using the scheduler entrypoint for IP, and a separate stack
++   for SP.
++ */
++static SysRes do_clone ( ThreadId ptid, 
++                         UWord flags, Addr sp, 
++                         Int* parent_tidptr, 
++                         Int* child_tidptr, 
++                         Addr tlsaddr)
++{
++   ThreadId     ctid = VG_(alloc_ThreadState)();
++   ThreadState* ptst = VG_(get_ThreadState)(ptid);
++   ThreadState* ctst = VG_(get_ThreadState)(ctid);
++   UWord*       stack;
++   SysRes       res;
++   vki_sigset_t blockall, savedmask;
++
++   VG_(sigfillset)(&blockall);
++
++   vg_assert(VG_(is_running_thread)(ptid));
++   vg_assert(VG_(is_valid_tid)(ctid));
++
++   stack = (UWord*)ML_(allocstack)(ctid);
++   if (stack == NULL) {
++      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
++      goto out;
++   }
++
++   /* Copy register state
++
++      Both parent and child return to the same place, and the code
++      following the clone syscall works out which is which, so we
++      don't need to worry about it.
++
++      The parent gets the child's new tid returned from clone, but the
++      child gets 0.
++
++      If the clone call specifies a NULL sp for the new thread, then
++      it actually gets a copy of the parent's sp.
++   */
++   setup_child( &ctst->arch, &ptst->arch );
++
++   if (sp != 0)
++      VG_(set_SP)(ctid, sp);
++
++   ctst->os_state.parent = ptid;
++
++   /* inherit signal mask */
++   ctst->sig_mask     = ptst->sig_mask;
++   ctst->tmp_sig_mask = ptst->sig_mask;
++
++   /* Start the child with its threadgroup being the same as the
++      parent's.  This is so that any exit_group calls that happen
++      after the child is created but before it sets its
++      os_state.threadgroup field for real (in thread_wrapper in
++      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
++      a race condition in which the thread is unkillable (via
++      exit_group) because its threadgroup is not set.  The race window
++      is probably only a few hundred or a few thousand cycles long.
++      See #226116. */
++   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
++
++   ML_(guess_and_register_stack) (sp, ctst);
++   
++   /* Assume the clone will succeed, and tell any tool that wants to
++      know that this thread has come into existence.  We cannot defer
++      it beyond this point because setup_child_tls, just below,
++      causes checks to assert by making references to the new ThreadId
++      if we don't state the new thread exists prior to that point.
++      If the clone fails, we'll send out a ll_exit notification for it
++      at the out: label below, to clean up. */
++   vg_assert(VG_(owns_BigLock_LL)(ptid));
++   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
++
++   if (flags & VKI_CLONE_SETTLS) {
++      res = setup_child_tls(ctid, tlsaddr);
++      if (sr_isError(res))
++	 goto out;
++   }
++   flags &= ~VKI_CLONE_SETTLS;
++
++   /* start the thread with everything blocked */
++   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
++
++   /* Create the new thread */
++   res = clone_new_thread ( ML_(start_thread_NORETURN), stack, flags, ctst,
++                            child_tidptr, parent_tidptr);
++
++   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
++
++  out:
++   if (sr_isError(res)) {
++      /* clone failed */
++      VG_(cleanup_thread)(&ctst->arch);
++      ctst->status = VgTs_Empty;
++      /* oops.  Better tell the tool the thread exited in a hurry :-) */
++      VG_TRACK( pre_thread_ll_exit, ctid );
++   }
++
++#if defined(VGP_mips64_linux) || defined(VGP_mips32_linux)
++   // ??? why do we set unconditionally r2 to 0, even when error out ???
++   ptst->arch.vex.guest_r2 = 0;
++#elif defined(VGP_tilegx_linux)
++   // ??? why do we set unconditionally r0 to 0, even when error out ???
++   ptst->arch.vex.guest_r0 = 0;
++#endif
++
++   return res;
++}
++
++/* Do a clone which is really a fork().
++   ML_(do_fork_clone) uses the clone syscall to fork a child process.
++   Note that this should not be called for a thread creation.
++   Also, some flags combinations are not supported, and such combinations
++   are handled either by masking the non supported flags or by asserting.
++
++   The CLONE_VFORK flag is accepted, as it just means that the parent is
++   suspended until the child exits or calls execve. We keep this flag,
++   just in case the guest's parent/client code depends on this synchronisation.
++
++   We cannot keep the flag CLONE_VM, as Valgrind will execute arbitrary host
++   instructions in the child process, which would mess up the parent host
++   memory. So, we hope for the best and assume that the guest application does
++   not (really) depend on sharing the memory between parent and child in the
++   interval between clone and exit/execve.
++
++   If child_sp != 0, the child (guest) sp will be set to child_sp just after the
++   clone syscall, before child guest instructions are executed. */
++static SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
++                                   Int* parent_tidptr, Int* child_tidptr,
++                                   Addr child_sp)
+ {
+    vki_sigset_t fork_saved_mask;
+    vki_sigset_t mask;
+    SysRes       res;
+ 
+    if (flags & (VKI_CLONE_SETTLS | VKI_CLONE_FS | VKI_CLONE_VM 
+-                | VKI_CLONE_FILES | VKI_CLONE_VFORK))
++                | VKI_CLONE_FILES))
+       return VG_(mk_SysRes_Error)( VKI_EINVAL );
+ 
+    /* Block all signals during fork, so that we can fix things up in
+@@ -476,6 +785,8 @@ SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
+ 
+    if (!sr_isError(res) && sr_Res(res) == 0) {
+       /* child */
++      if (child_sp != 0)
++          VG_(set_SP)(tid, child_sp);
+       VG_(do_atfork_child)(tid);
+ 
+       /* restore signal mask */
+@@ -508,7 +819,6 @@ SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
+    return res;
+ }
+ 
+-
+ /* ---------------------------------------------------------------------
+    PRE/POST wrappers for arch-generic, Linux-specific syscalls
+    ------------------------------------------------------------------ */
+@@ -519,6 +829,157 @@ SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
+ #error Unknown endianness
+ #endif
+ 
++PRE(sys_clone)
++{
++   UInt cloneflags;
++   Bool badarg = False;
++
++   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
++
++// Order of arguments differs between platforms.
++#if defined(VGP_x86_linux) \
++    || defined(VGP_ppc32_linux) \
++    || defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)	\
++    || defined(VGP_arm_linux) || defined(VGP_mips32_linux) \
++    || defined(VGP_mips64_linux) || defined(VGP_arm64_linux)
++#define ARG_CHILD_TIDPTR ARG5
++#define PRA_CHILD_TIDPTR PRA5
++#define ARG_TLS          ARG4
++#define PRA_TLS          PRA4
++#elif defined(VGP_amd64_linux) || defined(VGP_tilegx_linux) \
++    || defined(VGP_s390x_linux)
++#define ARG_CHILD_TIDPTR ARG4
++#define PRA_CHILD_TIDPTR PRA4
++#define ARG_TLS          ARG5
++#define PRA_TLS          PRA5
++#else
++# error Unknown platform
++#endif
++// And s390x is even more special, and inverts flags and child stack args
++#if defined(VGP_s390x_linux)
++#define ARG_FLAGS       ARG2
++#define PRA_FLAGS       PRA2
++#define ARG_CHILD_STACK ARG1
++#define PRA_CHILD_STACK PRA1
++#else
++#define ARG_FLAGS       ARG1
++#define PRA_FLAGS       PRA1
++#define ARG_CHILD_STACK ARG2
++#define PRA_CHILD_STACK PRA2
++#endif
++
++   if (VG_(tdict).track_pre_reg_read) {
++      PRA_FLAGS("clone", unsigned long, flags);
++      PRA_CHILD_STACK("clone",  void *, child_stack);
++   }
++
++   if (ARG_FLAGS & VKI_CLONE_PARENT_SETTID) {
++      if (VG_(tdict).track_pre_reg_read) {
++         PRA3("clone", int *, parent_tidptr);
++      }
++      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
++      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
++                                             VKI_PROT_WRITE)) {
++         badarg = True;
++      }
++   }
++   if (ARG_FLAGS & VKI_CLONE_SETTLS) {
++      if (VG_(tdict).track_pre_reg_read) {
++         PRA_TLS("clone", vki_modify_ldt_t *, tlsinfo);
++      }
++      /* It is unclear what vki_modify_ldt_t is: for many platforms, it is a
++         dummy type (that we define as a char). We only dereference/check the
++         ARG_TLS pointer if the type looks like a real type, i.e. sizeof > 1. */
++      if (sizeof(vki_modify_ldt_t) > 1) {
++         PRE_MEM_READ("clone(tlsinfo)", ARG_TLS, sizeof(vki_modify_ldt_t));
++         if (!VG_(am_is_valid_for_client)(ARG_TLS, sizeof(vki_modify_ldt_t), 
++                                          VKI_PROT_READ)) {
++            badarg = True;
++         }
++      }
++   }
++   if (ARG_FLAGS & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
++      if (VG_(tdict).track_pre_reg_read) {
++         PRA_CHILD_TIDPTR("clone", int *, child_tidptr);
++      }
++      PRE_MEM_WRITE("clone(child_tidptr)", ARG_CHILD_TIDPTR, sizeof(Int));
++      if (!VG_(am_is_valid_for_client)(ARG_CHILD_TIDPTR, sizeof(Int), 
++                                             VKI_PROT_WRITE)) {
++         badarg = True;
++      }
++   }
++
++   if (badarg) {
++      SET_STATUS_Failure( VKI_EFAULT );
++      return;
++   }
++
++   cloneflags = ARG_FLAGS;
++
++   if (!ML_(client_signal_OK)(ARG_FLAGS & VKI_CSIGNAL)) {
++      SET_STATUS_Failure( VKI_EINVAL );
++      return;
++   }
++
++   /* Only look at the flags we really care about */
++   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
++                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
++   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
++      /* thread creation */
++      SET_STATUS_from_SysRes(
++         do_clone(tid,
++                  ARG_FLAGS,               /* flags */
++                  (Addr)ARG_CHILD_STACK,   /* child ESP */
++                  (Int*)ARG3,              /* parent_tidptr */
++                  (Int*)ARG_CHILD_TIDPTR,  /* child_tidptr */
++                  (Addr)ARG_TLS));         /* set_tls */
++      break;
++
++   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
++      // FALLTHROUGH - assume vfork (somewhat) == fork, see ML_(do_fork_clone).
++      cloneflags &= ~VKI_CLONE_VM;
++
++   case 0: /* plain fork */
++      SET_STATUS_from_SysRes(
++         ML_(do_fork_clone)(tid,
++                       cloneflags,      /* flags */
++                       (Int*)ARG3,     /* parent_tidptr */
++                       (Int*)ARG_CHILD_TIDPTR,     /* child_tidptr */
++                       (Addr)ARG_CHILD_STACK));
++      break;
++
++   default:
++      /* should we just ENOSYS? */
++      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG_FLAGS);
++      VG_(message)(Vg_UserMsg, "\n");
++      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
++      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
++      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
++      VG_(unimplemented)
++         ("Valgrind does not support general clone().");
++   }
++
++   if (SUCCESS) {
++      if (ARG_FLAGS & VKI_CLONE_PARENT_SETTID)
++         POST_MEM_WRITE(ARG3, sizeof(Int));
++      if (ARG_FLAGS & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
++         POST_MEM_WRITE(ARG_CHILD_TIDPTR, sizeof(Int));
++
++      /* Thread creation was successful; let the child have the chance
++         to run */
++      *flags |= SfYieldAfter;
++   }
++
++#undef ARG_CHILD_TIDPTR
++#undef PRA_CHILD_TIDPTR
++#undef ARG_TLS
++#undef PRA_TLS
++#undef ARG_FLAGS
++#undef PRA_FLAGS
++#undef ARG_CHILD_STACK
++#undef PRA_CHILD_STACK
++}
++
+ /* ---------------------------------------------------------------------
+    *mount wrappers
+    ------------------------------------------------------------------ */
+diff --git a/coregrind/m_syswrap/syswrap-mips64-linux.c b/coregrind/m_syswrap/syswrap-mips64-linux.c
+index 6e3db74..d3d70c5 100644
+--- a/coregrind/m_syswrap/syswrap-mips64-linux.c
++++ b/coregrind/m_syswrap/syswrap-mips64-linux.c
+@@ -136,14 +136,7 @@ asm (
+ #define __NR_CLONE        __NR_clone
+ #define __NR_EXIT         __NR_exit
+ 
+-ULong do_syscall_clone_mips64_linux ( Word (*fn) (void *),  /* a0 - 4 */
+-                                      void* stack,          /* a1 - 5 */
+-                                      Int   flags,          /* a2 - 6 */
+-                                      void* arg,            /* a3 - 7 */
+-                                      Int*  parent_tid,     /* a4 - 8 */
+-                                      void* /* Int tls */,  /* a5 - 9 */
+-                                      Int*  child_tid );    /* a6 - 10 */
+-
++// See priv_syswrap-linux.h for arg profile.
+ asm(
+ ".text\n" 
+ ".set noreorder\n"
+@@ -199,104 +192,13 @@ asm(
+ #undef __NR_EXIT
+ 
+ /* forward declarations */
+-static void setup_child ( ThreadArchState *, ThreadArchState *);
+ static SysRes sys_set_tls ( ThreadId tid, Addr tlsptr);
+ 
+-/* When a client clones, we need to keep track of the new thread. This means:
+-   1. allocate a ThreadId+ThreadState+stack for the thread
+-
+-   2. initialize the thread's new VCPU state
+-
+-   3. create the thread using the same args as the client requested, but using
+-      the scheduler entrypoint for IP, and a separate stack for SP. */
+-static SysRes do_clone ( ThreadId ptid,
+-                         UInt flags, Addr sp,
+-                         Int* parent_tidptr,
+-                         Int* child_tidptr,
+-                         Addr child_tls )
+-{
+-   const Bool debug = False;
+-   ThreadId ctid = VG_ (alloc_ThreadState) ();
+-   ThreadState * ptst = VG_ (get_ThreadState) (ptid);
+-   ThreadState * ctst = VG_ (get_ThreadState) (ctid);
+-   UInt ret = 0;
+-   UWord * stack;
+-   SysRes res;
+-   vki_sigset_t blockall, savedmask;
+-
+-   VG_(sigfillset)(&blockall);
+-   vg_assert(VG_(is_running_thread)(ptid));
+-   vg_assert(VG_(is_valid_tid)(ctid));
+-   stack = (UWord *)ML_(allocstack)(ctid);
+-   if (stack == NULL) {
+-      res = VG_(mk_SysRes_Error)(VKI_ENOMEM);
+-      goto out;
+-   }
+-   setup_child(&ctst->arch, &ptst->arch);
+-
+-   /* on MIPS we need to set V0 and A3 to zero */
+-   ctst->arch.vex.guest_r2 = 0;
+-   ctst->arch.vex.guest_r7 = 0;
+-   if (sp != 0)
+-      ctst->arch.vex.guest_r29 = sp;
+-
+-   ctst->os_state.parent = ptid;
+-   ctst->sig_mask = ptst->sig_mask;
+-   ctst->tmp_sig_mask = ptst->sig_mask;
+-
+-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
+-
+-   ML_(guess_and_register_stack) (sp, ctst);
+-
+-   VG_TRACK(pre_thread_ll_create, ptid, ctid);
+-   if (flags & VKI_CLONE_SETTLS) {
+-       if (debug)
+-         VG_(printf)("clone child has SETTLS: tls at %#lx\n", child_tls);
+-       res = sys_set_tls(ctid, child_tls);
+-       if (sr_isError(res))
+-          goto out;
+-       ctst->arch.vex.guest_r27 = child_tls;
+-   }
+-
+-   flags &= ~VKI_CLONE_SETTLS;
+-   VG_ (sigprocmask) (VKI_SIG_SETMASK, &blockall, &savedmask);
+-   /* Create the new thread */
+-   ret = do_syscall_clone_mips64_linux(ML_(start_thread_NORETURN),
+-                                       stack, flags, &VG_(threads)[ctid],
+-                                       parent_tidptr, NULL /*child_tls*/,
+-                                       child_tidptr);
+-   if (debug)
+-     VG_(printf)("ret: 0x%x\n", ret);
+-
+-   res = VG_(mk_SysRes_mips64_linux)( /* val */ ret, 0, /* errflag */ 0);
+-
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
+-
+-   out:
+-   if (sr_isError (res)) {
+-      VG_ (cleanup_thread) (&ctst->arch);
+-      ctst->status = VgTs_Empty;
+-      VG_TRACK (pre_thread_ll_exit, ctid);
+-   }
+-   ptst->arch.vex.guest_r2 = 0;
+-
+-   return res;
+-}
+-
+ /* ---------------------------------------------------------------------
+                           More thread stuff
+    ------------------------------------------------------------------ */
+ void VG_(cleanup_thread) ( ThreadArchState * arch ) { };
+ 
+-void setup_child ( /* OUT */ ThreadArchState * child,
+-                   /* IN  */ ThreadArchState * parent )
+-{
+-   /* We inherit our parent's guest state. */
+-   child->vex = parent->vex;
+-   child->vex_shadow1 = parent->vex_shadow1;
+-   child->vex_shadow2 = parent->vex_shadow2;
+-}
+-
+ SysRes sys_set_tls ( ThreadId tid, Addr tlsptr )
+ {
+    VG_(threads)[tid].arch.vex.guest_ULR = tlsptr;
+@@ -316,7 +218,6 @@ SysRes sys_set_tls ( ThreadId tid, Addr tlsptr )
+    file, but that requires even more macro magic. */
+ 
+ DECL_TEMPLATE (mips_linux, sys_set_thread_area);
+-DECL_TEMPLATE (mips_linux, sys_clone);
+ DECL_TEMPLATE (mips_linux, sys_tee);
+ DECL_TEMPLATE (mips_linux, sys_splice);
+ DECL_TEMPLATE (mips_linux, sys_vmsplice);
+@@ -494,84 +395,6 @@ PRE(sys_mmap)
+                                  (Off64T) ARG6);
+    SET_STATUS_from_SysRes(r);
+ }
+-
+-PRE(sys_clone)
+-{
+-   Bool badarg = False;
+-   UInt cloneflags;
+-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )", ARG1, ARG2, ARG3,
+-                                                      ARG4, ARG5);
+-   PRE_REG_READ2(int, "clone", unsigned long, flags, void *, child_stack);
+-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
+-      if (VG_(tdict).track_pre_reg_read) {
+-         PRA3("clone", int *, parent_tidptr);
+-      }
+-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), VKI_PROT_WRITE)) {
+-         badarg = True;
+-      }
+-   }
+-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
+-      if (VG_(tdict).track_pre_reg_read) {
+-         PRA5("clone", int *, child_tidptr);
+-      }
+-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof (Int));
+-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof (Int), VKI_PROT_WRITE))
+-         badarg = True;
+-   }
+-   if (badarg) {
+-      SET_STATUS_Failure(VKI_EFAULT);
+-      return;
+-   }
+-   cloneflags = ARG1;
+-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
+-      SET_STATUS_Failure(VKI_EINVAL);
+-      return;
+-   }
+-   /* Only look at the flags we really care about */
+-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS
+-           |VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
+-      case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
+-         /* thread creation */
+-         SET_STATUS_from_SysRes(do_clone(tid,
+-                                         ARG1,          /* flags */
+-                                         (Addr)ARG2,    /* child SP */
+-                                         (Int *)ARG3,   /* parent_tidptr */
+-                                         (Int *)ARG5,   /* child_tidptr */
+-                                         (Addr)ARG4));  /* child_tls */
+-         break;
+-
+-      case VKI_CLONE_VFORK | VKI_CLONE_VM:  /* vfork */
+-         /* FALLTHROUGH - assume vfork == fork */
+-         cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
+-      case 0:  /* plain fork */
+-         SET_STATUS_from_SysRes(ML_(do_fork_clone)(tid,
+-                                cloneflags,     /* flags */
+-                                (Int *)ARG3,    /* parent_tidptr */
+-                                (Int *)ARG5));  /* child_tidptr */
+-         break;
+-
+-      default:
+-         /* should we just ENOSYS? */
+-         VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
+-         VG_(message)(Vg_UserMsg, "\n");
+-         VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
+-         VG_(message)(Vg_UserMsg,
+-                       " - via a threads library (LinuxThreads or NPTL)\n");
+-         VG_(message)(Vg_UserMsg,
+-                       " - via the implementation of fork or vfork\n");
+-         VG_(unimplemented)("Valgrind does not support general clone().");
+-   }
+-   if (SUCCESS) {
+-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
+-         POST_MEM_WRITE(ARG3, sizeof(Int));
+-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
+-         POST_MEM_WRITE(ARG5, sizeof(Int));
+-      /* Thread creation was successful; let the child have the chance to run */
+-      *flags |= SfYieldAfter;
+-   }
+-}
+-
+ PRE(sys_rt_sigreturn)
+ {
+    /* See comments on PRE(sys_rt_sigreturn) in syswrap-s390x-linux.c for
+@@ -766,7 +589,7 @@ static SyscallTableEntry syscall_main_table[] = {
+    LINXY (__NR_socketpair, sys_socketpair),
+    LINX_ (__NR_setsockopt, sys_setsockopt),
+    LINXY (__NR_getsockopt, sys_getsockopt),
+-   PLAX_ (__NR_clone, sys_clone),
++   LINX_ (__NR_clone, sys_clone),
+    GENX_ (__NR_fork, sys_fork),
+    GENX_ (__NR_execve, sys_execve),
+    GENX_ (__NR_exit, sys_exit),
+diff --git a/coregrind/m_syswrap/syswrap-ppc32-linux.c b/coregrind/m_syswrap/syswrap-ppc32-linux.c
+index 379fcb3..a654a90 100644
+--- a/coregrind/m_syswrap/syswrap-ppc32-linux.c
++++ b/coregrind/m_syswrap/syswrap-ppc32-linux.c
+@@ -146,14 +146,7 @@ asm(
+ #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
+ #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
+ 
+-extern
+-ULong do_syscall_clone_ppc32_linux ( Word (*fn)(void *), 
+-                                     void* stack, 
+-                                     Int   flags, 
+-                                     void* arg,
+-                                     Int*  child_tid, 
+-                                     Int*  parent_tid, 
+-                                     vki_modify_ldt_t * );
++// See priv_syswrap-linux.h for arg profile.
+ asm(
+ ".text\n"
+ ".globl do_syscall_clone_ppc32_linux\n"
+@@ -216,145 +209,6 @@ asm(
+ #undef __NR_CLONE
+ #undef __NR_EXIT
+ 
+-// forward declarations
+-static void setup_child ( ThreadArchState*, ThreadArchState* );
+-
+-/* 
+-   When a client clones, we need to keep track of the new thread.  This means:
+-   1. allocate a ThreadId+ThreadState+stack for the thread
+-
+-   2. initialize the thread's new VCPU state
+-
+-   3. create the thread using the same args as the client requested,
+-   but using the scheduler entrypoint for IP, and a separate stack
+-   for SP.
+- */
+-static SysRes do_clone ( ThreadId ptid, 
+-                         UInt flags, Addr sp, 
+-                         Int *parent_tidptr, 
+-                         Int *child_tidptr, 
+-                         Addr child_tls)
+-{
+-   const Bool debug = False;
+-
+-   ThreadId     ctid = VG_(alloc_ThreadState)();
+-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
+-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
+-   ULong        word64;
+-   UWord*       stack;
+-   SysRes       res;
+-   vki_sigset_t blockall, savedmask;
+-
+-   VG_(sigfillset)(&blockall);
+-
+-   vg_assert(VG_(is_running_thread)(ptid));
+-   vg_assert(VG_(is_valid_tid)(ctid));
+-
+-   stack = (UWord*)ML_(allocstack)(ctid);
+-   if (stack == NULL) {
+-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
+-      goto out;
+-   }
+-
+-//?   /* make a stack frame */
+-//?   stack -= 16;
+-//?   *(UWord *)stack = 0;
+-
+-
+-   /* Copy register state
+-
+-      Both parent and child return to the same place, and the code
+-      following the clone syscall works out which is which, so we
+-      don't need to worry about it.
+-
+-      The parent gets the child's new tid returned from clone, but the
+-      child gets 0.
+-
+-      If the clone call specifies a NULL SP for the new thread, then
+-      it actually gets a copy of the parent's SP.
+-
+-      The child's TLS register (r2) gets set to the tlsaddr argument
+-      if the CLONE_SETTLS flag is set.
+-   */
+-   setup_child( &ctst->arch, &ptst->arch );
+-
+-   /* Make sys_clone appear to have returned Success(0) in the
+-      child. */
+-   { UInt old_cr = LibVEX_GuestPPC32_get_CR( &ctst->arch.vex );
+-     /* %r3 = 0 */
+-     ctst->arch.vex.guest_GPR3 = 0;
+-     /* %cr0.so = 0 */
+-     LibVEX_GuestPPC32_put_CR( old_cr & ~(1<<28), &ctst->arch.vex );
+-   }
+-
+-   if (sp != 0)
+-      ctst->arch.vex.guest_GPR1 = sp;
+-
+-   ctst->os_state.parent = ptid;
+-
+-   /* inherit signal mask */
+-   ctst->sig_mask = ptst->sig_mask;
+-   ctst->tmp_sig_mask = ptst->sig_mask;
+-
+-   /* Start the child with its threadgroup being the same as the
+-      parent's.  This is so that any exit_group calls that happen
+-      after the child is created but before it sets its
+-      os_state.threadgroup field for real (in thread_wrapper in
+-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
+-      a race condition in which the thread is unkillable (via
+-      exit_group) because its threadgroup is not set.  The race window
+-      is probably only a few hundred or a few thousand cycles long.
+-      See #226116. */
+-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
+-
+-   ML_(guess_and_register_stack) (sp, ctst);
+-
+-   /* Assume the clone will succeed, and tell any tool that wants to
+-      know that this thread has come into existence.  If the clone
+-      fails, we'll send out a ll_exit notification for it at the out:
+-      label below, to clean up. */
+-   vg_assert(VG_(owns_BigLock_LL)(ptid));
+-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
+-
+-   if (flags & VKI_CLONE_SETTLS) {
+-      if (debug)
+-         VG_(printf)("clone child has SETTLS: tls at %#lx\n", child_tls);
+-      ctst->arch.vex.guest_GPR2 = child_tls;
+-   }
+-
+-   flags &= ~VKI_CLONE_SETTLS;
+-
+-   /* start the thread with everything blocked */
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
+-
+-   /* Create the new thread */
+-   word64 = do_syscall_clone_ppc32_linux(
+-               ML_(start_thread_NORETURN), stack, flags, &VG_(threads)[ctid],
+-               child_tidptr, parent_tidptr, NULL
+-            );
+-   /* High half word64 is syscall return value.  Low half is
+-      the entire CR, from which we need to extract CR0.SO. */
+-   /* VG_(printf)("word64 = 0x%llx\n", word64); */
+-   res = VG_(mk_SysRes_ppc32_linux)( 
+-            /*val*/(UInt)(word64 >> 32), 
+-            /*errflag*/ (((UInt)word64) >> 28) & 1 
+-         );
+-
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
+-
+-  out:
+-   if (sr_isError(res)) {
+-      /* clone failed */
+-      VG_(cleanup_thread)(&ctst->arch);
+-      ctst->status = VgTs_Empty;
+-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
+-      VG_TRACK( pre_thread_ll_exit, ctid );
+-   }
+-
+-   return res;
+-}
+-
+-
+ 
+ /* ---------------------------------------------------------------------
+    More thread stuff
+@@ -364,16 +218,6 @@ void VG_(cleanup_thread) ( ThreadArchState* arch )
+ {
+ }  
+ 
+-void setup_child ( /*OUT*/ ThreadArchState *child,
+-                   /*IN*/  ThreadArchState *parent )
+-{
+-   /* We inherit our parent's guest state. */
+-   child->vex = parent->vex;
+-   child->vex_shadow1 = parent->vex_shadow1;
+-   child->vex_shadow2 = parent->vex_shadow2;
+-}
+-
+-
+ /* ---------------------------------------------------------------------
+    PRE/POST wrappers for ppc32/Linux-specific syscalls
+    ------------------------------------------------------------------ */
+@@ -393,7 +237,6 @@ DECL_TEMPLATE(ppc32_linux, sys_stat64);
+ DECL_TEMPLATE(ppc32_linux, sys_lstat64);
+ DECL_TEMPLATE(ppc32_linux, sys_fstatat64);
+ DECL_TEMPLATE(ppc32_linux, sys_fstat64);
+-DECL_TEMPLATE(ppc32_linux, sys_clone);
+ DECL_TEMPLATE(ppc32_linux, sys_sigreturn);
+ DECL_TEMPLATE(ppc32_linux, sys_rt_sigreturn);
+ DECL_TEMPLATE(ppc32_linux, sys_sigsuspend);
+@@ -530,91 +373,6 @@ POST(sys_fstat64)
+ //..    }
+ //.. }
+ 
+-PRE(sys_clone)
+-{
+-   UInt cloneflags;
+-
+-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
+-   PRE_REG_READ5(int, "clone",
+-                 unsigned long, flags,
+-                 void *,        child_stack,
+-                 int *,         parent_tidptr,
+-                 void *,        child_tls,
+-                 int *,         child_tidptr);
+-
+-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
+-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
+-                                             VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
+-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof(Int), 
+-                                             VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-
+-   cloneflags = ARG1;
+-
+-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
+-      SET_STATUS_Failure( VKI_EINVAL );
+-      return;
+-   }
+-
+-   /* Only look at the flags we really care about */
+-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
+-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
+-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
+-      /* thread creation */
+-      SET_STATUS_from_SysRes(
+-         do_clone(tid,
+-                  ARG1,         /* flags */
+-                  (Addr)ARG2,   /* child SP */
+-                  (Int *)ARG3,  /* parent_tidptr */
+-                  (Int *)ARG5,  /* child_tidptr */
+-                  (Addr)ARG4)); /* child_tls */
+-      break;
+-
+-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
+-      /* FALLTHROUGH - assume vfork == fork */
+-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
+-
+-   case 0: /* plain fork */
+-      SET_STATUS_from_SysRes(
+-         ML_(do_fork_clone)(tid,
+-                       cloneflags,      /* flags */
+-                       (Int *)ARG3,     /* parent_tidptr */
+-                       (Int *)ARG5));   /* child_tidptr */
+-      break;
+-
+-   default:
+-      /* should we just ENOSYS? */
+-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
+-      VG_(message)(Vg_UserMsg, "\n");
+-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
+-      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
+-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
+-      VG_(unimplemented)
+-         ("Valgrind does not support general clone().");
+-   }
+-
+-   if (SUCCESS) {
+-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
+-         POST_MEM_WRITE(ARG3, sizeof(Int));
+-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
+-         POST_MEM_WRITE(ARG5, sizeof(Int));
+-
+-      /* Thread creation was successful; let the child have the chance
+-         to run */
+-      *flags |= SfYieldAfter;
+-   }
+-}
+-
+ PRE(sys_sigreturn)
+ {
+    /* See comments on PRE(sys_rt_sigreturn) in syswrap-amd64-linux.c for
+@@ -999,7 +757,7 @@ static SyscallTableEntry syscall_table[] = {
+    GENX_(__NR_fsync,             sys_fsync),             // 118
+    PLAX_(__NR_sigreturn,         sys_sigreturn),         // 119 ?/Linux
+ //.. 
+-   PLAX_(__NR_clone,             sys_clone),             // 120
++   LINX_(__NR_clone,             sys_clone),             // 120
+ //..    //   (__NR_setdomainname,     sys_setdomainname),     // 121 */*(?)
+    GENXY(__NR_uname,             sys_newuname),          // 122
+ //..    PLAX_(__NR_modify_ldt,        sys_modify_ldt),        // 123
+diff --git a/coregrind/m_syswrap/syswrap-ppc64-linux.c b/coregrind/m_syswrap/syswrap-ppc64-linux.c
+index 1ae4454..f90140d 100644
+--- a/coregrind/m_syswrap/syswrap-ppc64-linux.c
++++ b/coregrind/m_syswrap/syswrap-ppc64-linux.c
+@@ -209,14 +209,7 @@ asm(
+ #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
+ #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
+ 
+-extern
+-ULong do_syscall_clone_ppc64_linux ( Word (*fn)(void *), 
+-                                     void* stack, 
+-                                     Int   flags, 
+-                                     void* arg,
+-                                     Int*  child_tid, 
+-                                     Int*  parent_tid, 
+-                                     void/*vki_modify_ldt_t*/ * );
++// See priv_syswrap-linux.h for arg profile.
+ asm(
+ #if defined(VGP_ppc64be_linux)
+ "   .align   2\n"
+@@ -366,148 +359,6 @@ asm(
+ #undef __NR_CLONE
+ #undef __NR_EXIT
+ 
+-// forward declarations
+-static void setup_child ( ThreadArchState*, ThreadArchState* );
+-
+-/* 
+-   When a client clones, we need to keep track of the new thread.  This means:
+-   1. allocate a ThreadId+ThreadState+stack for the thread
+-
+-   2. initialize the thread's new VCPU state
+-
+-   3. create the thread using the same args as the client requested,
+-   but using the scheduler entrypoint for IP, and a separate stack
+-   for SP.
+- */
+-static SysRes do_clone ( ThreadId ptid, 
+-                         UInt flags, Addr sp, 
+-                         Int *parent_tidptr, 
+-                         Int *child_tidptr, 
+-                         Addr child_tls)
+-{
+-   const Bool debug = False;
+-
+-   ThreadId     ctid = VG_(alloc_ThreadState)();
+-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
+-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
+-   ULong        word64;
+-   UWord*       stack;
+-   SysRes       res;
+-   vki_sigset_t blockall, savedmask;
+-
+-   VG_(sigfillset)(&blockall);
+-
+-   vg_assert(VG_(is_running_thread)(ptid));
+-   vg_assert(VG_(is_valid_tid)(ctid));
+-
+-   stack = (UWord*)ML_(allocstack)(ctid);
+-   if (stack == NULL) {
+-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
+-      goto out;
+-   }
+-
+-//?   /* make a stack frame */
+-//?   stack -= 16;
+-//?   *(UWord *)stack = 0;
+-
+-
+-   /* Copy register state
+-
+-      Both parent and child return to the same place, and the code
+-      following the clone syscall works out which is which, so we
+-      don't need to worry about it.
+-
+-      The parent gets the child's new tid returned from clone, but the
+-      child gets 0.
+-
+-      If the clone call specifies a NULL SP for the new thread, then
+-      it actually gets a copy of the parent's SP.
+-
+-      The child's TLS register (r2) gets set to the tlsaddr argument
+-      if the CLONE_SETTLS flag is set.
+-   */
+-   setup_child( &ctst->arch, &ptst->arch );
+-
+-   /* Make sys_clone appear to have returned Success(0) in the
+-      child. */
+-   { UInt old_cr = LibVEX_GuestPPC64_get_CR( &ctst->arch.vex );
+-     /* %r3 = 0 */
+-     ctst->arch.vex.guest_GPR3 = 0;
+-     /* %cr0.so = 0 */
+-     LibVEX_GuestPPC64_put_CR( old_cr & ~(1<<28), &ctst->arch.vex );
+-   }
+-
+-   if (sp != 0)
+-      ctst->arch.vex.guest_GPR1 = sp;
+-
+-   ctst->os_state.parent = ptid;
+-
+-   /* inherit signal mask */
+-   ctst->sig_mask = ptst->sig_mask;
+-   ctst->tmp_sig_mask = ptst->sig_mask;
+-
+-   /* Start the child with its threadgroup being the same as the
+-      parent's.  This is so that any exit_group calls that happen
+-      after the child is created but before it sets its
+-      os_state.threadgroup field for real (in thread_wrapper in
+-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
+-      a race condition in which the thread is unkillable (via
+-      exit_group) because its threadgroup is not set.  The race window
+-      is probably only a few hundred or a few thousand cycles long.
+-      See #226116. */
+-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
+-
+-   ML_(guess_and_register_stack) (sp, ctst);
+-
+-   /* Assume the clone will succeed, and tell any tool that wants to
+-      know that this thread has come into existence.  If the clone
+-      fails, we'll send out a ll_exit notification for it at the out:
+-      label below, to clean up. */
+-   vg_assert(VG_(owns_BigLock_LL)(ptid));
+-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
+-
+-   if (flags & VKI_CLONE_SETTLS) {
+-      if (debug)
+-         VG_(printf)("clone child has SETTLS: tls at %#lx\n", child_tls);
+-      ctst->arch.vex.guest_GPR13 = child_tls;
+-   }
+-
+-   flags &= ~VKI_CLONE_SETTLS;
+-
+-   /* start the thread with everything blocked */
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
+-
+-   /* Create the new thread */
+-   word64 = do_syscall_clone_ppc64_linux(
+-               ML_(start_thread_NORETURN),
+-               stack, flags, &VG_(threads)[ctid],
+-               child_tidptr, parent_tidptr, NULL
+-            );
+-
+-   /* Low half word64 is syscall return value.  Hi half is
+-      the entire CR, from which we need to extract CR0.SO. */
+-   /* VG_(printf)("word64 = 0x%llx\n", word64); */
+-   res = VG_(mk_SysRes_ppc64_linux)( 
+-            /*val*/(UInt)(word64 & 0xFFFFFFFFULL), 
+-            /*errflag*/ (UInt)((word64 >> (32+28)) & 1)
+-         );
+-
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
+-
+-  out:
+-   if (sr_isError(res)) {
+-      /* clone failed */
+-      VG_(cleanup_thread)(&ctst->arch);
+-      ctst->status = VgTs_Empty;
+-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
+-      VG_TRACK( pre_thread_ll_exit, ctid );
+-   }
+-
+-   return res;
+-}
+-
+-
+-
+ /* ---------------------------------------------------------------------
+    More thread stuff
+    ------------------------------------------------------------------ */
+@@ -516,16 +367,6 @@ void VG_(cleanup_thread) ( ThreadArchState* arch )
+ {
+ }  
+ 
+-void setup_child ( /*OUT*/ ThreadArchState *child,
+-                   /*IN*/  ThreadArchState *parent )
+-{
+-   /* We inherit our parent's guest state. */
+-   child->vex = parent->vex;
+-   child->vex_shadow1 = parent->vex_shadow1;
+-   child->vex_shadow2 = parent->vex_shadow2;
+-}
+-
+-
+ /* ---------------------------------------------------------------------
+    PRE/POST wrappers for ppc64/Linux-specific syscalls
+    ------------------------------------------------------------------ */
+@@ -544,7 +385,6 @@ DECL_TEMPLATE(ppc64_linux, sys_mmap);
+ //zz DECL_TEMPLATE(ppc64_linux, sys_stat64);
+ //zz DECL_TEMPLATE(ppc64_linux, sys_lstat64);
+ //zz DECL_TEMPLATE(ppc64_linux, sys_fstat64);
+-DECL_TEMPLATE(ppc64_linux, sys_clone);
+ //zz DECL_TEMPLATE(ppc64_linux, sys_sigreturn);
+ DECL_TEMPLATE(ppc64_linux, sys_rt_sigreturn);
+ DECL_TEMPLATE(ppc64_linux, sys_fadvise64);
+@@ -629,92 +469,6 @@ PRE(sys_mmap)
+ //zz   POST_MEM_WRITE( ARG2, sizeof(struct vki_stat64) );
+ //zz }
+ 
+-
+-PRE(sys_clone)
+-{
+-   UInt cloneflags;
+-
+-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
+-   PRE_REG_READ5(int, "clone",
+-                 unsigned long, flags,
+-                 void *,        child_stack,
+-                 int *,         parent_tidptr,
+-                 void *,        child_tls,
+-                 int *,         child_tidptr);
+-
+-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
+-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
+-                                             VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
+-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof(Int), 
+-                                             VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-
+-   cloneflags = ARG1;
+-
+-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
+-      SET_STATUS_Failure( VKI_EINVAL );
+-      return;
+-   }
+-
+-   /* Only look at the flags we really care about */
+-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
+-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
+-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
+-      /* thread creation */
+-      SET_STATUS_from_SysRes(
+-         do_clone(tid,
+-                  ARG1,         /* flags */
+-                  (Addr)ARG2,   /* child SP */
+-                  (Int *)ARG3,  /* parent_tidptr */
+-                  (Int *)ARG5,  /* child_tidptr */
+-                  (Addr)ARG4)); /* child_tls */
+-      break;
+-
+-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
+-      /* FALLTHROUGH - assume vfork == fork */
+-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
+-
+-   case 0: /* plain fork */
+-      SET_STATUS_from_SysRes(
+-         ML_(do_fork_clone)(tid,
+-                       cloneflags,      /* flags */
+-                       (Int *)ARG3,     /* parent_tidptr */
+-                       (Int *)ARG5));   /* child_tidptr */
+-      break;
+-
+-   default:
+-      /* should we just ENOSYS? */
+-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
+-      VG_(message)(Vg_UserMsg, "\n");
+-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
+-      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
+-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
+-      VG_(unimplemented)
+-         ("Valgrind does not support general clone().");
+-   }
+-
+-   if (SUCCESS) {
+-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
+-         POST_MEM_WRITE(ARG3, sizeof(Int));
+-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
+-         POST_MEM_WRITE(ARG5, sizeof(Int));
+-
+-      /* Thread creation was successful; let the child have the chance
+-         to run */
+-      *flags |= SfYieldAfter;
+-   }
+-}
+-
+ PRE(sys_fadvise64)
+ {
+    PRINT("sys_fadvise64 ( %ld, %ld, %lu, %ld )",  SARG1, SARG2, SARG3, SARG4);
+@@ -922,7 +676,7 @@ static SyscallTableEntry syscall_table[] = {
+    GENX_(__NR_fsync,             sys_fsync),              // 118
+ // _____(__NR_sigreturn,         sys_sigreturn),          // 119
+ 
+-   PLAX_(__NR_clone,             sys_clone),              // 120
++   LINX_(__NR_clone,             sys_clone),              // 120
+ // _____(__NR_setdomainname,     sys_setdomainname),      // 121
+    GENXY(__NR_uname,             sys_newuname),           // 122
+ // _____(__NR_modify_ldt,        sys_modify_ldt),         // 123
+diff --git a/coregrind/m_syswrap/syswrap-s390x-linux.c b/coregrind/m_syswrap/syswrap-s390x-linux.c
+index ebb8295..f596341 100644
+--- a/coregrind/m_syswrap/syswrap-s390x-linux.c
++++ b/coregrind/m_syswrap/syswrap-s390x-linux.c
+@@ -138,14 +138,7 @@ asm(
+ #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
+ #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
+ 
+-extern
+-ULong do_syscall_clone_s390x_linux ( void  *stack,
+-                                     ULong flags,
+-                                     Int   *parent_tid,
+-                                     Int   *child_tid,
+-                                     Addr  tlsaddr,
+-                                     Word (*fn)(void *),
+-                                     void  *arg);
++// See priv_syswrap-linux.h for arg profile.
+ asm(
+    "   .text\n"
+    "   .align  4\n"
+@@ -182,126 +175,6 @@ void VG_(cleanup_thread) ( ThreadArchState* arch )
+   /* only used on x86 for descriptor tables */
+ }
+ 
+-static void setup_child ( /*OUT*/ ThreadArchState *child,
+-                   /*IN*/  ThreadArchState *parent )
+-{
+-   /* We inherit our parent's guest state. */
+-   child->vex = parent->vex;
+-   child->vex_shadow1 = parent->vex_shadow1;
+-   child->vex_shadow2 = parent->vex_shadow2;
+-}
+-
+-
+-/*
+-   When a client clones, we need to keep track of the new thread.  This means:
+-   1. allocate a ThreadId+ThreadState+stack for the thread
+-
+-   2. initialize the thread's new VCPU state
+-
+-   3. create the thread using the same args as the client requested,
+-   but using the scheduler entrypoint for IP, and a separate stack
+-   for SP.
+- */
+-static SysRes do_clone ( ThreadId ptid,
+-                         Addr sp, ULong flags,
+-                         Int *parent_tidptr,
+-                         Int *child_tidptr,
+-                         Addr tlsaddr)
+-{
+-   static const Bool debug = False;
+-
+-   ThreadId     ctid = VG_(alloc_ThreadState)();
+-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
+-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
+-   UWord*       stack;
+-   SysRes       res;
+-   ULong        r2;
+-   vki_sigset_t blockall, savedmask;
+-
+-   VG_(sigfillset)(&blockall);
+-
+-   vg_assert(VG_(is_running_thread)(ptid));
+-   vg_assert(VG_(is_valid_tid)(ctid));
+-
+-   stack = (UWord*)ML_(allocstack)(ctid);
+-   if (stack == NULL) {
+-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
+-      goto out;
+-   }
+-
+-   /* Copy register state
+-
+-      Both parent and child return to the same place, and the code
+-      following the clone syscall works out which is which, so we
+-      don't need to worry about it.
+-
+-      The parent gets the child's new tid returned from clone, but the
+-      child gets 0.
+-
+-      If the clone call specifies a NULL sp for the new thread, then
+-      it actually gets a copy of the parent's sp.
+-   */
+-   setup_child( &ctst->arch, &ptst->arch );
+-
+-   /* Make sys_clone appear to have returned Success(0) in the
+-      child. */
+-   ctst->arch.vex.guest_r2 = 0;
+-
+-   if (sp != 0)
+-      ctst->arch.vex.guest_SP = sp;
+-
+-   ctst->os_state.parent = ptid;
+-
+-   /* inherit signal mask */
+-   ctst->sig_mask = ptst->sig_mask;
+-   ctst->tmp_sig_mask = ptst->sig_mask;
+-
+-   /* have the parents thread group */
+-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
+-
+-   ML_(guess_and_register_stack) (sp, ctst);
+-
+-   /* Assume the clone will succeed, and tell any tool that wants to
+-      know that this thread has come into existence.  If the clone
+-      fails, we'll send out a ll_exit notification for it at the out:
+-      label below, to clean up. */
+-   vg_assert(VG_(owns_BigLock_LL)(ptid));
+-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
+-
+-   if (flags & VKI_CLONE_SETTLS) {
+-      if (debug)
+-	 VG_(printf)("clone child has SETTLS: tls at %#lx\n", tlsaddr);
+-      ctst->arch.vex.guest_a0 = (UInt) (tlsaddr >> 32);
+-      ctst->arch.vex.guest_a1 = (UInt) tlsaddr;
+-   }
+-   flags &= ~VKI_CLONE_SETTLS;
+-
+-   /* start the thread with everything blocked */
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
+-
+-   /* Create the new thread */
+-   r2 = do_syscall_clone_s390x_linux(
+-            stack, flags, parent_tidptr, child_tidptr, tlsaddr,
+-            ML_(start_thread_NORETURN), &VG_(threads)[ctid]);
+-
+-   res = VG_(mk_SysRes_s390x_linux)( r2 );
+-
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
+-
+-  out:
+-   if (sr_isError(res)) {
+-      /* clone failed */
+-      ctst->status = VgTs_Empty;
+-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
+-      VG_TRACK( pre_thread_ll_exit, ctid );
+-   }
+-
+-   return res;
+-
+-}
+-
+-
+-
+ /* ---------------------------------------------------------------------
+    PRE/POST wrappers for s390x/Linux-specific syscalls
+    ------------------------------------------------------------------ */
+@@ -317,7 +190,6 @@ static SysRes do_clone ( ThreadId ptid,
+ 
+ DECL_TEMPLATE(s390x_linux, sys_ptrace);
+ DECL_TEMPLATE(s390x_linux, sys_mmap);
+-DECL_TEMPLATE(s390x_linux, sys_clone);
+ DECL_TEMPLATE(s390x_linux, sys_sigreturn);
+ DECL_TEMPLATE(s390x_linux, sys_rt_sigreturn);
+ DECL_TEMPLATE(s390x_linux, sys_fadvise64);
+@@ -452,99 +324,6 @@ PRE(sys_mmap)
+    SET_STATUS_from_SysRes(r);
+ }
+ 
+-PRE(sys_clone)
+-{
+-   UInt cloneflags;
+-
+-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4, ARG5);
+-   PRE_REG_READ2(int, "clone",
+-                 void *,        child_stack,
+-                 unsigned long, flags);
+-
+-   if (ARG2 & VKI_CLONE_PARENT_SETTID) {
+-      if (VG_(tdict).track_pre_reg_read)
+-         PRA3("clone(parent_tidptr)", int *, parent_tidptr);
+-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int),
+-                                             VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-   if (ARG2 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
+-      if (VG_(tdict).track_pre_reg_read)
+-         PRA4("clone(child_tidptr)", int *, child_tidptr);
+-      PRE_MEM_WRITE("clone(child_tidptr)", ARG4, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG4, sizeof(Int),
+-                                             VKI_PROT_WRITE)) {
+-         SET_STATUS_Failure( VKI_EFAULT );
+-         return;
+-      }
+-   }
+-
+-   /* The kernel simply copies reg6 (ARG5) into AR0 and AR1, no checks */
+-   if (ARG2 & VKI_CLONE_SETTLS) {
+-      if (VG_(tdict).track_pre_reg_read) {
+-         PRA5("clone", Addr, tlsinfo);
+-      }
+-   }
+-
+-   cloneflags = ARG2;
+-
+-   if (!ML_(client_signal_OK)(ARG2 & VKI_CSIGNAL)) {
+-      SET_STATUS_Failure( VKI_EINVAL );
+-      return;
+-   }
+-
+-   /* Only look at the flags we really care about */
+-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS
+-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
+-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
+-      /* thread creation */
+-      SET_STATUS_from_SysRes(
+-         do_clone(tid,
+-                  (Addr)ARG1,   /* child SP */
+-                  ARG2,         /* flags */
+-                  (Int *)ARG3,  /* parent_tidptr */
+-                  (Int *)ARG4, /* child_tidptr */
+-                  (Addr)ARG5)); /*  tlsaddr */
+-      break;
+-
+-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
+-      /* FALLTHROUGH - assume vfork == fork */
+-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
+-
+-   case 0: /* plain fork */
+-      SET_STATUS_from_SysRes(
+-         ML_(do_fork_clone)(tid,
+-                       cloneflags,      /* flags */
+-                       (Int *)ARG3,     /* parent_tidptr */
+-                       (Int *)ARG4));   /* child_tidptr */
+-      break;
+-
+-   default:
+-      /* should we just ENOSYS? */
+-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG2);
+-      VG_(message)(Vg_UserMsg, "\n");
+-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
+-      VG_(message)(Vg_UserMsg, " - via a threads library (NPTL)\n");
+-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
+-      VG_(unimplemented)
+-         ("Valgrind does not support general clone().");
+-   }
+-
+-   if (SUCCESS) {
+-      if (ARG2 & VKI_CLONE_PARENT_SETTID)
+-         POST_MEM_WRITE(ARG3, sizeof(Int));
+-      if (ARG2 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
+-         POST_MEM_WRITE(ARG4, sizeof(Int));
+-
+-      /* Thread creation was successful; let the child have the chance
+-         to run */
+-      *flags |= SfYieldAfter;
+-   }
+-}
+-
+ PRE(sys_sigreturn)
+ {
+    ThreadState* tst;
+@@ -775,7 +554,7 @@ static SyscallTableEntry syscall_table[] = {
+    GENX_(__NR_fsync,  sys_fsync),                                     // 118
+    PLAX_(__NR_sigreturn, sys_sigreturn),                              // 119
+ 
+-   PLAX_(__NR_clone,  sys_clone),                                     // 120
++   LINX_(__NR_clone,  sys_clone),                                     // 120
+ // ?????(__NR_setdomainname, ),                                       // 121
+    GENXY(__NR_uname, sys_newuname),                                   // 122
+    GENX_(123, sys_ni_syscall), /* unimplemented (by the kernel) */    // 123
+diff --git a/coregrind/m_syswrap/syswrap-tilegx-linux.c b/coregrind/m_syswrap/syswrap-tilegx-linux.c
+index 7501b20..05d81e8 100644
+--- a/coregrind/m_syswrap/syswrap-tilegx-linux.c
++++ b/coregrind/m_syswrap/syswrap-tilegx-linux.c
+@@ -224,14 +224,7 @@ void ML_(call_on_new_stack_0_1) (Addr stack, Addr retaddr,
+ #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
+ #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
+ 
+-Long do_syscall_clone_tilegx_linux ( Word (*fn) (void *),  //r0
+-                                     void *stack,          //r1
+-                                     Long flags,           //r2
+-                                     void *arg,            //r3
+-                                     Long * child_tid,     //r4
+-                                     Long * parent_tid,    //r5
+-                                     Long   tls );         //r6
+-    /*
++   /*
+       stack
+       high -> 4  r29
+       3
+@@ -239,6 +232,7 @@ Long do_syscall_clone_tilegx_linux ( Word (*fn) (void *),  //r0
+       1  r10
+       low  -> 0  lr    <- sp
+     */
++// See priv_syswrap-linux.h for arg profile.
+      asm (
+        ".text\n"
+        "   .globl   do_syscall_clone_tilegx_linux\n"
+@@ -315,101 +309,6 @@ Long do_syscall_clone_tilegx_linux ( Word (*fn) (void *),  //r0
+ #undef __NR_EXIT
+ 
+ // forward declarations
+-static void setup_child ( ThreadArchState *, ThreadArchState * );
+-static SysRes sys_set_tls ( ThreadId tid, Addr tlsptr );
+- /*
+-   When a client clones, we need to keep track of the new thread.  This means:
+-   1. allocate a ThreadId+ThreadState+stack for the thread
+-   2. initialize the thread's new VCPU state
+-   3. create the thread using the same args as the client requested,
+-   but using the scheduler entrypoint for IP, and a separate stack
+-   for SP.
+- */
+-static SysRes do_clone ( ThreadId ptid,
+-                         Long flags, Addr sp,
+-                         Long * parent_tidptr,
+-                         Long * child_tidptr,
+-                         Addr child_tls )
+-{
+-  const Bool debug = False;
+-  ThreadId ctid = VG_ (alloc_ThreadState) ();
+-  ThreadState * ptst = VG_ (get_ThreadState) (ptid);
+-  ThreadState * ctst = VG_ (get_ThreadState) (ctid);
+-  Long ret = 0;
+-  Long * stack;
+-  SysRes res;
+-  vki_sigset_t blockall, savedmask;
+-
+-  VG_ (sigfillset) (&blockall);
+-  vg_assert (VG_ (is_running_thread) (ptid));
+-  vg_assert (VG_ (is_valid_tid) (ctid));
+-  stack = (Long *) ML_ (allocstack) (ctid);
+-  if (stack == NULL) {
+-    res = VG_ (mk_SysRes_Error) (VKI_ENOMEM);
+-    goto out;
+-  }
+-  setup_child (&ctst->arch, &ptst->arch);
+-
+-  /* On TILEGX we need to set r0 and r3 to zero */
+-  ctst->arch.vex.guest_r0 = 0;
+-  ctst->arch.vex.guest_r3 = 0;
+-  if (sp != 0)
+-    ctst->arch.vex.guest_r54 = sp;
+-
+-  ctst->os_state.parent = ptid;
+-  ctst->sig_mask = ptst->sig_mask;
+-  ctst->tmp_sig_mask = ptst->sig_mask;
+-
+-  /* Start the child with its threadgroup being the same as the
+-     parent's.  This is so that any exit_group calls that happen
+-     after the child is created but before it sets its
+-     os_state.threadgroup field for real (in thread_wrapper in
+-     syswrap-linux.c), really kill the new thread.  a.k.a this avoids
+-     a race condition in which the thread is unkillable (via
+-     exit_group) because its threadgroup is not set.  The race window
+-     is probably only a few hundred or a few thousand cycles long.
+-     See #226116. */
+-
+-  ctst->os_state.threadgroup = ptst->os_state.threadgroup;
+-  ML_(guess_and_register_stack) (sp, ctst);
+-
+-  VG_TRACK (pre_thread_ll_create, ptid, ctid);
+-  if (flags & VKI_CLONE_SETTLS) {
+-    if (debug)
+-      VG_(printf)("clone child has SETTLS: tls at %#lx\n", child_tls);
+-    ctst->arch.vex.guest_r53 = child_tls;
+-    res = sys_set_tls(ctid, child_tls);
+-    if (sr_isError(res))
+-      goto out;
+-  }
+-
+-  flags &= ~VKI_CLONE_SETTLS;
+-  VG_ (sigprocmask) (VKI_SIG_SETMASK, &blockall, &savedmask);
+-  /* Create the new thread */
+-  ret = do_syscall_clone_tilegx_linux (ML_ (start_thread_NORETURN),
+-                                       stack, flags, &VG_ (threads)[ctid],
+-                                       child_tidptr, parent_tidptr,
+-                                       (Long)NULL /*child_tls*/);
+-
+-  /* High half word64 is syscall return value. */
+-  if (debug)
+-    VG_(printf)("ret: 0x%llx\n", (ULong)ret);
+-
+-  res = VG_(mk_SysRes_tilegx_linux) (/*val */ ret);
+-
+-  VG_ (sigprocmask) (VKI_SIG_SETMASK, &savedmask, NULL);
+-
+- out:
+-  if (sr_isError (res)) {
+-    VG_(cleanup_thread) (&ctst->arch);
+-    ctst->status = VgTs_Empty;
+-    VG_TRACK (pre_thread_ll_exit, ctid);
+-  }
+-  ptst->arch.vex.guest_r0 = 0;
+-
+-  return res;
+-}
+-
+ extern Addr do_brk ( Addr newbrk );
+ 
+ extern
+@@ -428,23 +327,6 @@ extern Bool linux_kernel_2_6_22(void);
+ void
+ VG_ (cleanup_thread) ( ThreadArchState * arch ) { }
+ 
+-void
+-setup_child ( /*OUT*/ ThreadArchState * child,
+-              /*IN*/ ThreadArchState * parent )
+-{
+-  /* We inherit our parent's guest state. */
+-  child->vex = parent->vex;
+-  child->vex_shadow1 = parent->vex_shadow1;
+-  child->vex_shadow2 = parent->vex_shadow2;
+-}
+-
+-SysRes sys_set_tls ( ThreadId tid, Addr tlsptr )
+-{
+-  VG_(threads)[tid].arch.vex.guest_r53 = tlsptr;
+-  return VG_(mk_SysRes_Success)( 0 );
+-}
+-
+-
+ /* ---------------------------------------------------------------------
+    PRE/POST wrappers for tilegx/Linux-specific syscalls
+    ------------------------------------------------------------------ */
+@@ -457,7 +339,6 @@ SysRes sys_set_tls ( ThreadId tid, Addr tlsptr )
+    aren't visible outside this file, but that requires even more macro
+    magic. */
+ 
+-DECL_TEMPLATE (tilegx_linux, sys_clone);
+ DECL_TEMPLATE (tilegx_linux, sys_rt_sigreturn);
+ DECL_TEMPLATE (tilegx_linux, sys_socket);
+ DECL_TEMPLATE (tilegx_linux, sys_setsockopt);
+@@ -496,94 +377,6 @@ DECL_TEMPLATE (tilegx_linux, sys_syscall184);
+ DECL_TEMPLATE (tilegx_linux, sys_cacheflush);
+ DECL_TEMPLATE (tilegx_linux, sys_set_dataplane);
+ 
+-PRE(sys_clone)
+-{
+-  ULong cloneflags;
+-
+-  PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
+-  PRE_REG_READ5(int, "clone",
+-                unsigned long, flags,
+-                void *, child_stack,
+-                int *, parent_tidptr,
+-                int *, child_tidptr,
+-                void *, tlsaddr);
+-
+-  if (ARG1 & VKI_CLONE_PARENT_SETTID) {
+-    PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
+-    if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), VKI_PROT_WRITE)) {
+-      SET_STATUS_Failure( VKI_EFAULT );
+-      return;
+-    }
+-  }
+-  if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
+-    PRE_MEM_WRITE("clone(child_tidptr)", ARG4, sizeof(Int));
+-    if (!VG_(am_is_valid_for_client)(ARG4, sizeof(Int), VKI_PROT_WRITE)) {
+-      SET_STATUS_Failure( VKI_EFAULT );
+-      return;
+-    }
+-  }
+-
+-  cloneflags = ARG1;
+-
+-  if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
+-    SET_STATUS_Failure( VKI_EINVAL );
+-    return;
+-  }
+-
+-  /* Only look at the flags we really care about */
+-  switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS
+-                        | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
+-  case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
+-    /* thread creation */
+-    SET_STATUS_from_SysRes(
+-      do_clone(tid,
+-               ARG1,          /* flags */
+-               (Addr)ARG2,    /* child ESP */
+-               (Long *)ARG3,  /* parent_tidptr */
+-               (Long *)ARG4,  /* child_tidptr */
+-               (Addr)ARG5));  /* set_tls */
+-    break;
+-
+-  case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
+-    /* FALLTHROUGH - assume vfork == fork */
+-    cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
+-
+-  case 0: /* plain fork */
+-    SET_STATUS_from_SysRes(
+-      ML_(do_fork_clone)(tid,
+-                         cloneflags,      /* flags */
+-                         (Int *)ARG3,     /* parent_tidptr */
+-                         (Int *)ARG4));   /* child_tidptr */
+-    break;
+-
+-  default:
+-    /* should we just ENOSYS? */
+-    VG_(message)(Vg_UserMsg,
+-                 "Unsupported clone() flags: 0x%lx\n", ARG1);
+-    VG_(message)(Vg_UserMsg,
+-                 "\n");
+-    VG_(message)(Vg_UserMsg,
+-                 "The only supported clone() uses are:\n");
+-    VG_(message)(Vg_UserMsg,
+-                 " - via a threads library (LinuxThreads or NPTL)\n");
+-    VG_(message)(Vg_UserMsg,
+-                 " - via the implementation of fork or vfork\n");
+-    VG_(unimplemented)
+-      ("Valgrind does not support general clone().");
+-  }
+-
+-  if (SUCCESS) {
+-    if (ARG1 & VKI_CLONE_PARENT_SETTID)
+-      POST_MEM_WRITE(ARG3, sizeof(Int));
+-    if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
+-      POST_MEM_WRITE(ARG4, sizeof(Int));
+-
+-    /* Thread creation was successful; let the child have the chance
+-       to run */
+-    *flags |= SfYieldAfter;
+-  }
+-}
+-
+ PRE(sys_rt_sigreturn)
+ {
+   /* This isn't really a syscall at all - it's a misuse of the
+@@ -1344,7 +1137,7 @@ static SyscallTableEntry syscall_table[] = {
+   LINX_(__NR_add_key,           sys_add_key),              // 217
+   LINX_(__NR_request_key,       sys_request_key),          // 218
+   LINXY(__NR_keyctl,            sys_keyctl),               // 219
+-  PLAX_(__NR_clone,             sys_clone),                // 220
++  LINX_(__NR_clone,             sys_clone),                // 220
+   GENX_(__NR_execve,            sys_execve),               // 221
+   PLAX_(__NR_mmap,              sys_mmap),                 // 222
+   GENXY(__NR_mprotect,          sys_mprotect),             // 226
+diff --git a/coregrind/m_syswrap/syswrap-x86-linux.c b/coregrind/m_syswrap/syswrap-x86-linux.c
+index 0e5af98..f8c4eb4 100644
+--- a/coregrind/m_syswrap/syswrap-x86-linux.c
++++ b/coregrind/m_syswrap/syswrap-x86-linux.c
+@@ -131,14 +131,7 @@ asm(
+ #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
+ #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
+ 
+-extern
+-Int do_syscall_clone_x86_linux ( Word (*fn)(void *), 
+-                                 void* stack, 
+-                                 Int   flags, 
+-                                 void* arg,
+-                                 Int*  child_tid, 
+-                                 Int*  parent_tid, 
+-                                 vki_modify_ldt_t * );
++// See priv_syswrap-linux.h for arg profile.
+ asm(
+ ".text\n"
+ ".globl do_syscall_clone_x86_linux\n"
+@@ -191,141 +184,6 @@ asm(
+ #undef __NR_EXIT
+ 
+ 
+-// forward declarations
+-static void setup_child ( ThreadArchState*, ThreadArchState*, Bool );
+-static SysRes sys_set_thread_area ( ThreadId, vki_modify_ldt_t* );
+-
+-/* 
+-   When a client clones, we need to keep track of the new thread.  This means:
+-   1. allocate a ThreadId+ThreadState+stack for the thread
+-
+-   2. initialize the thread's new VCPU state
+-
+-   3. create the thread using the same args as the client requested,
+-   but using the scheduler entrypoint for EIP, and a separate stack
+-   for ESP.
+- */
+-static SysRes do_clone ( ThreadId ptid, 
+-                         UInt flags, Addr esp, 
+-                         Int* parent_tidptr, 
+-                         Int* child_tidptr, 
+-                         vki_modify_ldt_t *tlsinfo)
+-{
+-   static const Bool debug = False;
+-
+-   ThreadId     ctid = VG_(alloc_ThreadState)();
+-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
+-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
+-   UWord*       stack;
+-   SysRes       res;
+-   Int          eax;
+-   vki_sigset_t blockall, savedmask;
+-
+-   VG_(sigfillset)(&blockall);
+-
+-   vg_assert(VG_(is_running_thread)(ptid));
+-   vg_assert(VG_(is_valid_tid)(ctid));
+-
+-   stack = (UWord*)ML_(allocstack)(ctid);
+-   if (stack == NULL) {
+-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
+-      goto out;
+-   }
+-
+-   /* Copy register state
+-
+-      Both parent and child return to the same place, and the code
+-      following the clone syscall works out which is which, so we
+-      don't need to worry about it.
+-
+-      The parent gets the child's new tid returned from clone, but the
+-      child gets 0.
+-
+-      If the clone call specifies a NULL esp for the new thread, then
+-      it actually gets a copy of the parent's esp.
+-   */
+-   /* Note: the clone call done by the Quadrics Elan3 driver specifies
+-      clone flags of 0xF00, and it seems to rely on the assumption
+-      that the child inherits a copy of the parent's GDT.  
+-      setup_child takes care of setting that up. */
+-   setup_child( &ctst->arch, &ptst->arch, True );
+-
+-   /* Make sys_clone appear to have returned Success(0) in the
+-      child. */
+-   ctst->arch.vex.guest_EAX = 0;
+-
+-   if (esp != 0)
+-      ctst->arch.vex.guest_ESP = esp;
+-
+-   ctst->os_state.parent = ptid;
+-
+-   /* inherit signal mask */
+-   ctst->sig_mask     = ptst->sig_mask;
+-   ctst->tmp_sig_mask = ptst->sig_mask;
+-
+-   /* Start the child with its threadgroup being the same as the
+-      parent's.  This is so that any exit_group calls that happen
+-      after the child is created but before it sets its
+-      os_state.threadgroup field for real (in thread_wrapper in
+-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
+-      a race condition in which the thread is unkillable (via
+-      exit_group) because its threadgroup is not set.  The race window
+-      is probably only a few hundred or a few thousand cycles long.
+-      See #226116. */
+-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
+-
+-   ML_(guess_and_register_stack) (esp, ctst);
+-   
+-   /* Assume the clone will succeed, and tell any tool that wants to
+-      know that this thread has come into existence.  We cannot defer
+-      it beyond this point because sys_set_thread_area, just below,
+-      causes tCheck to assert by making references to the new ThreadId
+-      if we don't state the new thread exists prior to that point.
+-      If the clone fails, we'll send out a ll_exit notification for it
+-      at the out: label below, to clean up. */
+-   vg_assert(VG_(owns_BigLock_LL)(ptid));
+-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
+-
+-   if (flags & VKI_CLONE_SETTLS) {
+-      if (debug)
+-	 VG_(printf)("clone child has SETTLS: tls info at %p: idx=%u "
+-                     "base=%#lx limit=%x; esp=%#x fs=%x gs=%x\n",
+-		     tlsinfo, tlsinfo->entry_number, 
+-                     tlsinfo->base_addr, tlsinfo->limit,
+-		     ptst->arch.vex.guest_ESP,
+-		     ctst->arch.vex.guest_FS, ctst->arch.vex.guest_GS);
+-      res = sys_set_thread_area(ctid, tlsinfo);
+-      if (sr_isError(res))
+-	 goto out;
+-   }
+-
+-   flags &= ~VKI_CLONE_SETTLS;
+-
+-   /* start the thread with everything blocked */
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
+-
+-   /* Create the new thread */
+-   eax = do_syscall_clone_x86_linux(
+-            ML_(start_thread_NORETURN), stack, flags, &VG_(threads)[ctid],
+-            child_tidptr, parent_tidptr, NULL
+-         );
+-   res = VG_(mk_SysRes_x86_linux)( eax );
+-
+-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
+-
+-  out:
+-   if (sr_isError(res)) {
+-      /* clone failed */
+-      VG_(cleanup_thread)(&ctst->arch);
+-      ctst->status = VgTs_Empty;
+-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
+-      VG_TRACK( pre_thread_ll_exit, ctid );
+-   }
+-
+-   return res;
+-}
+-
+-
+ /* ---------------------------------------------------------------------
+    LDT/GDT simulation
+    ------------------------------------------------------------------ */
+@@ -630,7 +488,7 @@ static SysRes sys_modify_ldt ( ThreadId tid,
+ }
+ 
+ 
+-static SysRes sys_set_thread_area ( ThreadId tid, vki_modify_ldt_t* info )
++SysRes ML_(x86_sys_set_thread_area) ( ThreadId tid, vki_modify_ldt_t* info )
+ {
+    Int                  idx;
+    VexGuestX86SegDescr* gdt;
+@@ -738,15 +596,9 @@ void VG_(cleanup_thread) ( ThreadArchState* arch )
+ }  
+ 
+ 
+-static void setup_child ( /*OUT*/ ThreadArchState *child, 
+-                          /*IN*/  ThreadArchState *parent,
+-                          Bool inherit_parents_GDT )
++void ML_(x86_setup_LDT_GDT) ( /*OUT*/ ThreadArchState *child, 
++                              /*IN*/  ThreadArchState *parent )
+ {
+-   /* We inherit our parent's guest state. */
+-   child->vex = parent->vex;
+-   child->vex_shadow1 = parent->vex_shadow1;
+-   child->vex_shadow2 = parent->vex_shadow2;
+-
+    /* We inherit our parent's LDT. */
+    if (parent->vex.guest_LDT == (HWord)NULL) {
+       /* We hope this is the common case. */
+@@ -763,7 +615,7 @@ static void setup_child ( /*OUT*/ ThreadArchState *child,
+       only). */
+    child->vex.guest_GDT = (HWord)NULL;
+ 
+-   if (inherit_parents_GDT && parent->vex.guest_GDT != (HWord)NULL) {
++   if (parent->vex.guest_GDT != (HWord)NULL) {
+       child->vex.guest_GDT = (HWord)alloc_zeroed_x86_GDT();
+       copy_GDT_from_to( (VexGuestX86SegDescr*)parent->vex.guest_GDT,
+                         (VexGuestX86SegDescr*)child->vex.guest_GDT );
+@@ -787,7 +639,6 @@ DECL_TEMPLATE(x86_linux, sys_stat64);
+ DECL_TEMPLATE(x86_linux, sys_fstatat64);
+ DECL_TEMPLATE(x86_linux, sys_fstat64);
+ DECL_TEMPLATE(x86_linux, sys_lstat64);
+-DECL_TEMPLATE(x86_linux, sys_clone);
+ DECL_TEMPLATE(x86_linux, old_mmap);
+ DECL_TEMPLATE(x86_linux, sys_mmap2);
+ DECL_TEMPLATE(x86_linux, sys_sigreturn);
+@@ -835,137 +686,6 @@ PRE(old_select)
+    }
+ }
+ 
+-PRE(sys_clone)
+-{
+-   UInt cloneflags;
+-   Bool badarg = False;
+-
+-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
+-   PRE_REG_READ2(int, "clone",
+-                 unsigned long, flags,
+-                 void *, child_stack);
+-
+-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
+-      if (VG_(tdict).track_pre_reg_read) {
+-         PRA3("clone", int *, parent_tidptr);
+-      }
+-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
+-                                             VKI_PROT_WRITE)) {
+-         badarg = True;
+-      }
+-   }
+-   if (ARG1 & VKI_CLONE_SETTLS) {
+-      if (VG_(tdict).track_pre_reg_read) {
+-         PRA4("clone", vki_modify_ldt_t *, tlsinfo);
+-      }
+-      PRE_MEM_READ("clone(tlsinfo)", ARG4, sizeof(vki_modify_ldt_t));
+-      if (!VG_(am_is_valid_for_client)(ARG4, sizeof(vki_modify_ldt_t), 
+-                                             VKI_PROT_READ)) {
+-         badarg = True;
+-      }
+-   }
+-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
+-      if (VG_(tdict).track_pre_reg_read) {
+-         PRA5("clone", int *, child_tidptr);
+-      }
+-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof(Int));
+-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof(Int), 
+-                                             VKI_PROT_WRITE)) {
+-         badarg = True;
+-      }
+-   }
+-
+-   if (badarg) {
+-      SET_STATUS_Failure( VKI_EFAULT );
+-      return;
+-   }
+-
+-   cloneflags = ARG1;
+-
+-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
+-      SET_STATUS_Failure( VKI_EINVAL );
+-      return;
+-   }
+-
+-   /* Be ultra-paranoid and filter out any clone-variants we don't understand:
+-      - ??? specifies clone flags of 0x100011
+-      - ??? specifies clone flags of 0x1200011.
+-      - NPTL specifies clone flags of 0x7D0F00.
+-      - The Quadrics Elan3 driver specifies clone flags of 0xF00.
+-      - Newer Quadrics Elan3 drivers with NTPL support specify 0x410F00.
+-      Everything else is rejected. 
+-   */
+-   if (
+-        1 ||
+-        /* 11 Nov 05: for the time being, disable this ultra-paranoia.
+-           The switch below probably does a good enough job. */
+-          (cloneflags == 0x100011 || cloneflags == 0x1200011
+-                                  || cloneflags == 0x7D0F00
+-                                  || cloneflags == 0x790F00
+-                                  || cloneflags == 0x3D0F00
+-                                  || cloneflags == 0x410F00
+-                                  || cloneflags == 0xF00
+-                                  || cloneflags == 0xF21)) {
+-     /* OK */
+-   }
+-   else {
+-      /* Nah.  We don't like it.  Go away. */
+-      goto reject;
+-   }
+-
+-   /* Only look at the flags we really care about */
+-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
+-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
+-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
+-      /* thread creation */
+-      SET_STATUS_from_SysRes(
+-         do_clone(tid,
+-                  ARG1,         /* flags */
+-                  (Addr)ARG2,   /* child ESP */
+-                  (Int *)ARG3,  /* parent_tidptr */
+-                  (Int *)ARG5,  /* child_tidptr */
+-                  (vki_modify_ldt_t *)ARG4)); /* set_tls */
+-      break;
+-
+-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
+-      /* FALLTHROUGH - assume vfork == fork */
+-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
+-
+-   case 0: /* plain fork */
+-      SET_STATUS_from_SysRes(
+-         ML_(do_fork_clone)(tid,
+-                       cloneflags,      /* flags */
+-                       (Int *)ARG3,     /* parent_tidptr */
+-                       (Int *)ARG5));   /* child_tidptr */
+-      break;
+-
+-   default:
+-   reject:
+-      /* should we just ENOSYS? */
+-      VG_(message)(Vg_UserMsg, "\n");
+-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
+-      VG_(message)(Vg_UserMsg, "\n");
+-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
+-      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
+-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
+-      VG_(message)(Vg_UserMsg, " - for the Quadrics Elan3 user-space driver\n");
+-      VG_(unimplemented)
+-         ("Valgrind does not support general clone().");
+-   }
+-
+-   if (SUCCESS) {
+-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
+-         POST_MEM_WRITE(ARG3, sizeof(Int));
+-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
+-         POST_MEM_WRITE(ARG5, sizeof(Int));
+-
+-      /* Thread creation was successful; let the child have the chance
+-         to run */
+-      *flags |= SfYieldAfter;
+-   }
+-}
+-
+ PRE(sys_sigreturn)
+ {
+    /* See comments on PRE(sys_rt_sigreturn) in syswrap-amd64-linux.c for
+@@ -1063,7 +783,7 @@ PRE(sys_set_thread_area)
+    PRE_MEM_READ( "set_thread_area(u_info)", ARG1, sizeof(vki_modify_ldt_t) );
+ 
+    /* "do" the syscall ourselves; the kernel never sees it */
+-   SET_STATUS_from_SysRes( sys_set_thread_area( tid, (void *)ARG1 ) );
++   SET_STATUS_from_SysRes( ML_(x86_sys_set_thread_area)( tid, (void *)ARG1 ) );
+ }
+ 
+ PRE(sys_get_thread_area)
+@@ -1553,7 +1273,7 @@ static SyscallTableEntry syscall_table[] = {
+    GENX_(__NR_fsync,             sys_fsync),          // 118
+    PLAX_(__NR_sigreturn,         sys_sigreturn),      // 119 ?/Linux
+ 
+-   PLAX_(__NR_clone,             sys_clone),          // 120
++   LINX_(__NR_clone,             sys_clone),          // 120
+ //zz    //   (__NR_setdomainname,     sys_setdomainname),  // 121 */*(?)
+    GENXY(__NR_uname,             sys_newuname),       // 122
+    PLAX_(__NR_modify_ldt,        sys_modify_ldt),     // 123
+diff --git a/include/vki/vki-arm64-linux.h b/include/vki/vki-arm64-linux.h
+index df34dd6..5a3b08f 100644
+--- a/include/vki/vki-arm64-linux.h
++++ b/include/vki/vki-arm64-linux.h
+@@ -586,7 +586,8 @@ struct vki_ucontext {
+ //ZZ };
+ //ZZ 
+ //ZZ // [[Nb: for our convenience within Valgrind, use a more specific name]]
+-//ZZ typedef struct vki_user_desc vki_modify_ldt_t;
++
++typedef char vki_modify_ldt_t;
+ 
+ //----------------------------------------------------------------------
+ // From linux-3.10.5/include/asm-generic/ipcbuf.h
+diff --git a/include/vki/vki-mips32-linux.h b/include/vki/vki-mips32-linux.h
+index 5be8e15..b6c9914 100644
+--- a/include/vki/vki-mips32-linux.h
++++ b/include/vki/vki-mips32-linux.h
+@@ -679,7 +679,7 @@ struct vki_ucontext {
+ };
+ 
+ // CAB: TODO
+-typedef void vki_modify_ldt_t;
++typedef char vki_modify_ldt_t;
+ 
+ //----------------------------------------------------------------------
+ // From linux-2.6.35.5/include/asm-mips/ipcbuf.h
+diff --git a/include/vki/vki-mips64-linux.h b/include/vki/vki-mips64-linux.h
+index 26b8e9f..ca49b10 100644
+--- a/include/vki/vki-mips64-linux.h
++++ b/include/vki/vki-mips64-linux.h
+@@ -710,6 +710,7 @@ struct vki_ucontext {
+        vki_sigset_t           uc_sigmask;  /* mask last for extensibility */
+ };
+ 
++typedef char vki_modify_ldt_t;
+ //----------------------------------------------------------------------
+ // From linux-2.6.35.9/include/asm-mips/ipcbuf.h
+ //----------------------------------------------------------------------
+diff --git a/include/vki/vki-ppc32-linux.h b/include/vki/vki-ppc32-linux.h
+index 70c2835..0fd3c79 100644
+--- a/include/vki/vki-ppc32-linux.h
++++ b/include/vki/vki-ppc32-linux.h
+@@ -811,10 +811,9 @@ struct vki_ucontext {
+ //.. };
+ //.. 
+ //.. // [[Nb: for our convenience within Valgrind, use a more specific name]]
+-//.. typedef struct vki_user_desc vki_modify_ldt_t;
+ 
+ // CAB: TODO
+-typedef void vki_modify_ldt_t;
++typedef char vki_modify_ldt_t;
+ 
+ 
+ //----------------------------------------------------------------------
+diff --git a/include/vki/vki-ppc64-linux.h b/include/vki/vki-ppc64-linux.h
+index b410663..fd5cea6 100644
+--- a/include/vki/vki-ppc64-linux.h
++++ b/include/vki/vki-ppc64-linux.h
+@@ -685,6 +685,9 @@ struct vki_ucontext {
+   struct vki_sigcontext uc_mcontext;  /* last for extensibility */
+ };
+ 
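++// Placeholder: vki_modify_ldt_t is the x86 TLS-info type; presumably only defined here so shared clone code compiles.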
++typedef char vki_modify_ldt_t;
++
+ //----------------------------------------------------------------------
+ // From linux-2.6.13/include/asm-ppc64/ipcbuf.h
+ //----------------------------------------------------------------------
+diff --git a/include/vki/vki-s390x-linux.h b/include/vki/vki-s390x-linux.h
+index c3f6d00..1ef5cf7 100644
+--- a/include/vki/vki-s390x-linux.h
++++ b/include/vki/vki-s390x-linux.h
+@@ -822,6 +822,8 @@ struct vki_ucontext {
+ 	vki_sigset_t	      uc_sigmask; /* mask last for extensibility */
+ };
+ 
++typedef char vki_modify_ldt_t;
++
+ //----------------------------------------------------------------------
+ // From linux-2.6.16.60/include/asm-s390/ipcbuf.h
+ //----------------------------------------------------------------------
diff --git a/valgrind.spec b/valgrind.spec
index 25649bc..0ebb1c5 100644
--- a/valgrind.spec
+++ b/valgrind.spec
@@ -87,6 +87,11 @@ Patch7: valgrind-3.12.0-arm64-ppc64-prlimit64.patch
 # KDE#376279 Handle unknown HINT instructions on aarch64 by ignoring them.
 Patch8: valgrind-3.12.0-arm64-hint.patch
 
+# KDE#342040 Valgrind mishandles clone with CLONE_VFORK | CLONE_VM
+#            that clones to a different stack
+# KDE#373192 Calling posix_spawn in glibc 2.24 completely broken
+Patch9: valgrind-3.12.0-clone-spawn.patch
+
 %if %{build_multilib}
 # Ensure glibc{,-devel} is installed for both multilib arches
 BuildRequires: /lib/libc.so.6 /usr/lib/libc.so /lib64/libc.so.6 /usr/lib64/libc.so
@@ -204,6 +209,7 @@ Valgrind User Manual for details.
 %patch6 -p1
 %patch7 -p1
 %patch8 -p1
+%patch9 -p1
 
 %build
 # We need to use the software collection compiler and binutils if available.
@@ -400,6 +406,7 @@ echo ===============END TESTING===============
 * Sat Feb 18 2017 Mark Wielaard <mjw@redhat.com>
 - Add valgrind-3.12.0-arm64-ppc64-prlimit64.patch
 - Add valgrind-3.12.0-arm64-hint.patch
+- Add valgrind-3.12.0-clone-spawn.patch
 
 * Fri Feb 17 2017 Mark Wielaard <mjw@redhat.com> - 3.12.0-5
 - Add valgrind-3.12.0-ppc64-r2.patch (#1424367)