Blob Blame History Raw
commit e31d72da6cb415d0856ad53dac78e307548cd831
Author: philippe <philippe@a5019735-40e9-0310-863c-91ae7b9d1cf9>
Date:   Sun Dec 11 21:39:23 2016 +0000

    Fix 342040 Valgrind mishandles clone with CLONE_VFORK | CLONE_VM that clones to a different stack
    Fix 373192 Calling posix_spawn in glibc 2.24 completely broken
    
    Functionally, this patch just does the following 2 changes to the
    fork clone handling:
    * It does not mask anymore CLONE_VFORK :
      The only effect of this flag is to suspend the parent, waiting for
      the child to either exit or execve.
      If some applications depends on this synchronisation, better keep it,
      as it will not harm to suspend the parent valgrind waiting for the
      child valgrind to exit or execve.
    * In case the guest calls the clone syscall providing a non zero client stack,
      set the child guest SP after the syscall, before executing guest instructions.
      Not setting the guest stack ptr was the source of the problem reported
      in the bugs.
    
    This also adds a test case  none/tests/linux/clonev.
    Before this patch, test gives a SEGV, which is fixed by the patch.
    
    The patch is however a lot bigger : this fix was touching some (mostly
    identical/duplicated) code in all the linux platforms.
    So, the clone/fork code has been factorised as much as possible.
    This removes about 1700 lines of code.
    
    This has been tested on:
    * amd64
    * x86
    * ppc64 be and le
    * ppc32
    * arm64
    
    This has been compiled on but *not really tested* on:
    * mips64 (not too clear how to properly build and run valgrind on gcc22)
    
    It has *not* been compiled and *not* tested on:
    * arm
    * mips32
    * tilegx
    * darwin   (normally, no impact)
    * solaris  (normally, no impact)
    
    The changes are relatively mechanical, so it is not impossible that
    it will compile and work out of the box on these platforms.
    Otherwise, questions welcome.
    
    A few points of interest:
    * Some platforms did have a typedef void vki_modify_ldt_t,
      and some platforms had no definition for this type at all.
      To make it easier to factorise, for such platforms, the following has
      been used:
         typedef char vki_modify_ldt_t;
        When the sizeof vki_modify_ldt_t is > 1, then the arg syscall is checked.
      This is somewhat a hack, but was simplifying the factorisation.
    
    * for mips32/mips64 and tilegx, there is a strange unconditional assignment
      of 0 to a register (guest_r2 on mips, guest_r0 on tilegx).
      Unclear what this is, in particular because this is assigned whatever
      the result of the syscall (success or not).
    
    
    
    
    git-svn-id: svn://svn.valgrind.org/valgrind/trunk@16186 a5019735-40e9-0310-863c-91ae7b9d1cf9

diff --git a/coregrind/m_syswrap/priv_syswrap-linux.h b/coregrind/m_syswrap/priv_syswrap-linux.h
index 38fcd7b..06ea7cd 100644
--- a/coregrind/m_syswrap/priv_syswrap-linux.h
+++ b/coregrind/m_syswrap/priv_syswrap-linux.h
@@ -39,12 +39,10 @@ extern Word ML_(start_thread_NORETURN) ( void* arg );
 extern Addr ML_(allocstack)            ( ThreadId tid );
 extern void ML_(call_on_new_stack_0_1) ( Addr stack, Addr retaddr,
 			                 void (*f)(Word), Word arg1 );
-extern SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
-                                   Int* parent_tidptr, Int* child_tidptr );
-
 
 // Linux-specific (but non-arch-specific) syscalls
 
+DECL_TEMPLATE(linux, sys_clone)
 DECL_TEMPLATE(linux, sys_mount);
 DECL_TEMPLATE(linux, sys_oldumount);
 DECL_TEMPLATE(linux, sys_umount);
@@ -61,6 +59,10 @@ DECL_TEMPLATE(linux, sys_vmsplice);
 DECL_TEMPLATE(linux, sys_readahead);
 DECL_TEMPLATE(linux, sys_move_pages);
 
+// clone is similar enough between linux variants to have a generic
+// version, but which will call an extern defined in syswrap-<platform>-linux.c
+DECL_TEMPLATE(linux, sys_clone);
+
 // POSIX, but various sub-cases differ between Linux and Darwin.
 DECL_TEMPLATE(linux, sys_fcntl);
 DECL_TEMPLATE(linux, sys_fcntl64);
@@ -368,7 +370,83 @@ DECL_TEMPLATE(linux, sys_getpeername);
 DECL_TEMPLATE(linux, sys_socketpair);
 DECL_TEMPLATE(linux, sys_kcmp);
 
-#endif   // __PRIV_SYSWRAP_LINUX_H
+// Some arch specific functions called from syswrap-linux.c
+extern Int do_syscall_clone_x86_linux ( Word (*fn)(void *), 
+                                        void* stack, 
+                                        Int   flags, 
+                                        void* arg,
+                                        Int*  child_tid, 
+                                        Int*  parent_tid, 
+                                        void* tls_ptr);
+extern SysRes ML_(x86_sys_set_thread_area) ( ThreadId tid,
+                                             vki_modify_ldt_t* info );
+extern void ML_(x86_setup_LDT_GDT) ( /*OUT*/ ThreadArchState *child, 
+                                     /*IN*/  ThreadArchState *parent );
+
+extern Long do_syscall_clone_amd64_linux ( Word (*fn)(void *), 
+                                           void* stack, 
+                                           Long  flags, 
+                                           void* arg,
+                                           Int* child_tid, 
+                                           Int* parent_tid, 
+                                           void* tls_ptr);
+extern ULong do_syscall_clone_ppc32_linux ( Word (*fn)(void *), 
+                                            void* stack, 
+                                            Int   flags, 
+                                            void* arg,
+                                            Int*  child_tid, 
+                                            Int*  parent_tid, 
+                                            void* tls_ptr);
+extern ULong do_syscall_clone_ppc64_linux ( Word (*fn)(void *), 
+                                            void* stack, 
+                                            Int   flags, 
+                                            void* arg,
+                                            Int*  child_tid, 
+                                            Int*  parent_tid, 
+                                            void* tls_ptr );
+extern ULong do_syscall_clone_s390x_linux ( void  *stack,
+                                            ULong flags,
+                                            Int   *parent_tid,
+                                            Int   *child_tid,
+                                            void*  tls_ptr,
+                                            Word (*fn)(void *),
+                                            void  *arg);
+extern Long do_syscall_clone_arm64_linux ( Word (*fn)(void *), 
+                                           void* stack, 
+                                           Long  flags, 
+                                           void* arg,
+                                           Int*  child_tid,
+                                           Int*  parent_tid,
+                                           void* tls_ptr );
+extern ULong do_syscall_clone_arm_linux   ( Word (*fn)(void *), 
+                                            void* stack, 
+                                            Int   flags, 
+                                            void* arg,
+                                            Int*  child_tid,
+                                            Int*  parent_tid,
+                                            void* tls_ptr );
+extern ULong do_syscall_clone_mips64_linux ( Word (*fn) (void *),  /* a0 - 4 */
+                                             void* stack,          /* a1 - 5 */
+                                             Int   flags,          /* a2 - 6 */
+                                             void* arg,            /* a3 - 7 */
+                                             Int*  parent_tid,     /* a4 - 8 */
+                                             void* tls_ptr,        /* a5 - 9 */
+                                             Int*  child_tid );    /* a6 - 10 */
+extern UInt do_syscall_clone_mips_linux ( Word (*fn) (void *), //a0     0    32
+                                          void* stack,         //a1     4    36
+                                          Int   flags,         //a2     8    40
+                                          void* arg,           //a3     12   44
+                                          Int*  child_tid,     //stack  16   48
+                                          Int*  parent_tid,    //stack  20   52
+                                          void* tls_ptr);      //stack  24   56
+extern Long do_syscall_clone_tilegx_linux ( Word (*fn) (void *),  //r0
+                                            void* stack,          //r1
+                                            Long  flags,          //r2
+                                            void* arg,            //r3
+                                            Long* child_tid,      //r4
+                                            Long* parent_tid,     //r5
+                                            void* tls_ptr );      //r6
+ #endif   // __PRIV_SYSWRAP_LINUX_H
 
 /*--------------------------------------------------------------------*/
 /*--- end                                                          ---*/
diff --git a/coregrind/m_syswrap/syswrap-amd64-linux.c b/coregrind/m_syswrap/syswrap-amd64-linux.c
index 08e9a93..3fe9938 100644
--- a/coregrind/m_syswrap/syswrap-amd64-linux.c
+++ b/coregrind/m_syswrap/syswrap-amd64-linux.c
@@ -130,14 +130,7 @@ asm(
 #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
 #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
 
-extern
-Long do_syscall_clone_amd64_linux ( Word (*fn)(void *), 
-                                    void* stack, 
-                                    Long  flags, 
-                                    void* arg,
-                                    Long* child_tid, 
-                                    Long* parent_tid, 
-                                    vki_modify_ldt_t * );
+// See priv_syswrap-linux.h for arg profile.
 asm(
 ".text\n"
 ".globl do_syscall_clone_amd64_linux\n"
@@ -183,126 +176,6 @@ asm(
 #undef __NR_EXIT
 
 
-// forward declaration
-static void setup_child ( ThreadArchState*, ThreadArchState* );
-
-/* 
-   When a client clones, we need to keep track of the new thread.  This means:
-   1. allocate a ThreadId+ThreadState+stack for the thread
-
-   2. initialize the thread's new VCPU state
-
-   3. create the thread using the same args as the client requested,
-   but using the scheduler entrypoint for EIP, and a separate stack
-   for ESP.
- */
-static SysRes do_clone ( ThreadId ptid, 
-                         ULong flags, Addr rsp, 
-                         Long* parent_tidptr, 
-                         Long* child_tidptr, 
-                         Addr tlsaddr )
-{
-   static const Bool debug = False;
-
-   ThreadId     ctid = VG_(alloc_ThreadState)();
-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
-   UWord*       stack;
-   SysRes       res;
-   Long         rax;
-   vki_sigset_t blockall, savedmask;
-
-   VG_(sigfillset)(&blockall);
-
-   vg_assert(VG_(is_running_thread)(ptid));
-   vg_assert(VG_(is_valid_tid)(ctid));
-
-   stack = (UWord*)ML_(allocstack)(ctid);
-   if (stack == NULL) {
-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
-      goto out;
-   }
-
-   /* Copy register state
-
-      Both parent and child return to the same place, and the code
-      following the clone syscall works out which is which, so we
-      don't need to worry about it.
-
-      The parent gets the child's new tid returned from clone, but the
-      child gets 0.
-
-      If the clone call specifies a NULL rsp for the new thread, then
-      it actually gets a copy of the parent's rsp.
-   */
-   setup_child( &ctst->arch, &ptst->arch );
-
-   /* Make sys_clone appear to have returned Success(0) in the
-      child. */
-   ctst->arch.vex.guest_RAX = 0;
-
-   if (rsp != 0)
-      ctst->arch.vex.guest_RSP = rsp;
-
-   ctst->os_state.parent = ptid;
-
-   /* inherit signal mask */
-   ctst->sig_mask = ptst->sig_mask;
-   ctst->tmp_sig_mask = ptst->sig_mask;
-
-   /* Start the child with its threadgroup being the same as the
-      parent's.  This is so that any exit_group calls that happen
-      after the child is created but before it sets its
-      os_state.threadgroup field for real (in thread_wrapper in
-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
-      a race condition in which the thread is unkillable (via
-      exit_group) because its threadgroup is not set.  The race window
-      is probably only a few hundred or a few thousand cycles long.
-      See #226116. */
-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
-
-   ML_(guess_and_register_stack) (rsp, ctst);
-
-   /* Assume the clone will succeed, and tell any tool that wants to
-      know that this thread has come into existence.  If the clone
-      fails, we'll send out a ll_exit notification for it at the out:
-      label below, to clean up. */
-   vg_assert(VG_(owns_BigLock_LL)(ptid));
-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
-
-   if (flags & VKI_CLONE_SETTLS) {
-      if (debug)
-	 VG_(printf)("clone child has SETTLS: tls at %#lx\n", tlsaddr);
-      ctst->arch.vex.guest_FS_CONST = tlsaddr;
-   }
-
-   flags &= ~VKI_CLONE_SETTLS;
-
-   /* start the thread with everything blocked */
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
-
-   /* Create the new thread */
-   rax = do_syscall_clone_amd64_linux(
-            ML_(start_thread_NORETURN), stack, flags, &VG_(threads)[ctid],
-            child_tidptr, parent_tidptr, NULL
-         );
-   res = VG_(mk_SysRes_amd64_linux)( rax );
-
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
-
-  out:
-   if (sr_isError(res)) {
-      /* clone failed */
-      VG_(cleanup_thread)(&ctst->arch);
-      ctst->status = VgTs_Empty;
-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
-      VG_TRACK( pre_thread_ll_exit, ctid );
-   }
-
-   return res;
-}
-
-
 /* ---------------------------------------------------------------------
    More thread stuff
    ------------------------------------------------------------------ */
@@ -311,16 +184,6 @@ void VG_(cleanup_thread) ( ThreadArchState *arch )
 {  
 }  
 
-void setup_child ( /*OUT*/ ThreadArchState *child, 
-                   /*IN*/  ThreadArchState *parent )
-{  
-   /* We inherit our parent's guest state. */
-   child->vex = parent->vex;
-   child->vex_shadow1 = parent->vex_shadow1;
-   child->vex_shadow2 = parent->vex_shadow2;
-}  
-
-
 /* ---------------------------------------------------------------------
    PRE/POST wrappers for AMD64/Linux-specific syscalls
    ------------------------------------------------------------------ */
@@ -333,7 +196,6 @@ void setup_child ( /*OUT*/ ThreadArchState *child,
    the right thing to do is to make these wrappers 'static' since they
    aren't visible outside this file, but that requires even more macro
    magic. */
-DECL_TEMPLATE(amd64_linux, sys_clone);
 DECL_TEMPLATE(amd64_linux, sys_rt_sigreturn);
 DECL_TEMPLATE(amd64_linux, sys_arch_prctl);
 DECL_TEMPLATE(amd64_linux, sys_ptrace);
@@ -342,108 +204,6 @@ DECL_TEMPLATE(amd64_linux, sys_mmap);
 DECL_TEMPLATE(amd64_linux, sys_syscall184);
 
 
-PRE(sys_clone)
-{
-   ULong cloneflags;
-
-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
-   PRE_REG_READ2(int, "clone",
-                 unsigned long, flags,
-                 void *, child_stack);
-
-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
-      if (VG_(tdict).track_pre_reg_read) {
-         PRA3("clone", int *, parent_tidptr);
-      }
-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-   if (ARG1 & VKI_CLONE_SETTLS) {
-      if (VG_(tdict).track_pre_reg_read) {
-         PRA4("clone", vki_modify_ldt_t *, tlsinfo);
-      }
-      PRE_MEM_READ("clone(tlsinfo)", ARG4, sizeof(vki_modify_ldt_t));
-      if (!VG_(am_is_valid_for_client)(ARG4, sizeof(vki_modify_ldt_t), 
-                                             VKI_PROT_READ)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
-      if (VG_(tdict).track_pre_reg_read) {
-         PRA5("clone", int *, child_tidptr);
-      }
-      PRE_MEM_WRITE("clone(child_tidptr)", ARG4, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG4, sizeof(Int), VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-
-   cloneflags = ARG1;
-
-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
-      SET_STATUS_Failure( VKI_EINVAL );
-      return;
-   }
-
-   /* Only look at the flags we really care about */
-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
-      /* thread creation */
-      SET_STATUS_from_SysRes(
-         do_clone(tid,
-                  ARG1,          /* flags */
-                  (Addr)ARG2,    /* child ESP */
-                  (Long *)ARG3,  /* parent_tidptr */
-                  (Long *)ARG4,  /* child_tidptr */
-                  (Addr)ARG5));  /* set_tls */
-      break;
-
-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
-      /* FALLTHROUGH - assume vfork == fork */
-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
-
-   case 0: /* plain fork */
-      SET_STATUS_from_SysRes(
-         ML_(do_fork_clone)(tid,
-                       cloneflags,      /* flags */
-                       (Int *)ARG3,     /* parent_tidptr */
-                       (Int *)ARG4));   /* child_tidptr */
-      break;
-
-   default:
-      /* should we just ENOSYS? */
-      VG_(message)(Vg_UserMsg,
-                   "Unsupported clone() flags: 0x%lx\n", ARG1);
-      VG_(message)(Vg_UserMsg,
-                   "\n");
-      VG_(message)(Vg_UserMsg,
-                   "The only supported clone() uses are:\n");
-      VG_(message)(Vg_UserMsg,
-                   " - via a threads library (LinuxThreads or NPTL)\n");
-      VG_(message)(Vg_UserMsg,
-                   " - via the implementation of fork or vfork\n");
-      VG_(unimplemented)
-         ("Valgrind does not support general clone().");
-   }
-
-   if (SUCCESS) {
-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
-         POST_MEM_WRITE(ARG3, sizeof(Int));
-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
-         POST_MEM_WRITE(ARG4, sizeof(Int));
-
-      /* Thread creation was successful; let the child have the chance
-         to run */
-      *flags |= SfYieldAfter;
-   }
-}
-
 PRE(sys_rt_sigreturn)
 {
    /* This isn't really a syscall at all - it's a misuse of the
@@ -761,7 +521,7 @@ static SyscallTableEntry syscall_table[] = {
    LINX_(__NR_setsockopt,        sys_setsockopt),     // 54
 
    LINXY(__NR_getsockopt,        sys_getsockopt),     // 55 
-   PLAX_(__NR_clone,             sys_clone),          // 56 
+   LINX_(__NR_clone,             sys_clone),          // 56 
    GENX_(__NR_fork,              sys_fork),           // 57 
    GENX_(__NR_vfork,             sys_fork),           // 58 treat as fork
    GENX_(__NR_execve,            sys_execve),         // 59 
diff --git a/coregrind/m_syswrap/syswrap-arm-linux.c b/coregrind/m_syswrap/syswrap-arm-linux.c
index 3bbd109..b417428 100644
--- a/coregrind/m_syswrap/syswrap-arm-linux.c
+++ b/coregrind/m_syswrap/syswrap-arm-linux.c
@@ -102,14 +102,7 @@ asm(
 #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
 #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
 
-extern
-ULong do_syscall_clone_arm_linux   ( Word (*fn)(void *), 
-                                     void* stack, 
-                                     Int   flags, 
-                                     void* arg,
-                                     Int*  child_tid,
-                                     Int*  parent_tid,
-                                     void* tls );
+// See priv_syswrap-linux.h for arg profile.
 asm(
 ".text\n"
 ".globl do_syscall_clone_arm_linux\n"
@@ -148,104 +141,8 @@ asm(
 #undef __NR_EXIT
 
 // forward declarations
-static void setup_child ( ThreadArchState*, ThreadArchState* );
-static void assign_guest_tls(ThreadId ctid, Addr tlsptr);
 static SysRes sys_set_tls ( ThreadId tid, Addr tlsptr );
             
-/* 
-   When a client clones, we need to keep track of the new thread.  This means:
-   1. allocate a ThreadId+ThreadState+stack for the thread
-
-   2. initialize the thread's new VCPU state
-
-   3. create the thread using the same args as the client requested,
-   but using the scheduler entrypoint for IP, and a separate stack
-   for SP.
- */
-static SysRes do_clone ( ThreadId ptid, 
-                         UInt flags, Addr sp, 
-                         Int *parent_tidptr, 
-                         Int *child_tidptr, 
-                         Addr child_tls)
-{
-   ThreadId ctid = VG_(alloc_ThreadState)();
-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
-   UInt r0;
-   UWord *stack;
-   SysRes res;
-   vki_sigset_t blockall, savedmask;
-
-   VG_(sigfillset)(&blockall);
-
-   vg_assert(VG_(is_running_thread)(ptid));
-   vg_assert(VG_(is_valid_tid)(ctid));
-
-   stack = (UWord*)ML_(allocstack)(ctid);
-
-   if(stack == NULL) {
-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
-      goto out;
-   }
-
-   setup_child( &ctst->arch, &ptst->arch );
-
-   ctst->arch.vex.guest_R0 = 0;
-   if(sp != 0)
-      ctst->arch.vex.guest_R13 = sp;
-
-   ctst->os_state.parent = ptid;
-
-   ctst->sig_mask = ptst->sig_mask;
-   ctst->tmp_sig_mask = ptst->sig_mask;
-
-   /* Start the child with its threadgroup being the same as the
-      parent's.  This is so that any exit_group calls that happen
-      after the child is created but before it sets its
-      os_state.threadgroup field for real (in thread_wrapper in
-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
-      a race condition in which the thread is unkillable (via
-      exit_group) because its threadgroup is not set.  The race window
-      is probably only a few hundred or a few thousand cycles long.
-      See #226116. */
-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
-
-   ML_(guess_and_register_stack) (sp, ctst);
-
-   vg_assert(VG_(owns_BigLock_LL)(ptid));
-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
-
-   if (flags & VKI_CLONE_SETTLS) {
-      /* Just assign the tls pointer in the guest TPIDRURO. */
-      assign_guest_tls(ctid, child_tls);
-   }
-    
-   flags &= ~VKI_CLONE_SETTLS;
-
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
-
-   r0 = do_syscall_clone_arm_linux(
-      ML_(start_thread_NORETURN), stack, flags, &VG_(threads)[ctid],
-      child_tidptr, parent_tidptr, NULL
-   );
-   //VG_(printf)("AFTER SYSCALL, %x and %x  CHILD: %d PARENT: %d\n",child_tidptr, parent_tidptr,*child_tidptr,*parent_tidptr);
-    
-   res = VG_(mk_SysRes_arm_linux)( r0 );
-
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
-
-out:
-   if (sr_isError(res)) {
-      VG_(cleanup_thread)(&ctst->arch);
-      ctst->status = VgTs_Empty;
-      VG_TRACK( pre_thread_ll_exit, ctid );
-   }
-
-   return res;
-}
-
-
-
 /* ---------------------------------------------------------------------
    More thread stuff
    ------------------------------------------------------------------ */
@@ -256,26 +153,13 @@ void VG_(cleanup_thread) ( ThreadArchState* arch )
 {
 }  
 
-void setup_child ( /*OUT*/ ThreadArchState *child,
-                   /*IN*/  ThreadArchState *parent )
-{
-   child->vex = parent->vex;
-   child->vex_shadow1 = parent->vex_shadow1;
-   child->vex_shadow2 = parent->vex_shadow2;
-}
-
-static void assign_guest_tls(ThreadId tid, Addr tlsptr)
-{
-   VG_(threads)[tid].arch.vex.guest_TPIDRURO = tlsptr;
-}
-
 /* Assigns tlsptr to the guest TPIDRURO.
    If needed for the specific hardware, really executes
    the set_tls syscall.
 */
 static SysRes sys_set_tls ( ThreadId tid, Addr tlsptr )
 {
-   assign_guest_tls(tid, tlsptr);
+   VG_(threads)[tid].arch.vex.guest_TPIDRURO = tlsptr;
 
    if (KernelVariantiS(KernelVariant_android_no_hw_tls,
                        VG_(clo_kernel_variant))) {
@@ -333,7 +217,6 @@ DECL_TEMPLATE(arm_linux, sys_stat64);
 DECL_TEMPLATE(arm_linux, sys_lstat64);
 DECL_TEMPLATE(arm_linux, sys_fstatat64);
 DECL_TEMPLATE(arm_linux, sys_fstat64);
-DECL_TEMPLATE(arm_linux, sys_clone);
 DECL_TEMPLATE(arm_linux, sys_sigreturn);
 DECL_TEMPLATE(arm_linux, sys_rt_sigreturn);
 DECL_TEMPLATE(arm_linux, sys_sigsuspend);
@@ -424,100 +307,6 @@ POST(sys_fstat64)
    POST_MEM_WRITE( ARG2, sizeof(struct vki_stat64) );
 }
 
-PRE(sys_clone)
-{
-    UInt cloneflags;
-
-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
-   PRE_REG_READ5(int, "clone",
-                 unsigned long, flags,
-                 void *, child_stack,
-                 int *, parent_tidptr,
-                 void *, child_tls,
-                 int *, child_tidptr);
-
-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
-                                             VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof(Int), 
-                                             VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-   if (ARG1 & VKI_CLONE_SETTLS) {
-      PRE_MEM_READ("clone(tls_user_desc)", ARG4, sizeof(vki_modify_ldt_t));
-      if (!VG_(am_is_valid_for_client)(ARG4, sizeof(vki_modify_ldt_t), 
-                                             VKI_PROT_READ)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-
-   cloneflags = ARG1;
-
-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
-      SET_STATUS_Failure( VKI_EINVAL );
-      return;
-   }
-
-   /* Only look at the flags we really care about */
-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
-      /* thread creation */
-      SET_STATUS_from_SysRes(
-         do_clone(tid,
-                  ARG1,         /* flags */
-                  (Addr)ARG2,   /* child ESP */
-                  (Int *)ARG3,  /* parent_tidptr */
-                  (Int *)ARG5,  /* child_tidptr */
-                  (Addr)ARG4)); /* set_tls */
-      break;
-
-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
-      /* FALLTHROUGH - assume vfork == fork */
-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
-
-   case 0: /* plain fork */
-      SET_STATUS_from_SysRes(
-         ML_(do_fork_clone)(tid,
-                       cloneflags,      /* flags */
-                       (Int *)ARG3,     /* parent_tidptr */
-                       (Int *)ARG5));   /* child_tidptr */
-      break;
-
-   default:
-      /* should we just ENOSYS? */
-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
-      VG_(message)(Vg_UserMsg, "\n");
-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
-      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
-      VG_(message)(Vg_UserMsg, " - for the Quadrics Elan3 user-space driver\n");
-      VG_(unimplemented)
-         ("Valgrind does not support general clone().");
-   }
-
-   if (SUCCESS) {
-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
-         POST_MEM_WRITE(ARG3, sizeof(Int));
-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
-         POST_MEM_WRITE(ARG5, sizeof(Int));
-
-      /* Thread creation was successful; let the child have the chance
-         to run */
-      *flags |= SfYieldAfter;
-   }
-}
-
 PRE(sys_sigreturn)
 {
    /* See comments on PRE(sys_rt_sigreturn) in syswrap-amd64-linux.c for
@@ -901,7 +690,7 @@ static SyscallTableEntry syscall_main_table[] = {
    GENX_(__NR_fsync,             sys_fsync),          // 118
    PLAX_(__NR_sigreturn,         sys_sigreturn),      // 119 ?/Linux
 
-   PLAX_(__NR_clone,             sys_clone),          // 120
+   LINX_(__NR_clone,             sys_clone),          // 120
 //zz    //   (__NR_setdomainname,     sys_setdomainname),  // 121 */*(?)
    GENXY(__NR_uname,             sys_newuname),       // 122
 //   PLAX_(__NR_modify_ldt,        sys_modify_ldt),     // 123
diff --git a/coregrind/m_syswrap/syswrap-arm64-linux.c b/coregrind/m_syswrap/syswrap-arm64-linux.c
index 6b579e8..1be6629 100644
--- a/coregrind/m_syswrap/syswrap-arm64-linux.c
+++ b/coregrind/m_syswrap/syswrap-arm64-linux.c
@@ -138,14 +138,7 @@ asm(
 #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
 #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
 
-extern
-Long do_syscall_clone_arm64_linux ( Word (*fn)(void *), 
-                                    void* child_stack, 
-                                    Long  flags, 
-                                    void* arg,
-                                    Int*  child_tid,
-                                    Int*  parent_tid,
-                                    void* tls );
+// See priv_syswrap-linux.h for arg profile.
 asm(
 ".text\n"
 ".globl do_syscall_clone_arm64_linux\n"
@@ -196,121 +189,6 @@ static void setup_child ( ThreadArchState*, ThreadArchState* );
 static void assign_guest_tls(ThreadId ctid, Addr tlsptr);
 //ZZ static SysRes sys_set_tls ( ThreadId tid, Addr tlsptr );
             
-/* 
-   When a client clones, we need to keep track of the new thread.  This means:
-   1. allocate a ThreadId+ThreadState+stack for the thread
-
-   2. initialize the thread's new VCPU state
-
-   3. create the thread using the same args as the client requested,
-   but using the scheduler entrypoint for IP, and a separate stack
-   for SP.
- */
-static SysRes do_clone ( ThreadId ptid, 
-                         ULong flags,
-                         Addr  child_xsp, 
-                         Int*  parent_tidptr, 
-                         Int*  child_tidptr, 
-                         Addr  child_tls )
-{
-   ThreadId     ctid = VG_(alloc_ThreadState)();
-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
-   UWord*       stack;
-   SysRes       res;
-   ULong        x0;
-   vki_sigset_t blockall, savedmask;
-
-   VG_(sigfillset)(&blockall);
-
-   vg_assert(VG_(is_running_thread)(ptid));
-   vg_assert(VG_(is_valid_tid)(ctid));
-
-   stack = (UWord*)ML_(allocstack)(ctid);
-   if (stack == NULL) {
-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
-      goto out;
-   }
-
-   /* Copy register state
-
-      Both parent and child return to the same place, and the code
-      following the clone syscall works out which is which, so we
-      don't need to worry about it.
-
-      The parent gets the child's new tid returned from clone, but the
-      child gets 0.
-
-      If the clone call specifies a NULL xsp for the new thread, then
-      it actually gets a copy of the parent's xsp.
-   */
-   setup_child( &ctst->arch, &ptst->arch );
-
-   /* Make sys_clone appear to have returned Success(0) in the
-      child. */
-   ctst->arch.vex.guest_X0 = 0;
-
-   if (child_xsp != 0)
-      ctst->arch.vex.guest_XSP = child_xsp;
-
-   ctst->os_state.parent = ptid;
-
-   /* inherit signal mask */
-   ctst->sig_mask = ptst->sig_mask;
-   ctst->tmp_sig_mask = ptst->sig_mask;
-
-   /* Start the child with its threadgroup being the same as the
-      parent's.  This is so that any exit_group calls that happen
-      after the child is created but before it sets its
-      os_state.threadgroup field for real (in thread_wrapper in
-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
-      a race condition in which the thread is unkillable (via
-      exit_group) because its threadgroup is not set.  The race window
-      is probably only a few hundred or a few thousand cycles long.
-      See #226116. */
-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
-
-   ML_(guess_and_register_stack)(child_xsp, ctst);
-
-   /* Assume the clone will succeed, and tell any tool that wants to
-      know that this thread has come into existence.  If the clone
-      fails, we'll send out a ll_exit notification for it at the out:
-      label below, to clean up. */
-   vg_assert(VG_(owns_BigLock_LL)(ptid));
-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
-
-   if (flags & VKI_CLONE_SETTLS) {
-      /* Just assign the tls pointer in the guest TPIDR_EL0. */
-      assign_guest_tls(ctid, child_tls);
-   }
-    
-   flags &= ~VKI_CLONE_SETTLS;
-
-   /* start the thread with everything blocked */
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
-
-   x0 = do_syscall_clone_arm64_linux(
-      ML_(start_thread_NORETURN), stack, flags, &VG_(threads)[ctid],
-      child_tidptr, parent_tidptr, NULL
-   );
-    
-   res = VG_(mk_SysRes_arm64_linux)( x0 );
-
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
-
-  out:
-   if (sr_isError(res)) {
-      /* clone failed */
-      VG_(cleanup_thread)(&ctst->arch);
-      ctst->status = VgTs_Empty;
-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
-      VG_TRACK( pre_thread_ll_exit, ctid );
-   }
-
-   return res;
-}
-
-
 /* ---------------------------------------------------------------------
    More thread stuff
    ------------------------------------------------------------------ */
@@ -397,7 +275,6 @@ DECL_TEMPLATE(arm64_linux, sys_mmap);
 //ZZ DECL_TEMPLATE(arm_linux, sys_lstat64);
 //ZZ DECL_TEMPLATE(arm_linux, sys_fstatat64);
 //ZZ DECL_TEMPLATE(arm_linux, sys_fstat64);
-DECL_TEMPLATE(arm64_linux, sys_clone);
 //ZZ DECL_TEMPLATE(arm_linux, sys_sigreturn);
 DECL_TEMPLATE(arm64_linux, sys_rt_sigreturn);
 //ZZ DECL_TEMPLATE(arm_linux, sys_sigsuspend);
@@ -512,110 +389,6 @@ PRE(sys_mmap)
 //ZZ    POST_MEM_WRITE( ARG2, sizeof(struct vki_stat64) );
 //ZZ }
 
-/* Aarch64 seems to use CONFIG_CLONE_BACKWARDS in the kernel.  See:
-      http://dev.gentoo.org/~vapier/aarch64/linux-3.12.6.config
-      http://people.redhat.com/wcohen/aarch64/aarch64_config
-   from linux-3.10.5/kernel/fork.c 
-    #ifdef CONFIG_CLONE_BACKWARDS
-    SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
-                     int __user *, parent_tidptr,
-                     int, tls_val,
-                     int __user *, child_tidptr)
-*/
-PRE(sys_clone)
-{
-   UInt cloneflags;
-
-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
-   PRE_REG_READ5(int, "clone",
-                 unsigned long, flags,
-                 void *, child_stack,
-                 int *, parent_tidptr,
-                 void *, child_tls,
-                 int *, child_tidptr);
-
-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
-                                             VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-//ZZ    if (ARG1 & VKI_CLONE_SETTLS) {
-//ZZ       PRE_MEM_READ("clone(tls_user_desc)", ARG4, sizeof(vki_modify_ldt_t));
-//ZZ       if (!VG_(am_is_valid_for_client)(ARG4, sizeof(vki_modify_ldt_t), 
-//ZZ                                              VKI_PROT_READ)) {
-//ZZ          SET_STATUS_Failure( VKI_EFAULT );
-//ZZ          return;
-//ZZ       }
-//ZZ    }
-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof(Int), 
-                                             VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-
-   cloneflags = ARG1;
-
-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
-      SET_STATUS_Failure( VKI_EINVAL );
-      return;
-   }
-
-   /* Only look at the flags we really care about */
-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
-      /* thread creation */
-      SET_STATUS_from_SysRes(
-         do_clone(tid,
-                  ARG1,         /* flags */
-                  (Addr)ARG2,   /* child SP */
-                  (Int*)ARG3,   /* parent_tidptr */
-                  (Int*)ARG5,   /* child_tidptr */
-                  (Addr)ARG4)); /* tls_val */
-      break;
-
-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
-      /* FALLTHROUGH - assume vfork == fork */
-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
-
-   case 0: /* plain fork */
-      SET_STATUS_from_SysRes(
-         ML_(do_fork_clone)(tid,
-                       cloneflags,     /* flags */
-                       (Int*)ARG3,     /* parent_tidptr */
-                       (Int*)ARG5));   /* child_tidptr */
-      break;
-
-   default:
-      /* should we just ENOSYS? */
-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
-      VG_(message)(Vg_UserMsg, "\n");
-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
-      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
-      VG_(message)(Vg_UserMsg, " - for the Quadrics Elan3 user-space driver\n");
-      VG_(unimplemented)
-         ("Valgrind does not support general clone().");
-   }
-
-   if (SUCCESS) {
-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
-         POST_MEM_WRITE(ARG3, sizeof(Int));
-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
-         POST_MEM_WRITE(ARG5, sizeof(Int));
-
-      /* Thread creation was successful; let the child have the chance
-         to run */
-      *flags |= SfYieldAfter;
-   }
-}
-
 //ZZ PRE(sys_sigreturn)
 //ZZ {
 //ZZ    /* See comments on PRE(sys_rt_sigreturn) in syswrap-amd64-linux.c for
@@ -1072,7 +845,7 @@ static SyscallTableEntry syscall_main_table[] = {
    LINX_(__NR_add_key,           sys_add_key),           // 217
 
    LINXY(__NR_keyctl,            sys_keyctl),            // 219
-   PLAX_(__NR_clone,             sys_clone),             // 220
+   LINX_(__NR_clone,             sys_clone),             // 220
    GENX_(__NR_execve,            sys_execve),            // 221
    PLAX_(__NR_mmap,              sys_mmap),              // 222
    PLAX_(__NR_fadvise64,         sys_fadvise64),         // 223
diff --git a/coregrind/m_syswrap/syswrap-linux.c b/coregrind/m_syswrap/syswrap-linux.c
index b3ffdb1..aa00a5f 100644
--- a/coregrind/m_syswrap/syswrap-linux.c
+++ b/coregrind/m_syswrap/syswrap-linux.c
@@ -93,9 +93,8 @@ static VgSchedReturnCode thread_wrapper(Word /*ThreadId*/ tidW)
    VG_TRACK(pre_thread_first_insn, tid);
 
    tst->os_state.lwpid = VG_(gettid)();
-   /* Set the threadgroup for real.  This overwrites the provisional
-      value set in do_clone() syswrap-*-linux.c.  See comments in
-      do_clone for background, also #226116. */
+   /* Set the threadgroup for real.  This overwrites the provisional value set
+      in do_clone().  See comments in do_clone for background, also #226116. */
    tst->os_state.threadgroup = VG_(getpid)();
 
    /* Thread created with all signals blocked; scheduler will set the
@@ -430,17 +429,327 @@ void VG_(main_thread_wrapper_NORETURN)(ThreadId tid)
    vg_assert(0);
 }
 
+/* Clone a new thread. Note that in the clone syscalls, we hard-code
+   tlsaddr argument as NULL : the guest TLS is emulated via guest
+   registers, and Valgrind itself has no thread local storage. */
+static SysRes clone_new_thread ( Word (*fn)(void *), 
+                                 void* stack, 
+                                 Word  flags, 
+                                 ThreadState* ctst,
+                                 Int* child_tidptr, 
+                                 Int* parent_tidptr)
+{
+   SysRes res;
+   /* Note that in all the below, we make sys_clone appear to have returned
+      Success(0) in the child, by assigning the relevant child guest
+      register(s) just before the clone syscall. */
+#if defined(VGP_x86_linux)
+   Int          eax;
+   ctst->arch.vex.guest_EAX = 0;
+   eax = do_syscall_clone_x86_linux
+      (ML_(start_thread_NORETURN), stack, flags, ctst,
+       child_tidptr, parent_tidptr, NULL);
+   res = VG_(mk_SysRes_x86_linux)( eax );
+#elif defined(VGP_amd64_linux)
+   Long         rax;
+   ctst->arch.vex.guest_RAX = 0;
+   rax = do_syscall_clone_amd64_linux
+      (ML_(start_thread_NORETURN), stack, flags, ctst,
+       child_tidptr, parent_tidptr, NULL);
+   res = VG_(mk_SysRes_amd64_linux)( rax );
+#elif defined(VGP_ppc32_linux)
+   ULong        word64;
+   UInt old_cr = LibVEX_GuestPPC32_get_CR( &ctst->arch.vex );
+   /* %r3 = 0 */
+   ctst->arch.vex.guest_GPR3 = 0;
+   /* %cr0.so = 0 */
+   LibVEX_GuestPPC32_put_CR( old_cr & ~(1<<28), &ctst->arch.vex );
+   word64 = do_syscall_clone_ppc32_linux
+      (ML_(start_thread_NORETURN), stack, flags, ctst,
+       child_tidptr, parent_tidptr, NULL);
+   /* High half word64 is syscall return value.  Low half is
+      the entire CR, from which we need to extract CR0.SO. */
+   /* VG_(printf)("word64 = 0x%llx\n", word64); */
+   res = VG_(mk_SysRes_ppc32_linux)(/*val*/(UInt)(word64 >> 32), 
+                                    /*errflag*/ (((UInt)word64) >> 28) & 1);
+#elif defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
+   ULong        word64;
+   UInt old_cr = LibVEX_GuestPPC64_get_CR( &ctst->arch.vex );
+   /* %r3 = 0 */
+   ctst->arch.vex.guest_GPR3 = 0;
+   /* %cr0.so = 0 */
+   LibVEX_GuestPPC64_put_CR( old_cr & ~(1<<28), &ctst->arch.vex );
+   word64 = do_syscall_clone_ppc64_linux
+      (ML_(start_thread_NORETURN), stack, flags, ctst,
+       child_tidptr, parent_tidptr, NULL);
+   /* Low half word64 is syscall return value.  Hi half is
+      the entire CR, from which we need to extract CR0.SO. */
+   /* VG_(printf)("word64 = 0x%llx\n", word64); */
+   res = VG_(mk_SysRes_ppc64_linux)
+      (/*val*/(UInt)(word64 & 0xFFFFFFFFULL), 
+       /*errflag*/ (UInt)((word64 >> (32+28)) & 1));
+#elif defined(VGP_s390x_linux)
+   ULong        r2;
+   ctst->arch.vex.guest_r2 = 0;
+   r2 = do_syscall_clone_s390x_linux
+      (stack, flags, parent_tidptr, child_tidptr, NULL,
+       ML_(start_thread_NORETURN), ctst);
+   res = VG_(mk_SysRes_s390x_linux)( r2 );
+#elif defined(VGP_arm64_linux)
+   ULong        x0;
+   ctst->arch.vex.guest_X0 = 0;
+   x0 = do_syscall_clone_arm64_linux
+      (ML_(start_thread_NORETURN), stack, flags, ctst,
+       child_tidptr, parent_tidptr, NULL);
+   res = VG_(mk_SysRes_arm64_linux)( x0 );
+#elif defined(VGP_arm_linux)
+   UInt r0;
+   ctst->arch.vex.guest_R0 = 0;
+   r0 = do_syscall_clone_arm_linux
+      (ML_(start_thread_NORETURN), stack, flags, ctst,
+       child_tidptr, parent_tidptr, NULL);
+   res = VG_(mk_SysRes_arm_linux)( r0 );
+#elif defined(VGP_mips64_linux)
+   UInt ret = 0;
+   ctst->arch.vex.guest_r2 = 0;
+   ctst->arch.vex.guest_r7 = 0;
+   ret = do_syscall_clone_mips64_linux
+      (ML_(start_thread_NORETURN), stack, flags, ctst,
+       parent_tidptr, NULL, child_tidptr);
+   res = VG_(mk_SysRes_mips64_linux)( /* val */ ret, 0, /* errflag */ 0);
+#elif defined(VGP_mips32_linux)
+   UInt ret = 0;
+   ctst->arch.vex.guest_r2 = 0;
+   ctst->arch.vex.guest_r7 = 0;
+   ret = do_syscall_clone_mips_linux
+      (ML_(start_thread_NORETURN), stack, flags, ctst,
+       child_tidptr, parent_tidptr, NULL);
+   /* High half word64 is syscall return value.  Low half is
+      the entire CR, from which we need to extract CR0.SO. */ 
+   res = VG_ (mk_SysRes_mips32_linux) (/*val */ ret, 0, /*errflag */ 0);
+#elif defined(VGP_tilegx_linux)
+   Long ret = 0;
+   ctst->arch.vex.guest_r0 = 0;
+   ctst->arch.vex.guest_r3 = 0;
+   ret = do_syscall_clone_tilegx_linux
+      (ML_ (start_thread_NORETURN), stack, flags, ctst,
+       child_tidptr, parent_tidptr, NULL);
+   /* High half word64 is syscall return value. */
+   res = VG_(mk_SysRes_tilegx_linux) (/*val */ ret);
+#else
+# error Unknown platform
+#endif
+   return res;
+}
+
+static void setup_child ( /*OUT*/ ThreadArchState *child, 
+                          /*IN*/  ThreadArchState *parent )
+{  
+   /* We inherit our parent's guest state. */
+   child->vex = parent->vex;
+   child->vex_shadow1 = parent->vex_shadow1;
+   child->vex_shadow2 = parent->vex_shadow2;
 
-/* Do a clone which is really a fork() */
-SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
-                            Int* parent_tidptr, Int* child_tidptr )
+#if defined(VGP_x86_linux)
+   extern void ML_(x86_setup_LDT_GDT) ( /*OUT*/ ThreadArchState *child, 
+                                        /*IN*/  ThreadArchState *parent );
+   ML_(x86_setup_LDT_GDT)(child, parent);
+#endif
+}
+
+static SysRes setup_child_tls (ThreadId ctid, Addr tlsaddr)
+{
+   static const Bool debug = False;
+   ThreadState* ctst = VG_(get_ThreadState)(ctid);
+   // res is succesful by default, overriden if a real syscall is needed/done.
+   SysRes res = VG_(mk_SysRes_Success)(0);
+
+   if (debug)
+      VG_(printf)("clone child has SETTLS: tls at %#lx\n", tlsaddr);
+
+#if defined(VGP_x86_linux)
+   vki_modify_ldt_t* tlsinfo = (vki_modify_ldt_t*)tlsaddr;
+   if (debug)
+      VG_(printf)("clone child has SETTLS: tls info at %p: idx=%u "
+                  "base=%#lx limit=%x; esp=%#x fs=%x gs=%x\n",
+                  tlsinfo, tlsinfo->entry_number, 
+                  tlsinfo->base_addr, tlsinfo->limit,
+                  ctst->arch.vex.guest_ESP,
+                  ctst->arch.vex.guest_FS, ctst->arch.vex.guest_GS);
+   res = ML_(x86_sys_set_thread_area)(ctid, tlsinfo);
+#elif defined(VGP_amd64_linux)
+   ctst->arch.vex.guest_FS_CONST = tlsaddr;
+#elif defined(VGP_ppc32_linux)
+   ctst->arch.vex.guest_GPR2 = tlsaddr;
+#elif defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
+   ctst->arch.vex.guest_GPR13 = tlsaddr;
+#elif defined(VGP_s390x_linux)
+   ctst->arch.vex.guest_a0 = (UInt) (tlsaddr >> 32);
+   ctst->arch.vex.guest_a1 = (UInt) tlsaddr;
+#elif defined(VGP_arm64_linux)
+   /* Just assign the tls pointer in the guest TPIDR_EL0. */
+   ctst->arch.vex.guest_TPIDR_EL0 = tlsaddr;
+#elif defined(VGP_arm_linux)
+   /* Just assign the tls pointer in the guest TPIDRURO. */
+   ctst->arch.vex.guest_TPIDRURO = tlsaddr;
+#elif defined(VGP_mips64_linux)
+   ctst->arch.vex.guest_ULR = tlsaddr;
+   ctst->arch.vex.guest_r27 = tlsaddr;
+#elif defined(VGP_mips32_linux)
+   ctst->arch.vex.guest_ULR = tlsaddr;
+   ctst->arch.vex.guest_r27 = tlsaddr;
+#elif defined(VGP_tilegx_linux)
+   ctst->arch.vex.guest_r53 = tlsaddr;
+#else
+# error Unknown platform
+#endif
+   return res;
+} 
+
+/* 
+   When a client clones, we need to keep track of the new thread.  This means:
+   1. allocate a ThreadId+ThreadState+stack for the thread
+
+   2. initialize the thread's new VCPU state
+
+   3. create the thread using the same args as the client requested,
+   but using the scheduler entrypoint for EIP, and a separate stack
+   for ESP.
+ */
+static SysRes do_clone ( ThreadId ptid, 
+                         UWord flags, Addr sp, 
+                         Int* parent_tidptr, 
+                         Int* child_tidptr, 
+                         Addr tlsaddr)
+{
+   ThreadId     ctid = VG_(alloc_ThreadState)();
+   ThreadState* ptst = VG_(get_ThreadState)(ptid);
+   ThreadState* ctst = VG_(get_ThreadState)(ctid);
+   UWord*       stack;
+   SysRes       res;
+   vki_sigset_t blockall, savedmask;
+
+   VG_(sigfillset)(&blockall);
+
+   vg_assert(VG_(is_running_thread)(ptid));
+   vg_assert(VG_(is_valid_tid)(ctid));
+
+   stack = (UWord*)ML_(allocstack)(ctid);
+   if (stack == NULL) {
+      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
+      goto out;
+   }
+
+   /* Copy register state
+
+      Both parent and child return to the same place, and the code
+      following the clone syscall works out which is which, so we
+      don't need to worry about it.
+
+      The parent gets the child's new tid returned from clone, but the
+      child gets 0.
+
+      If the clone call specifies a NULL sp for the new thread, then
+      it actually gets a copy of the parent's sp.
+   */
+   setup_child( &ctst->arch, &ptst->arch );
+
+   if (sp != 0)
+      VG_(set_SP)(ctid, sp);
+
+   ctst->os_state.parent = ptid;
+
+   /* inherit signal mask */
+   ctst->sig_mask     = ptst->sig_mask;
+   ctst->tmp_sig_mask = ptst->sig_mask;
+
+   /* Start the child with its threadgroup being the same as the
+      parent's.  This is so that any exit_group calls that happen
+      after the child is created but before it sets its
+      os_state.threadgroup field for real (in thread_wrapper in
+      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
+      a race condition in which the thread is unkillable (via
+      exit_group) because its threadgroup is not set.  The race window
+      is probably only a few hundred or a few thousand cycles long.
+      See #226116. */
+   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
+
+   ML_(guess_and_register_stack) (sp, ctst);
+   
+   /* Assume the clone will succeed, and tell any tool that wants to
+      know that this thread has come into existence.  We cannot defer
+      it beyond this point because setup_tls, just below,
+      causes checks to assert by making references to the new ThreadId
+      if we don't state the new thread exists prior to that point.
+      If the clone fails, we'll send out a ll_exit notification for it
+      at the out: label below, to clean up. */
+   vg_assert(VG_(owns_BigLock_LL)(ptid));
+   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
+
+   if (flags & VKI_CLONE_SETTLS) {
+      res = setup_child_tls(ctid, tlsaddr);
+      if (sr_isError(res))
+	 goto out;
+   }
+   flags &= ~VKI_CLONE_SETTLS;
+
+   /* start the thread with everything blocked */
+   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
+
+   /* Create the new thread */
+   res = clone_new_thread ( ML_(start_thread_NORETURN), stack, flags, ctst,
+                            child_tidptr, parent_tidptr);
+
+   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
+
+  out:
+   if (sr_isError(res)) {
+      /* clone failed */
+      VG_(cleanup_thread)(&ctst->arch);
+      ctst->status = VgTs_Empty;
+      /* oops.  Better tell the tool the thread exited in a hurry :-) */
+      VG_TRACK( pre_thread_ll_exit, ctid );
+   }
+
+#if defined(VGP_mips64_linux) || defined(VGP_mips32_linux)
+   // ??? why do we set unconditionally r2 to 0, even when error out ???
+   ptst->arch.vex.guest_r2 = 0;
+#elif defined(VGP_tilegx_linux)
+   // ??? why do we set unconditionally r0 to 0, even when error out ???
+   ptst->arch.vex.guest_r0 = 0;
+#endif
+
+   return res;
+}
+
+/* Do a clone which is really a fork().
+   ML_(do_fork_clone) uses the clone syscall to fork a child process.
+   Note that this should not be called for a thread creation.
+   Also, some flags combinations are not supported, and such combinations
+   are handled either by masking the non supported flags or by asserting.
+
+   The CLONE_VFORK flag is accepted, as this just tells that the parent is
+   suspended till the child exits or calls execve. We better keep this flag,
+   just in case the guests parent/client code depends on this synchronisation.
+
+   We cannot keep the flag CLONE_VM, as Valgrind will do whatever host
+   instructions in the child process, that will mess up the parent host
+   memory. So, we hope for the best and assumes that the guest application does
+   not (really) depends on sharing the memory between parent and child in the
+   interval between clone and exits/execve.
+
+   If child_sp != 0, the child (guest) sp will be set to child_sp just after the
+   clone syscall, before child guest instructions are executed. */
+static SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
+                                   Int* parent_tidptr, Int* child_tidptr,
+                                   Addr child_sp)
 {
    vki_sigset_t fork_saved_mask;
    vki_sigset_t mask;
    SysRes       res;
 
    if (flags & (VKI_CLONE_SETTLS | VKI_CLONE_FS | VKI_CLONE_VM 
-                | VKI_CLONE_FILES | VKI_CLONE_VFORK))
+                | VKI_CLONE_FILES))
       return VG_(mk_SysRes_Error)( VKI_EINVAL );
 
    /* Block all signals during fork, so that we can fix things up in
@@ -476,6 +785,8 @@ SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
 
    if (!sr_isError(res) && sr_Res(res) == 0) {
       /* child */
+      if (child_sp != 0)
+          VG_(set_SP)(tid, child_sp);
       VG_(do_atfork_child)(tid);
 
       /* restore signal mask */
@@ -508,7 +819,6 @@ SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
    return res;
 }
 
-
 /* ---------------------------------------------------------------------
    PRE/POST wrappers for arch-generic, Linux-specific syscalls
    ------------------------------------------------------------------ */
@@ -519,6 +829,157 @@ SysRes ML_(do_fork_clone) ( ThreadId tid, UInt flags,
 #error Unknown endianness
 #endif
 
+PRE(sys_clone)
+{
+   UInt cloneflags;
+   Bool badarg = False;
+
+   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
+
+// Order of arguments differs between platforms.
+#if defined(VGP_x86_linux) \
+    || defined(VGP_ppc32_linux) \
+    || defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)	\
+    || defined(VGP_arm_linux) || defined(VGP_mips32_linux) \
+    || defined(VGP_mips64_linux) || defined(VGP_arm64_linux)
+#define ARG_CHILD_TIDPTR ARG5
+#define PRA_CHILD_TIDPTR PRA5
+#define ARG_TLS          ARG4
+#define PRA_TLS          PRA4
+#elif defined(VGP_amd64_linux) || defined(VGP_tilegx_linux) \
+    || defined(VGP_s390x_linux)
+#define ARG_CHILD_TIDPTR ARG4
+#define PRA_CHILD_TIDPTR PRA4
+#define ARG_TLS          ARG5
+#define PRA_TLS          PRA5
+#else
+# error Unknown platform
+#endif
+// And s390x is even more special, and inverts flags and child stack args
+#if defined(VGP_s390x_linux)
+#define ARG_FLAGS       ARG2
+#define PRA_FLAGS       PRA2
+#define ARG_CHILD_STACK ARG1
+#define PRA_CHILD_STACK PRA1
+#else
+#define ARG_FLAGS       ARG1
+#define PRA_FLAGS       PRA1
+#define ARG_CHILD_STACK ARG2
+#define PRA_CHILD_STACK PRA2
+#endif
+
+   if (VG_(tdict).track_pre_reg_read) {
+      PRA_FLAGS("clone", unsigned long, flags);
+      PRA_CHILD_STACK("clone",  void *, child_stack);
+   }
+
+   if (ARG_FLAGS & VKI_CLONE_PARENT_SETTID) {
+      if (VG_(tdict).track_pre_reg_read) {
+         PRA3("clone", int *, parent_tidptr);
+      }
+      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
+      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
+                                             VKI_PROT_WRITE)) {
+         badarg = True;
+      }
+   }
+   if (ARG_FLAGS & VKI_CLONE_SETTLS) {
+      if (VG_(tdict).track_pre_reg_read) {
+         PRA_TLS("clone", vki_modify_ldt_t *, tlsinfo);
+      }
+      /* Not very clear what is vki_modify_ldt_t: for many platforms, it is a
+         dummy type (that we define as a char). We only dereference/check the
+         ARG_TLS pointer if the type looks like a real type, i.e. sizeof > 1. */
+      if (sizeof(vki_modify_ldt_t) > 1) {
+         PRE_MEM_READ("clone(tlsinfo)", ARG_TLS, sizeof(vki_modify_ldt_t));
+         if (!VG_(am_is_valid_for_client)(ARG_TLS, sizeof(vki_modify_ldt_t), 
+                                          VKI_PROT_READ)) {
+            badarg = True;
+         }
+      }
+   }
+   if (ARG_FLAGS & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
+      if (VG_(tdict).track_pre_reg_read) {
+         PRA_CHILD_TIDPTR("clone", int *, child_tidptr);
+      }
+      PRE_MEM_WRITE("clone(child_tidptr)", ARG_CHILD_TIDPTR, sizeof(Int));
+      if (!VG_(am_is_valid_for_client)(ARG_CHILD_TIDPTR, sizeof(Int), 
+                                             VKI_PROT_WRITE)) {
+         badarg = True;
+      }
+   }
+
+   if (badarg) {
+      SET_STATUS_Failure( VKI_EFAULT );
+      return;
+   }
+
+   cloneflags = ARG_FLAGS;
+
+   if (!ML_(client_signal_OK)(ARG_FLAGS & VKI_CSIGNAL)) {
+      SET_STATUS_Failure( VKI_EINVAL );
+      return;
+   }
+
+   /* Only look at the flags we really care about */
+   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
+                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
+   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
+      /* thread creation */
+      SET_STATUS_from_SysRes(
+         do_clone(tid,
+                  ARG_FLAGS,               /* flags */
+                  (Addr)ARG_CHILD_STACK,   /* child ESP */
+                  (Int*)ARG3,              /* parent_tidptr */
+                  (Int*)ARG_CHILD_TIDPTR,  /* child_tidptr */
+                  (Addr)ARG_TLS));         /* set_tls */
+      break;
+
+   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
+      // FALLTHROUGH - assume vfork (somewhat) == fork, see ML_(do_fork_clone).
+      cloneflags &= ~VKI_CLONE_VM;
+
+   case 0: /* plain fork */
+      SET_STATUS_from_SysRes(
+         ML_(do_fork_clone)(tid,
+                       cloneflags,      /* flags */
+                       (Int*)ARG3,     /* parent_tidptr */
+                       (Int*)ARG_CHILD_TIDPTR,     /* child_tidptr */
+                       (Addr)ARG_CHILD_STACK));
+      break;
+
+   default:
+      /* should we just ENOSYS? */
+      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG_FLAGS);
+      VG_(message)(Vg_UserMsg, "\n");
+      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
+      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
+      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
+      VG_(unimplemented)
+         ("Valgrind does not support general clone().");
+   }
+
+   if (SUCCESS) {
+      if (ARG_FLAGS & VKI_CLONE_PARENT_SETTID)
+         POST_MEM_WRITE(ARG3, sizeof(Int));
+      if (ARG_FLAGS & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
+         POST_MEM_WRITE(ARG_CHILD_TIDPTR, sizeof(Int));
+
+      /* Thread creation was successful; let the child have the chance
+         to run */
+      *flags |= SfYieldAfter;
+   }
+
+#undef ARG_CHILD_TIDPTR
+#undef PRA_CHILD_TIDPTR
+#undef ARG_TLS
+#undef PRA_TLS
+#undef ARG_FLAGS
+#undef PRA_FLAGS
+#undef ARG_CHILD_STACK
+#undef PRA_CHILD_STACK
+}
+
 /* ---------------------------------------------------------------------
    *mount wrappers
    ------------------------------------------------------------------ */
diff --git a/coregrind/m_syswrap/syswrap-mips64-linux.c b/coregrind/m_syswrap/syswrap-mips64-linux.c
index 6e3db74..d3d70c5 100644
--- a/coregrind/m_syswrap/syswrap-mips64-linux.c
+++ b/coregrind/m_syswrap/syswrap-mips64-linux.c
@@ -136,14 +136,7 @@ asm (
 #define __NR_CLONE        __NR_clone
 #define __NR_EXIT         __NR_exit
 
-ULong do_syscall_clone_mips64_linux ( Word (*fn) (void *),  /* a0 - 4 */
-                                      void* stack,          /* a1 - 5 */
-                                      Int   flags,          /* a2 - 6 */
-                                      void* arg,            /* a3 - 7 */
-                                      Int*  parent_tid,     /* a4 - 8 */
-                                      void* /* Int tls */,  /* a5 - 9 */
-                                      Int*  child_tid );    /* a6 - 10 */
-
+// See priv_syswrap-linux.h for arg profile.
 asm(
 ".text\n" 
 ".set noreorder\n"
@@ -199,104 +192,13 @@ asm(
 #undef __NR_EXIT
 
 /* forward declarations */
-static void setup_child ( ThreadArchState *, ThreadArchState *);
 static SysRes sys_set_tls ( ThreadId tid, Addr tlsptr);
 
-/* When a client clones, we need to keep track of the new thread. This means:
-   1. allocate a ThreadId+ThreadState+stack for the thread
-
-   2. initialize the thread's new VCPU state
-
-   3. create the thread using the same args as the client requested, but using
-      the scheduler entrypoint for IP, and a separate stack for SP. */
-static SysRes do_clone ( ThreadId ptid,
-                         UInt flags, Addr sp,
-                         Int* parent_tidptr,
-                         Int* child_tidptr,
-                         Addr child_tls )
-{
-   const Bool debug = False;
-   ThreadId ctid = VG_ (alloc_ThreadState) ();
-   ThreadState * ptst = VG_ (get_ThreadState) (ptid);
-   ThreadState * ctst = VG_ (get_ThreadState) (ctid);
-   UInt ret = 0;
-   UWord * stack;
-   SysRes res;
-   vki_sigset_t blockall, savedmask;
-
-   VG_(sigfillset)(&blockall);
-   vg_assert(VG_(is_running_thread)(ptid));
-   vg_assert(VG_(is_valid_tid)(ctid));
-   stack = (UWord *)ML_(allocstack)(ctid);
-   if (stack == NULL) {
-      res = VG_(mk_SysRes_Error)(VKI_ENOMEM);
-      goto out;
-   }
-   setup_child(&ctst->arch, &ptst->arch);
-
-   /* on MIPS we need to set V0 and A3 to zero */
-   ctst->arch.vex.guest_r2 = 0;
-   ctst->arch.vex.guest_r7 = 0;
-   if (sp != 0)
-      ctst->arch.vex.guest_r29 = sp;
-
-   ctst->os_state.parent = ptid;
-   ctst->sig_mask = ptst->sig_mask;
-   ctst->tmp_sig_mask = ptst->sig_mask;
-
-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
-
-   ML_(guess_and_register_stack) (sp, ctst);
-
-   VG_TRACK(pre_thread_ll_create, ptid, ctid);
-   if (flags & VKI_CLONE_SETTLS) {
-       if (debug)
-         VG_(printf)("clone child has SETTLS: tls at %#lx\n", child_tls);
-       res = sys_set_tls(ctid, child_tls);
-       if (sr_isError(res))
-          goto out;
-       ctst->arch.vex.guest_r27 = child_tls;
-   }
-
-   flags &= ~VKI_CLONE_SETTLS;
-   VG_ (sigprocmask) (VKI_SIG_SETMASK, &blockall, &savedmask);
-   /* Create the new thread */
-   ret = do_syscall_clone_mips64_linux(ML_(start_thread_NORETURN),
-                                       stack, flags, &VG_(threads)[ctid],
-                                       parent_tidptr, NULL /*child_tls*/,
-                                       child_tidptr);
-   if (debug)
-     VG_(printf)("ret: 0x%x\n", ret);
-
-   res = VG_(mk_SysRes_mips64_linux)( /* val */ ret, 0, /* errflag */ 0);
-
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
-
-   out:
-   if (sr_isError (res)) {
-      VG_ (cleanup_thread) (&ctst->arch);
-      ctst->status = VgTs_Empty;
-      VG_TRACK (pre_thread_ll_exit, ctid);
-   }
-   ptst->arch.vex.guest_r2 = 0;
-
-   return res;
-}
-
 /* ---------------------------------------------------------------------
                           More thread stuff
    ------------------------------------------------------------------ */
 void VG_(cleanup_thread) ( ThreadArchState * arch ) { };
 
-void setup_child ( /* OUT */ ThreadArchState * child,
-                   /* IN  */ ThreadArchState * parent )
-{
-   /* We inherit our parent's guest state. */
-   child->vex = parent->vex;
-   child->vex_shadow1 = parent->vex_shadow1;
-   child->vex_shadow2 = parent->vex_shadow2;
-}
-
 SysRes sys_set_tls ( ThreadId tid, Addr tlsptr )
 {
    VG_(threads)[tid].arch.vex.guest_ULR = tlsptr;
@@ -316,7 +218,6 @@ SysRes sys_set_tls ( ThreadId tid, Addr tlsptr )
    file, but that requires even more macro magic. */
 
 DECL_TEMPLATE (mips_linux, sys_set_thread_area);
-DECL_TEMPLATE (mips_linux, sys_clone);
 DECL_TEMPLATE (mips_linux, sys_tee);
 DECL_TEMPLATE (mips_linux, sys_splice);
 DECL_TEMPLATE (mips_linux, sys_vmsplice);
@@ -494,84 +395,6 @@ PRE(sys_mmap)
                                  (Off64T) ARG6);
    SET_STATUS_from_SysRes(r);
 }
-
-PRE(sys_clone)
-{
-   Bool badarg = False;
-   UInt cloneflags;
-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )", ARG1, ARG2, ARG3,
-                                                      ARG4, ARG5);
-   PRE_REG_READ2(int, "clone", unsigned long, flags, void *, child_stack);
-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
-      if (VG_(tdict).track_pre_reg_read) {
-         PRA3("clone", int *, parent_tidptr);
-      }
-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), VKI_PROT_WRITE)) {
-         badarg = True;
-      }
-   }
-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
-      if (VG_(tdict).track_pre_reg_read) {
-         PRA5("clone", int *, child_tidptr);
-      }
-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof (Int));
-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof (Int), VKI_PROT_WRITE))
-         badarg = True;
-   }
-   if (badarg) {
-      SET_STATUS_Failure(VKI_EFAULT);
-      return;
-   }
-   cloneflags = ARG1;
-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
-      SET_STATUS_Failure(VKI_EINVAL);
-      return;
-   }
-   /* Only look at the flags we really care about */
-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS
-           |VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
-      case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
-         /* thread creation */
-         SET_STATUS_from_SysRes(do_clone(tid,
-                                         ARG1,          /* flags */
-                                         (Addr)ARG2,    /* child SP */
-                                         (Int *)ARG3,   /* parent_tidptr */
-                                         (Int *)ARG5,   /* child_tidptr */
-                                         (Addr)ARG4));  /* child_tls */
-         break;
-
-      case VKI_CLONE_VFORK | VKI_CLONE_VM:  /* vfork */
-         /* FALLTHROUGH - assume vfork == fork */
-         cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
-      case 0:  /* plain fork */
-         SET_STATUS_from_SysRes(ML_(do_fork_clone)(tid,
-                                cloneflags,     /* flags */
-                                (Int *)ARG3,    /* parent_tidptr */
-                                (Int *)ARG5));  /* child_tidptr */
-         break;
-
-      default:
-         /* should we just ENOSYS? */
-         VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
-         VG_(message)(Vg_UserMsg, "\n");
-         VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
-         VG_(message)(Vg_UserMsg,
-                       " - via a threads library (LinuxThreads or NPTL)\n");
-         VG_(message)(Vg_UserMsg,
-                       " - via the implementation of fork or vfork\n");
-         VG_(unimplemented)("Valgrind does not support general clone().");
-   }
-   if (SUCCESS) {
-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
-         POST_MEM_WRITE(ARG3, sizeof(Int));
-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
-         POST_MEM_WRITE(ARG5, sizeof(Int));
-      /* Thread creation was successful; let the child have the chance to run */
-      *flags |= SfYieldAfter;
-   }
-}
-
 PRE(sys_rt_sigreturn)
 {
    /* See comments on PRE(sys_rt_sigreturn) in syswrap-s390x-linux.c for
@@ -766,7 +589,7 @@ static SyscallTableEntry syscall_main_table[] = {
    LINXY (__NR_socketpair, sys_socketpair),
    LINX_ (__NR_setsockopt, sys_setsockopt),
    LINXY (__NR_getsockopt, sys_getsockopt),
-   PLAX_ (__NR_clone, sys_clone),
+   LINX_ (__NR_clone, sys_clone),
    GENX_ (__NR_fork, sys_fork),
    GENX_ (__NR_execve, sys_execve),
    GENX_ (__NR_exit, sys_exit),
diff --git a/coregrind/m_syswrap/syswrap-ppc32-linux.c b/coregrind/m_syswrap/syswrap-ppc32-linux.c
index 379fcb3..a654a90 100644
--- a/coregrind/m_syswrap/syswrap-ppc32-linux.c
+++ b/coregrind/m_syswrap/syswrap-ppc32-linux.c
@@ -146,14 +146,7 @@ asm(
 #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
 #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
 
-extern
-ULong do_syscall_clone_ppc32_linux ( Word (*fn)(void *), 
-                                     void* stack, 
-                                     Int   flags, 
-                                     void* arg,
-                                     Int*  child_tid, 
-                                     Int*  parent_tid, 
-                                     vki_modify_ldt_t * );
+// See priv_syswrap-linux.h for arg profile.
 asm(
 ".text\n"
 ".globl do_syscall_clone_ppc32_linux\n"
@@ -216,145 +209,6 @@ asm(
 #undef __NR_CLONE
 #undef __NR_EXIT
 
-// forward declarations
-static void setup_child ( ThreadArchState*, ThreadArchState* );
-
-/* 
-   When a client clones, we need to keep track of the new thread.  This means:
-   1. allocate a ThreadId+ThreadState+stack for the thread
-
-   2. initialize the thread's new VCPU state
-
-   3. create the thread using the same args as the client requested,
-   but using the scheduler entrypoint for IP, and a separate stack
-   for SP.
- */
-static SysRes do_clone ( ThreadId ptid, 
-                         UInt flags, Addr sp, 
-                         Int *parent_tidptr, 
-                         Int *child_tidptr, 
-                         Addr child_tls)
-{
-   const Bool debug = False;
-
-   ThreadId     ctid = VG_(alloc_ThreadState)();
-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
-   ULong        word64;
-   UWord*       stack;
-   SysRes       res;
-   vki_sigset_t blockall, savedmask;
-
-   VG_(sigfillset)(&blockall);
-
-   vg_assert(VG_(is_running_thread)(ptid));
-   vg_assert(VG_(is_valid_tid)(ctid));
-
-   stack = (UWord*)ML_(allocstack)(ctid);
-   if (stack == NULL) {
-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
-      goto out;
-   }
-
-//?   /* make a stack frame */
-//?   stack -= 16;
-//?   *(UWord *)stack = 0;
-
-
-   /* Copy register state
-
-      Both parent and child return to the same place, and the code
-      following the clone syscall works out which is which, so we
-      don't need to worry about it.
-
-      The parent gets the child's new tid returned from clone, but the
-      child gets 0.
-
-      If the clone call specifies a NULL SP for the new thread, then
-      it actually gets a copy of the parent's SP.
-
-      The child's TLS register (r2) gets set to the tlsaddr argument
-      if the CLONE_SETTLS flag is set.
-   */
-   setup_child( &ctst->arch, &ptst->arch );
-
-   /* Make sys_clone appear to have returned Success(0) in the
-      child. */
-   { UInt old_cr = LibVEX_GuestPPC32_get_CR( &ctst->arch.vex );
-     /* %r3 = 0 */
-     ctst->arch.vex.guest_GPR3 = 0;
-     /* %cr0.so = 0 */
-     LibVEX_GuestPPC32_put_CR( old_cr & ~(1<<28), &ctst->arch.vex );
-   }
-
-   if (sp != 0)
-      ctst->arch.vex.guest_GPR1 = sp;
-
-   ctst->os_state.parent = ptid;
-
-   /* inherit signal mask */
-   ctst->sig_mask = ptst->sig_mask;
-   ctst->tmp_sig_mask = ptst->sig_mask;
-
-   /* Start the child with its threadgroup being the same as the
-      parent's.  This is so that any exit_group calls that happen
-      after the child is created but before it sets its
-      os_state.threadgroup field for real (in thread_wrapper in
-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
-      a race condition in which the thread is unkillable (via
-      exit_group) because its threadgroup is not set.  The race window
-      is probably only a few hundred or a few thousand cycles long.
-      See #226116. */
-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
-
-   ML_(guess_and_register_stack) (sp, ctst);
-
-   /* Assume the clone will succeed, and tell any tool that wants to
-      know that this thread has come into existence.  If the clone
-      fails, we'll send out a ll_exit notification for it at the out:
-      label below, to clean up. */
-   vg_assert(VG_(owns_BigLock_LL)(ptid));
-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
-
-   if (flags & VKI_CLONE_SETTLS) {
-      if (debug)
-         VG_(printf)("clone child has SETTLS: tls at %#lx\n", child_tls);
-      ctst->arch.vex.guest_GPR2 = child_tls;
-   }
-
-   flags &= ~VKI_CLONE_SETTLS;
-
-   /* start the thread with everything blocked */
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
-
-   /* Create the new thread */
-   word64 = do_syscall_clone_ppc32_linux(
-               ML_(start_thread_NORETURN), stack, flags, &VG_(threads)[ctid],
-               child_tidptr, parent_tidptr, NULL
-            );
-   /* High half word64 is syscall return value.  Low half is
-      the entire CR, from which we need to extract CR0.SO. */
-   /* VG_(printf)("word64 = 0x%llx\n", word64); */
-   res = VG_(mk_SysRes_ppc32_linux)( 
-            /*val*/(UInt)(word64 >> 32), 
-            /*errflag*/ (((UInt)word64) >> 28) & 1 
-         );
-
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
-
-  out:
-   if (sr_isError(res)) {
-      /* clone failed */
-      VG_(cleanup_thread)(&ctst->arch);
-      ctst->status = VgTs_Empty;
-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
-      VG_TRACK( pre_thread_ll_exit, ctid );
-   }
-
-   return res;
-}
-
-
 
 /* ---------------------------------------------------------------------
    More thread stuff
@@ -364,16 +218,6 @@ void VG_(cleanup_thread) ( ThreadArchState* arch )
 {
 }  
 
-void setup_child ( /*OUT*/ ThreadArchState *child,
-                   /*IN*/  ThreadArchState *parent )
-{
-   /* We inherit our parent's guest state. */
-   child->vex = parent->vex;
-   child->vex_shadow1 = parent->vex_shadow1;
-   child->vex_shadow2 = parent->vex_shadow2;
-}
-
-
 /* ---------------------------------------------------------------------
    PRE/POST wrappers for ppc32/Linux-specific syscalls
    ------------------------------------------------------------------ */
@@ -393,7 +237,6 @@ DECL_TEMPLATE(ppc32_linux, sys_stat64);
 DECL_TEMPLATE(ppc32_linux, sys_lstat64);
 DECL_TEMPLATE(ppc32_linux, sys_fstatat64);
 DECL_TEMPLATE(ppc32_linux, sys_fstat64);
-DECL_TEMPLATE(ppc32_linux, sys_clone);
 DECL_TEMPLATE(ppc32_linux, sys_sigreturn);
 DECL_TEMPLATE(ppc32_linux, sys_rt_sigreturn);
 DECL_TEMPLATE(ppc32_linux, sys_sigsuspend);
@@ -530,91 +373,6 @@ POST(sys_fstat64)
 //..    }
 //.. }
 
-PRE(sys_clone)
-{
-   UInt cloneflags;
-
-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
-   PRE_REG_READ5(int, "clone",
-                 unsigned long, flags,
-                 void *,        child_stack,
-                 int *,         parent_tidptr,
-                 void *,        child_tls,
-                 int *,         child_tidptr);
-
-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
-                                             VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof(Int), 
-                                             VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-
-   cloneflags = ARG1;
-
-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
-      SET_STATUS_Failure( VKI_EINVAL );
-      return;
-   }
-
-   /* Only look at the flags we really care about */
-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
-      /* thread creation */
-      SET_STATUS_from_SysRes(
-         do_clone(tid,
-                  ARG1,         /* flags */
-                  (Addr)ARG2,   /* child SP */
-                  (Int *)ARG3,  /* parent_tidptr */
-                  (Int *)ARG5,  /* child_tidptr */
-                  (Addr)ARG4)); /* child_tls */
-      break;
-
-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
-      /* FALLTHROUGH - assume vfork == fork */
-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
-
-   case 0: /* plain fork */
-      SET_STATUS_from_SysRes(
-         ML_(do_fork_clone)(tid,
-                       cloneflags,      /* flags */
-                       (Int *)ARG3,     /* parent_tidptr */
-                       (Int *)ARG5));   /* child_tidptr */
-      break;
-
-   default:
-      /* should we just ENOSYS? */
-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
-      VG_(message)(Vg_UserMsg, "\n");
-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
-      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
-      VG_(unimplemented)
-         ("Valgrind does not support general clone().");
-   }
-
-   if (SUCCESS) {
-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
-         POST_MEM_WRITE(ARG3, sizeof(Int));
-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
-         POST_MEM_WRITE(ARG5, sizeof(Int));
-
-      /* Thread creation was successful; let the child have the chance
-         to run */
-      *flags |= SfYieldAfter;
-   }
-}
-
 PRE(sys_sigreturn)
 {
    /* See comments on PRE(sys_rt_sigreturn) in syswrap-amd64-linux.c for
@@ -999,7 +757,7 @@ static SyscallTableEntry syscall_table[] = {
    GENX_(__NR_fsync,             sys_fsync),             // 118
    PLAX_(__NR_sigreturn,         sys_sigreturn),         // 119 ?/Linux
 //.. 
-   PLAX_(__NR_clone,             sys_clone),             // 120
+   LINX_(__NR_clone,             sys_clone),             // 120
 //..    //   (__NR_setdomainname,     sys_setdomainname),     // 121 */*(?)
    GENXY(__NR_uname,             sys_newuname),          // 122
 //..    PLAX_(__NR_modify_ldt,        sys_modify_ldt),        // 123
diff --git a/coregrind/m_syswrap/syswrap-ppc64-linux.c b/coregrind/m_syswrap/syswrap-ppc64-linux.c
index 1ae4454..f90140d 100644
--- a/coregrind/m_syswrap/syswrap-ppc64-linux.c
+++ b/coregrind/m_syswrap/syswrap-ppc64-linux.c
@@ -209,14 +209,7 @@ asm(
 #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
 #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
 
-extern
-ULong do_syscall_clone_ppc64_linux ( Word (*fn)(void *), 
-                                     void* stack, 
-                                     Int   flags, 
-                                     void* arg,
-                                     Int*  child_tid, 
-                                     Int*  parent_tid, 
-                                     void/*vki_modify_ldt_t*/ * );
+// See priv_syswrap-linux.h for arg profile.
 asm(
 #if defined(VGP_ppc64be_linux)
 "   .align   2\n"
@@ -366,148 +359,6 @@ asm(
 #undef __NR_CLONE
 #undef __NR_EXIT
 
-// forward declarations
-static void setup_child ( ThreadArchState*, ThreadArchState* );
-
-/* 
-   When a client clones, we need to keep track of the new thread.  This means:
-   1. allocate a ThreadId+ThreadState+stack for the thread
-
-   2. initialize the thread's new VCPU state
-
-   3. create the thread using the same args as the client requested,
-   but using the scheduler entrypoint for IP, and a separate stack
-   for SP.
- */
-static SysRes do_clone ( ThreadId ptid, 
-                         UInt flags, Addr sp, 
-                         Int *parent_tidptr, 
-                         Int *child_tidptr, 
-                         Addr child_tls)
-{
-   const Bool debug = False;
-
-   ThreadId     ctid = VG_(alloc_ThreadState)();
-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
-   ULong        word64;
-   UWord*       stack;
-   SysRes       res;
-   vki_sigset_t blockall, savedmask;
-
-   VG_(sigfillset)(&blockall);
-
-   vg_assert(VG_(is_running_thread)(ptid));
-   vg_assert(VG_(is_valid_tid)(ctid));
-
-   stack = (UWord*)ML_(allocstack)(ctid);
-   if (stack == NULL) {
-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
-      goto out;
-   }
-
-//?   /* make a stack frame */
-//?   stack -= 16;
-//?   *(UWord *)stack = 0;
-
-
-   /* Copy register state
-
-      Both parent and child return to the same place, and the code
-      following the clone syscall works out which is which, so we
-      don't need to worry about it.
-
-      The parent gets the child's new tid returned from clone, but the
-      child gets 0.
-
-      If the clone call specifies a NULL SP for the new thread, then
-      it actually gets a copy of the parent's SP.
-
-      The child's TLS register (r2) gets set to the tlsaddr argument
-      if the CLONE_SETTLS flag is set.
-   */
-   setup_child( &ctst->arch, &ptst->arch );
-
-   /* Make sys_clone appear to have returned Success(0) in the
-      child. */
-   { UInt old_cr = LibVEX_GuestPPC64_get_CR( &ctst->arch.vex );
-     /* %r3 = 0 */
-     ctst->arch.vex.guest_GPR3 = 0;
-     /* %cr0.so = 0 */
-     LibVEX_GuestPPC64_put_CR( old_cr & ~(1<<28), &ctst->arch.vex );
-   }
-
-   if (sp != 0)
-      ctst->arch.vex.guest_GPR1 = sp;
-
-   ctst->os_state.parent = ptid;
-
-   /* inherit signal mask */
-   ctst->sig_mask = ptst->sig_mask;
-   ctst->tmp_sig_mask = ptst->sig_mask;
-
-   /* Start the child with its threadgroup being the same as the
-      parent's.  This is so that any exit_group calls that happen
-      after the child is created but before it sets its
-      os_state.threadgroup field for real (in thread_wrapper in
-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
-      a race condition in which the thread is unkillable (via
-      exit_group) because its threadgroup is not set.  The race window
-      is probably only a few hundred or a few thousand cycles long.
-      See #226116. */
-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
-
-   ML_(guess_and_register_stack) (sp, ctst);
-
-   /* Assume the clone will succeed, and tell any tool that wants to
-      know that this thread has come into existence.  If the clone
-      fails, we'll send out a ll_exit notification for it at the out:
-      label below, to clean up. */
-   vg_assert(VG_(owns_BigLock_LL)(ptid));
-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
-
-   if (flags & VKI_CLONE_SETTLS) {
-      if (debug)
-         VG_(printf)("clone child has SETTLS: tls at %#lx\n", child_tls);
-      ctst->arch.vex.guest_GPR13 = child_tls;
-   }
-
-   flags &= ~VKI_CLONE_SETTLS;
-
-   /* start the thread with everything blocked */
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
-
-   /* Create the new thread */
-   word64 = do_syscall_clone_ppc64_linux(
-               ML_(start_thread_NORETURN),
-               stack, flags, &VG_(threads)[ctid],
-               child_tidptr, parent_tidptr, NULL
-            );
-
-   /* Low half word64 is syscall return value.  Hi half is
-      the entire CR, from which we need to extract CR0.SO. */
-   /* VG_(printf)("word64 = 0x%llx\n", word64); */
-   res = VG_(mk_SysRes_ppc64_linux)( 
-            /*val*/(UInt)(word64 & 0xFFFFFFFFULL), 
-            /*errflag*/ (UInt)((word64 >> (32+28)) & 1)
-         );
-
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
-
-  out:
-   if (sr_isError(res)) {
-      /* clone failed */
-      VG_(cleanup_thread)(&ctst->arch);
-      ctst->status = VgTs_Empty;
-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
-      VG_TRACK( pre_thread_ll_exit, ctid );
-   }
-
-   return res;
-}
-
-
-
 /* ---------------------------------------------------------------------
    More thread stuff
    ------------------------------------------------------------------ */
@@ -516,16 +367,6 @@ void VG_(cleanup_thread) ( ThreadArchState* arch )
 {
 }  
 
-void setup_child ( /*OUT*/ ThreadArchState *child,
-                   /*IN*/  ThreadArchState *parent )
-{
-   /* We inherit our parent's guest state. */
-   child->vex = parent->vex;
-   child->vex_shadow1 = parent->vex_shadow1;
-   child->vex_shadow2 = parent->vex_shadow2;
-}
-
-
 /* ---------------------------------------------------------------------
    PRE/POST wrappers for ppc64/Linux-specific syscalls
    ------------------------------------------------------------------ */
@@ -544,7 +385,6 @@ DECL_TEMPLATE(ppc64_linux, sys_mmap);
 //zz DECL_TEMPLATE(ppc64_linux, sys_stat64);
 //zz DECL_TEMPLATE(ppc64_linux, sys_lstat64);
 //zz DECL_TEMPLATE(ppc64_linux, sys_fstat64);
-DECL_TEMPLATE(ppc64_linux, sys_clone);
 //zz DECL_TEMPLATE(ppc64_linux, sys_sigreturn);
 DECL_TEMPLATE(ppc64_linux, sys_rt_sigreturn);
 DECL_TEMPLATE(ppc64_linux, sys_fadvise64);
@@ -629,92 +469,6 @@ PRE(sys_mmap)
 //zz   POST_MEM_WRITE( ARG2, sizeof(struct vki_stat64) );
 //zz }
 
-
-PRE(sys_clone)
-{
-   UInt cloneflags;
-
-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
-   PRE_REG_READ5(int, "clone",
-                 unsigned long, flags,
-                 void *,        child_stack,
-                 int *,         parent_tidptr,
-                 void *,        child_tls,
-                 int *,         child_tidptr);
-
-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
-                                             VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof(Int), 
-                                             VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-
-   cloneflags = ARG1;
-
-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
-      SET_STATUS_Failure( VKI_EINVAL );
-      return;
-   }
-
-   /* Only look at the flags we really care about */
-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
-      /* thread creation */
-      SET_STATUS_from_SysRes(
-         do_clone(tid,
-                  ARG1,         /* flags */
-                  (Addr)ARG2,   /* child SP */
-                  (Int *)ARG3,  /* parent_tidptr */
-                  (Int *)ARG5,  /* child_tidptr */
-                  (Addr)ARG4)); /* child_tls */
-      break;
-
-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
-      /* FALLTHROUGH - assume vfork == fork */
-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
-
-   case 0: /* plain fork */
-      SET_STATUS_from_SysRes(
-         ML_(do_fork_clone)(tid,
-                       cloneflags,      /* flags */
-                       (Int *)ARG3,     /* parent_tidptr */
-                       (Int *)ARG5));   /* child_tidptr */
-      break;
-
-   default:
-      /* should we just ENOSYS? */
-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
-      VG_(message)(Vg_UserMsg, "\n");
-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
-      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
-      VG_(unimplemented)
-         ("Valgrind does not support general clone().");
-   }
-
-   if (SUCCESS) {
-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
-         POST_MEM_WRITE(ARG3, sizeof(Int));
-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
-         POST_MEM_WRITE(ARG5, sizeof(Int));
-
-      /* Thread creation was successful; let the child have the chance
-         to run */
-      *flags |= SfYieldAfter;
-   }
-}
-
 PRE(sys_fadvise64)
 {
    PRINT("sys_fadvise64 ( %ld, %ld, %lu, %ld )",  SARG1, SARG2, SARG3, SARG4);
@@ -922,7 +676,7 @@ static SyscallTableEntry syscall_table[] = {
    GENX_(__NR_fsync,             sys_fsync),              // 118
 // _____(__NR_sigreturn,         sys_sigreturn),          // 119
 
-   PLAX_(__NR_clone,             sys_clone),              // 120
+   LINX_(__NR_clone,             sys_clone),              // 120
 // _____(__NR_setdomainname,     sys_setdomainname),      // 121
    GENXY(__NR_uname,             sys_newuname),           // 122
 // _____(__NR_modify_ldt,        sys_modify_ldt),         // 123
diff --git a/coregrind/m_syswrap/syswrap-s390x-linux.c b/coregrind/m_syswrap/syswrap-s390x-linux.c
index ebb8295..f596341 100644
--- a/coregrind/m_syswrap/syswrap-s390x-linux.c
+++ b/coregrind/m_syswrap/syswrap-s390x-linux.c
@@ -138,14 +138,7 @@ asm(
 #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
 #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
 
-extern
-ULong do_syscall_clone_s390x_linux ( void  *stack,
-                                     ULong flags,
-                                     Int   *parent_tid,
-                                     Int   *child_tid,
-                                     Addr  tlsaddr,
-                                     Word (*fn)(void *),
-                                     void  *arg);
+// See priv_syswrap-linux.h for arg profile.
 asm(
    "   .text\n"
    "   .align  4\n"
@@ -182,126 +175,6 @@ void VG_(cleanup_thread) ( ThreadArchState* arch )
   /* only used on x86 for descriptor tables */
 }
 
-static void setup_child ( /*OUT*/ ThreadArchState *child,
-                   /*IN*/  ThreadArchState *parent )
-{
-   /* We inherit our parent's guest state. */
-   child->vex = parent->vex;
-   child->vex_shadow1 = parent->vex_shadow1;
-   child->vex_shadow2 = parent->vex_shadow2;
-}
-
-
-/*
-   When a client clones, we need to keep track of the new thread.  This means:
-   1. allocate a ThreadId+ThreadState+stack for the thread
-
-   2. initialize the thread's new VCPU state
-
-   3. create the thread using the same args as the client requested,
-   but using the scheduler entrypoint for IP, and a separate stack
-   for SP.
- */
-static SysRes do_clone ( ThreadId ptid,
-                         Addr sp, ULong flags,
-                         Int *parent_tidptr,
-                         Int *child_tidptr,
-                         Addr tlsaddr)
-{
-   static const Bool debug = False;
-
-   ThreadId     ctid = VG_(alloc_ThreadState)();
-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
-   UWord*       stack;
-   SysRes       res;
-   ULong        r2;
-   vki_sigset_t blockall, savedmask;
-
-   VG_(sigfillset)(&blockall);
-
-   vg_assert(VG_(is_running_thread)(ptid));
-   vg_assert(VG_(is_valid_tid)(ctid));
-
-   stack = (UWord*)ML_(allocstack)(ctid);
-   if (stack == NULL) {
-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
-      goto out;
-   }
-
-   /* Copy register state
-
-      Both parent and child return to the same place, and the code
-      following the clone syscall works out which is which, so we
-      don't need to worry about it.
-
-      The parent gets the child's new tid returned from clone, but the
-      child gets 0.
-
-      If the clone call specifies a NULL sp for the new thread, then
-      it actually gets a copy of the parent's sp.
-   */
-   setup_child( &ctst->arch, &ptst->arch );
-
-   /* Make sys_clone appear to have returned Success(0) in the
-      child. */
-   ctst->arch.vex.guest_r2 = 0;
-
-   if (sp != 0)
-      ctst->arch.vex.guest_SP = sp;
-
-   ctst->os_state.parent = ptid;
-
-   /* inherit signal mask */
-   ctst->sig_mask = ptst->sig_mask;
-   ctst->tmp_sig_mask = ptst->sig_mask;
-
-   /* have the parents thread group */
-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
-
-   ML_(guess_and_register_stack) (sp, ctst);
-
-   /* Assume the clone will succeed, and tell any tool that wants to
-      know that this thread has come into existence.  If the clone
-      fails, we'll send out a ll_exit notification for it at the out:
-      label below, to clean up. */
-   vg_assert(VG_(owns_BigLock_LL)(ptid));
-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
-
-   if (flags & VKI_CLONE_SETTLS) {
-      if (debug)
-	 VG_(printf)("clone child has SETTLS: tls at %#lx\n", tlsaddr);
-      ctst->arch.vex.guest_a0 = (UInt) (tlsaddr >> 32);
-      ctst->arch.vex.guest_a1 = (UInt) tlsaddr;
-   }
-   flags &= ~VKI_CLONE_SETTLS;
-
-   /* start the thread with everything blocked */
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
-
-   /* Create the new thread */
-   r2 = do_syscall_clone_s390x_linux(
-            stack, flags, parent_tidptr, child_tidptr, tlsaddr,
-            ML_(start_thread_NORETURN), &VG_(threads)[ctid]);
-
-   res = VG_(mk_SysRes_s390x_linux)( r2 );
-
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
-
-  out:
-   if (sr_isError(res)) {
-      /* clone failed */
-      ctst->status = VgTs_Empty;
-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
-      VG_TRACK( pre_thread_ll_exit, ctid );
-   }
-
-   return res;
-
-}
-
-
-
 /* ---------------------------------------------------------------------
    PRE/POST wrappers for s390x/Linux-specific syscalls
    ------------------------------------------------------------------ */
@@ -317,7 +190,6 @@ static SysRes do_clone ( ThreadId ptid,
 
 DECL_TEMPLATE(s390x_linux, sys_ptrace);
 DECL_TEMPLATE(s390x_linux, sys_mmap);
-DECL_TEMPLATE(s390x_linux, sys_clone);
 DECL_TEMPLATE(s390x_linux, sys_sigreturn);
 DECL_TEMPLATE(s390x_linux, sys_rt_sigreturn);
 DECL_TEMPLATE(s390x_linux, sys_fadvise64);
@@ -452,99 +324,6 @@ PRE(sys_mmap)
    SET_STATUS_from_SysRes(r);
 }
 
-PRE(sys_clone)
-{
-   UInt cloneflags;
-
-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4, ARG5);
-   PRE_REG_READ2(int, "clone",
-                 void *,        child_stack,
-                 unsigned long, flags);
-
-   if (ARG2 & VKI_CLONE_PARENT_SETTID) {
-      if (VG_(tdict).track_pre_reg_read)
-         PRA3("clone(parent_tidptr)", int *, parent_tidptr);
-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int),
-                                             VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-   if (ARG2 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
-      if (VG_(tdict).track_pre_reg_read)
-         PRA4("clone(child_tidptr)", int *, child_tidptr);
-      PRE_MEM_WRITE("clone(child_tidptr)", ARG4, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG4, sizeof(Int),
-                                             VKI_PROT_WRITE)) {
-         SET_STATUS_Failure( VKI_EFAULT );
-         return;
-      }
-   }
-
-   /* The kernel simply copies reg6 (ARG5) into AR0 and AR1, no checks */
-   if (ARG2 & VKI_CLONE_SETTLS) {
-      if (VG_(tdict).track_pre_reg_read) {
-         PRA5("clone", Addr, tlsinfo);
-      }
-   }
-
-   cloneflags = ARG2;
-
-   if (!ML_(client_signal_OK)(ARG2 & VKI_CSIGNAL)) {
-      SET_STATUS_Failure( VKI_EINVAL );
-      return;
-   }
-
-   /* Only look at the flags we really care about */
-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS
-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
-      /* thread creation */
-      SET_STATUS_from_SysRes(
-         do_clone(tid,
-                  (Addr)ARG1,   /* child SP */
-                  ARG2,         /* flags */
-                  (Int *)ARG3,  /* parent_tidptr */
-                  (Int *)ARG4, /* child_tidptr */
-                  (Addr)ARG5)); /*  tlsaddr */
-      break;
-
-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
-      /* FALLTHROUGH - assume vfork == fork */
-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
-
-   case 0: /* plain fork */
-      SET_STATUS_from_SysRes(
-         ML_(do_fork_clone)(tid,
-                       cloneflags,      /* flags */
-                       (Int *)ARG3,     /* parent_tidptr */
-                       (Int *)ARG4));   /* child_tidptr */
-      break;
-
-   default:
-      /* should we just ENOSYS? */
-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG2);
-      VG_(message)(Vg_UserMsg, "\n");
-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
-      VG_(message)(Vg_UserMsg, " - via a threads library (NPTL)\n");
-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
-      VG_(unimplemented)
-         ("Valgrind does not support general clone().");
-   }
-
-   if (SUCCESS) {
-      if (ARG2 & VKI_CLONE_PARENT_SETTID)
-         POST_MEM_WRITE(ARG3, sizeof(Int));
-      if (ARG2 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
-         POST_MEM_WRITE(ARG4, sizeof(Int));
-
-      /* Thread creation was successful; let the child have the chance
-         to run */
-      *flags |= SfYieldAfter;
-   }
-}
-
 PRE(sys_sigreturn)
 {
    ThreadState* tst;
@@ -775,7 +554,7 @@ static SyscallTableEntry syscall_table[] = {
    GENX_(__NR_fsync,  sys_fsync),                                     // 118
    PLAX_(__NR_sigreturn, sys_sigreturn),                              // 119
 
-   PLAX_(__NR_clone,  sys_clone),                                     // 120
+   LINX_(__NR_clone,  sys_clone),                                     // 120
 // ?????(__NR_setdomainname, ),                                       // 121
    GENXY(__NR_uname, sys_newuname),                                   // 122
    GENX_(123, sys_ni_syscall), /* unimplemented (by the kernel) */    // 123
diff --git a/coregrind/m_syswrap/syswrap-tilegx-linux.c b/coregrind/m_syswrap/syswrap-tilegx-linux.c
index 7501b20..05d81e8 100644
--- a/coregrind/m_syswrap/syswrap-tilegx-linux.c
+++ b/coregrind/m_syswrap/syswrap-tilegx-linux.c
@@ -224,14 +224,7 @@ void ML_(call_on_new_stack_0_1) (Addr stack, Addr retaddr,
 #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
 #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
 
-Long do_syscall_clone_tilegx_linux ( Word (*fn) (void *),  //r0
-                                     void *stack,          //r1
-                                     Long flags,           //r2
-                                     void *arg,            //r3
-                                     Long * child_tid,     //r4
-                                     Long * parent_tid,    //r5
-                                     Long   tls );         //r6
-    /*
+   /*
       stack
       high -> 4  r29
       3
@@ -239,6 +232,7 @@ Long do_syscall_clone_tilegx_linux ( Word (*fn) (void *),  //r0
       1  r10
       low  -> 0  lr    <- sp
     */
+// See priv_syswrap-linux.h for arg profile.
      asm (
        ".text\n"
        "   .globl   do_syscall_clone_tilegx_linux\n"
@@ -315,101 +309,6 @@ Long do_syscall_clone_tilegx_linux ( Word (*fn) (void *),  //r0
 #undef __NR_EXIT
 
 // forward declarations
-static void setup_child ( ThreadArchState *, ThreadArchState * );
-static SysRes sys_set_tls ( ThreadId tid, Addr tlsptr );
- /*
-   When a client clones, we need to keep track of the new thread.  This means:
-   1. allocate a ThreadId+ThreadState+stack for the thread
-   2. initialize the thread's new VCPU state
-   3. create the thread using the same args as the client requested,
-   but using the scheduler entrypoint for IP, and a separate stack
-   for SP.
- */
-static SysRes do_clone ( ThreadId ptid,
-                         Long flags, Addr sp,
-                         Long * parent_tidptr,
-                         Long * child_tidptr,
-                         Addr child_tls )
-{
-  const Bool debug = False;
-  ThreadId ctid = VG_ (alloc_ThreadState) ();
-  ThreadState * ptst = VG_ (get_ThreadState) (ptid);
-  ThreadState * ctst = VG_ (get_ThreadState) (ctid);
-  Long ret = 0;
-  Long * stack;
-  SysRes res;
-  vki_sigset_t blockall, savedmask;
-
-  VG_ (sigfillset) (&blockall);
-  vg_assert (VG_ (is_running_thread) (ptid));
-  vg_assert (VG_ (is_valid_tid) (ctid));
-  stack = (Long *) ML_ (allocstack) (ctid);
-  if (stack == NULL) {
-    res = VG_ (mk_SysRes_Error) (VKI_ENOMEM);
-    goto out;
-  }
-  setup_child (&ctst->arch, &ptst->arch);
-
-  /* On TILEGX we need to set r0 and r3 to zero */
-  ctst->arch.vex.guest_r0 = 0;
-  ctst->arch.vex.guest_r3 = 0;
-  if (sp != 0)
-    ctst->arch.vex.guest_r54 = sp;
-
-  ctst->os_state.parent = ptid;
-  ctst->sig_mask = ptst->sig_mask;
-  ctst->tmp_sig_mask = ptst->sig_mask;
-
-  /* Start the child with its threadgroup being the same as the
-     parent's.  This is so that any exit_group calls that happen
-     after the child is created but before it sets its
-     os_state.threadgroup field for real (in thread_wrapper in
-     syswrap-linux.c), really kill the new thread.  a.k.a this avoids
-     a race condition in which the thread is unkillable (via
-     exit_group) because its threadgroup is not set.  The race window
-     is probably only a few hundred or a few thousand cycles long.
-     See #226116. */
-
-  ctst->os_state.threadgroup = ptst->os_state.threadgroup;
-  ML_(guess_and_register_stack) (sp, ctst);
-
-  VG_TRACK (pre_thread_ll_create, ptid, ctid);
-  if (flags & VKI_CLONE_SETTLS) {
-    if (debug)
-      VG_(printf)("clone child has SETTLS: tls at %#lx\n", child_tls);
-    ctst->arch.vex.guest_r53 = child_tls;
-    res = sys_set_tls(ctid, child_tls);
-    if (sr_isError(res))
-      goto out;
-  }
-
-  flags &= ~VKI_CLONE_SETTLS;
-  VG_ (sigprocmask) (VKI_SIG_SETMASK, &blockall, &savedmask);
-  /* Create the new thread */
-  ret = do_syscall_clone_tilegx_linux (ML_ (start_thread_NORETURN),
-                                       stack, flags, &VG_ (threads)[ctid],
-                                       child_tidptr, parent_tidptr,
-                                       (Long)NULL /*child_tls*/);
-
-  /* High half word64 is syscall return value. */
-  if (debug)
-    VG_(printf)("ret: 0x%llx\n", (ULong)ret);
-
-  res = VG_(mk_SysRes_tilegx_linux) (/*val */ ret);
-
-  VG_ (sigprocmask) (VKI_SIG_SETMASK, &savedmask, NULL);
-
- out:
-  if (sr_isError (res)) {
-    VG_(cleanup_thread) (&ctst->arch);
-    ctst->status = VgTs_Empty;
-    VG_TRACK (pre_thread_ll_exit, ctid);
-  }
-  ptst->arch.vex.guest_r0 = 0;
-
-  return res;
-}
-
 extern Addr do_brk ( Addr newbrk );
 
 extern
@@ -428,23 +327,6 @@ extern Bool linux_kernel_2_6_22(void);
 void
 VG_ (cleanup_thread) ( ThreadArchState * arch ) { }
 
-void
-setup_child ( /*OUT*/ ThreadArchState * child,
-              /*IN*/ ThreadArchState * parent )
-{
-  /* We inherit our parent's guest state. */
-  child->vex = parent->vex;
-  child->vex_shadow1 = parent->vex_shadow1;
-  child->vex_shadow2 = parent->vex_shadow2;
-}
-
-SysRes sys_set_tls ( ThreadId tid, Addr tlsptr )
-{
-  VG_(threads)[tid].arch.vex.guest_r53 = tlsptr;
-  return VG_(mk_SysRes_Success)( 0 );
-}
-
-
 /* ---------------------------------------------------------------------
    PRE/POST wrappers for tilegx/Linux-specific syscalls
    ------------------------------------------------------------------ */
@@ -457,7 +339,6 @@ SysRes sys_set_tls ( ThreadId tid, Addr tlsptr )
    aren't visible outside this file, but that requires even more macro
    magic. */
 
-DECL_TEMPLATE (tilegx_linux, sys_clone);
 DECL_TEMPLATE (tilegx_linux, sys_rt_sigreturn);
 DECL_TEMPLATE (tilegx_linux, sys_socket);
 DECL_TEMPLATE (tilegx_linux, sys_setsockopt);
@@ -496,94 +377,6 @@ DECL_TEMPLATE (tilegx_linux, sys_syscall184);
 DECL_TEMPLATE (tilegx_linux, sys_cacheflush);
 DECL_TEMPLATE (tilegx_linux, sys_set_dataplane);
 
-PRE(sys_clone)
-{
-  ULong cloneflags;
-
-  PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
-  PRE_REG_READ5(int, "clone",
-                unsigned long, flags,
-                void *, child_stack,
-                int *, parent_tidptr,
-                int *, child_tidptr,
-                void *, tlsaddr);
-
-  if (ARG1 & VKI_CLONE_PARENT_SETTID) {
-    PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
-    if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), VKI_PROT_WRITE)) {
-      SET_STATUS_Failure( VKI_EFAULT );
-      return;
-    }
-  }
-  if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
-    PRE_MEM_WRITE("clone(child_tidptr)", ARG4, sizeof(Int));
-    if (!VG_(am_is_valid_for_client)(ARG4, sizeof(Int), VKI_PROT_WRITE)) {
-      SET_STATUS_Failure( VKI_EFAULT );
-      return;
-    }
-  }
-
-  cloneflags = ARG1;
-
-  if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
-    SET_STATUS_Failure( VKI_EINVAL );
-    return;
-  }
-
-  /* Only look at the flags we really care about */
-  switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS
-                        | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
-  case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
-    /* thread creation */
-    SET_STATUS_from_SysRes(
-      do_clone(tid,
-               ARG1,          /* flags */
-               (Addr)ARG2,    /* child ESP */
-               (Long *)ARG3,  /* parent_tidptr */
-               (Long *)ARG4,  /* child_tidptr */
-               (Addr)ARG5));  /* set_tls */
-    break;
-
-  case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
-    /* FALLTHROUGH - assume vfork == fork */
-    cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
-
-  case 0: /* plain fork */
-    SET_STATUS_from_SysRes(
-      ML_(do_fork_clone)(tid,
-                         cloneflags,      /* flags */
-                         (Int *)ARG3,     /* parent_tidptr */
-                         (Int *)ARG4));   /* child_tidptr */
-    break;
-
-  default:
-    /* should we just ENOSYS? */
-    VG_(message)(Vg_UserMsg,
-                 "Unsupported clone() flags: 0x%lx\n", ARG1);
-    VG_(message)(Vg_UserMsg,
-                 "\n");
-    VG_(message)(Vg_UserMsg,
-                 "The only supported clone() uses are:\n");
-    VG_(message)(Vg_UserMsg,
-                 " - via a threads library (LinuxThreads or NPTL)\n");
-    VG_(message)(Vg_UserMsg,
-                 " - via the implementation of fork or vfork\n");
-    VG_(unimplemented)
-      ("Valgrind does not support general clone().");
-  }
-
-  if (SUCCESS) {
-    if (ARG1 & VKI_CLONE_PARENT_SETTID)
-      POST_MEM_WRITE(ARG3, sizeof(Int));
-    if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
-      POST_MEM_WRITE(ARG4, sizeof(Int));
-
-    /* Thread creation was successful; let the child have the chance
-       to run */
-    *flags |= SfYieldAfter;
-  }
-}
-
 PRE(sys_rt_sigreturn)
 {
   /* This isn't really a syscall at all - it's a misuse of the
@@ -1344,7 +1137,7 @@ static SyscallTableEntry syscall_table[] = {
   LINX_(__NR_add_key,           sys_add_key),              // 217
   LINX_(__NR_request_key,       sys_request_key),          // 218
   LINXY(__NR_keyctl,            sys_keyctl),               // 219
-  PLAX_(__NR_clone,             sys_clone),                // 220
+  LINX_(__NR_clone,             sys_clone),                // 220
   GENX_(__NR_execve,            sys_execve),               // 221
   PLAX_(__NR_mmap,              sys_mmap),                 // 222
   GENXY(__NR_mprotect,          sys_mprotect),             // 226
diff --git a/coregrind/m_syswrap/syswrap-x86-linux.c b/coregrind/m_syswrap/syswrap-x86-linux.c
index 0e5af98..f8c4eb4 100644
--- a/coregrind/m_syswrap/syswrap-x86-linux.c
+++ b/coregrind/m_syswrap/syswrap-x86-linux.c
@@ -131,14 +131,7 @@ asm(
 #define __NR_CLONE        VG_STRINGIFY(__NR_clone)
 #define __NR_EXIT         VG_STRINGIFY(__NR_exit)
 
-extern
-Int do_syscall_clone_x86_linux ( Word (*fn)(void *), 
-                                 void* stack, 
-                                 Int   flags, 
-                                 void* arg,
-                                 Int*  child_tid, 
-                                 Int*  parent_tid, 
-                                 vki_modify_ldt_t * );
+// See priv_syswrap-linux.h for arg profile.
 asm(
 ".text\n"
 ".globl do_syscall_clone_x86_linux\n"
@@ -191,141 +184,6 @@ asm(
 #undef __NR_EXIT
 
 
-// forward declarations
-static void setup_child ( ThreadArchState*, ThreadArchState*, Bool );
-static SysRes sys_set_thread_area ( ThreadId, vki_modify_ldt_t* );
-
-/* 
-   When a client clones, we need to keep track of the new thread.  This means:
-   1. allocate a ThreadId+ThreadState+stack for the thread
-
-   2. initialize the thread's new VCPU state
-
-   3. create the thread using the same args as the client requested,
-   but using the scheduler entrypoint for EIP, and a separate stack
-   for ESP.
- */
-static SysRes do_clone ( ThreadId ptid, 
-                         UInt flags, Addr esp, 
-                         Int* parent_tidptr, 
-                         Int* child_tidptr, 
-                         vki_modify_ldt_t *tlsinfo)
-{
-   static const Bool debug = False;
-
-   ThreadId     ctid = VG_(alloc_ThreadState)();
-   ThreadState* ptst = VG_(get_ThreadState)(ptid);
-   ThreadState* ctst = VG_(get_ThreadState)(ctid);
-   UWord*       stack;
-   SysRes       res;
-   Int          eax;
-   vki_sigset_t blockall, savedmask;
-
-   VG_(sigfillset)(&blockall);
-
-   vg_assert(VG_(is_running_thread)(ptid));
-   vg_assert(VG_(is_valid_tid)(ctid));
-
-   stack = (UWord*)ML_(allocstack)(ctid);
-   if (stack == NULL) {
-      res = VG_(mk_SysRes_Error)( VKI_ENOMEM );
-      goto out;
-   }
-
-   /* Copy register state
-
-      Both parent and child return to the same place, and the code
-      following the clone syscall works out which is which, so we
-      don't need to worry about it.
-
-      The parent gets the child's new tid returned from clone, but the
-      child gets 0.
-
-      If the clone call specifies a NULL esp for the new thread, then
-      it actually gets a copy of the parent's esp.
-   */
-   /* Note: the clone call done by the Quadrics Elan3 driver specifies
-      clone flags of 0xF00, and it seems to rely on the assumption
-      that the child inherits a copy of the parent's GDT.  
-      setup_child takes care of setting that up. */
-   setup_child( &ctst->arch, &ptst->arch, True );
-
-   /* Make sys_clone appear to have returned Success(0) in the
-      child. */
-   ctst->arch.vex.guest_EAX = 0;
-
-   if (esp != 0)
-      ctst->arch.vex.guest_ESP = esp;
-
-   ctst->os_state.parent = ptid;
-
-   /* inherit signal mask */
-   ctst->sig_mask     = ptst->sig_mask;
-   ctst->tmp_sig_mask = ptst->sig_mask;
-
-   /* Start the child with its threadgroup being the same as the
-      parent's.  This is so that any exit_group calls that happen
-      after the child is created but before it sets its
-      os_state.threadgroup field for real (in thread_wrapper in
-      syswrap-linux.c), really kill the new thread.  a.k.a this avoids
-      a race condition in which the thread is unkillable (via
-      exit_group) because its threadgroup is not set.  The race window
-      is probably only a few hundred or a few thousand cycles long.
-      See #226116. */
-   ctst->os_state.threadgroup = ptst->os_state.threadgroup;
-
-   ML_(guess_and_register_stack) (esp, ctst);
-   
-   /* Assume the clone will succeed, and tell any tool that wants to
-      know that this thread has come into existence.  We cannot defer
-      it beyond this point because sys_set_thread_area, just below,
-      causes tCheck to assert by making references to the new ThreadId
-      if we don't state the new thread exists prior to that point.
-      If the clone fails, we'll send out a ll_exit notification for it
-      at the out: label below, to clean up. */
-   vg_assert(VG_(owns_BigLock_LL)(ptid));
-   VG_TRACK ( pre_thread_ll_create, ptid, ctid );
-
-   if (flags & VKI_CLONE_SETTLS) {
-      if (debug)
-	 VG_(printf)("clone child has SETTLS: tls info at %p: idx=%u "
-                     "base=%#lx limit=%x; esp=%#x fs=%x gs=%x\n",
-		     tlsinfo, tlsinfo->entry_number, 
-                     tlsinfo->base_addr, tlsinfo->limit,
-		     ptst->arch.vex.guest_ESP,
-		     ctst->arch.vex.guest_FS, ctst->arch.vex.guest_GS);
-      res = sys_set_thread_area(ctid, tlsinfo);
-      if (sr_isError(res))
-	 goto out;
-   }
-
-   flags &= ~VKI_CLONE_SETTLS;
-
-   /* start the thread with everything blocked */
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &blockall, &savedmask);
-
-   /* Create the new thread */
-   eax = do_syscall_clone_x86_linux(
-            ML_(start_thread_NORETURN), stack, flags, &VG_(threads)[ctid],
-            child_tidptr, parent_tidptr, NULL
-         );
-   res = VG_(mk_SysRes_x86_linux)( eax );
-
-   VG_(sigprocmask)(VKI_SIG_SETMASK, &savedmask, NULL);
-
-  out:
-   if (sr_isError(res)) {
-      /* clone failed */
-      VG_(cleanup_thread)(&ctst->arch);
-      ctst->status = VgTs_Empty;
-      /* oops.  Better tell the tool the thread exited in a hurry :-) */
-      VG_TRACK( pre_thread_ll_exit, ctid );
-   }
-
-   return res;
-}
-
-
 /* ---------------------------------------------------------------------
    LDT/GDT simulation
    ------------------------------------------------------------------ */
@@ -630,7 +488,7 @@ static SysRes sys_modify_ldt ( ThreadId tid,
 }
 
 
-static SysRes sys_set_thread_area ( ThreadId tid, vki_modify_ldt_t* info )
+SysRes ML_(x86_sys_set_thread_area) ( ThreadId tid, vki_modify_ldt_t* info )
 {
    Int                  idx;
    VexGuestX86SegDescr* gdt;
@@ -738,15 +596,9 @@ void VG_(cleanup_thread) ( ThreadArchState* arch )
 }  
 
 
-static void setup_child ( /*OUT*/ ThreadArchState *child, 
-                          /*IN*/  ThreadArchState *parent,
-                          Bool inherit_parents_GDT )
+void ML_(x86_setup_LDT_GDT) ( /*OUT*/ ThreadArchState *child, 
+                              /*IN*/  ThreadArchState *parent )
 {
-   /* We inherit our parent's guest state. */
-   child->vex = parent->vex;
-   child->vex_shadow1 = parent->vex_shadow1;
-   child->vex_shadow2 = parent->vex_shadow2;
-
    /* We inherit our parent's LDT. */
    if (parent->vex.guest_LDT == (HWord)NULL) {
       /* We hope this is the common case. */
@@ -763,7 +615,7 @@ static void setup_child ( /*OUT*/ ThreadArchState *child,
       only). */
    child->vex.guest_GDT = (HWord)NULL;
 
-   if (inherit_parents_GDT && parent->vex.guest_GDT != (HWord)NULL) {
+   if (parent->vex.guest_GDT != (HWord)NULL) {
       child->vex.guest_GDT = (HWord)alloc_zeroed_x86_GDT();
       copy_GDT_from_to( (VexGuestX86SegDescr*)parent->vex.guest_GDT,
                         (VexGuestX86SegDescr*)child->vex.guest_GDT );
@@ -787,7 +639,6 @@ DECL_TEMPLATE(x86_linux, sys_stat64);
 DECL_TEMPLATE(x86_linux, sys_fstatat64);
 DECL_TEMPLATE(x86_linux, sys_fstat64);
 DECL_TEMPLATE(x86_linux, sys_lstat64);
-DECL_TEMPLATE(x86_linux, sys_clone);
 DECL_TEMPLATE(x86_linux, old_mmap);
 DECL_TEMPLATE(x86_linux, sys_mmap2);
 DECL_TEMPLATE(x86_linux, sys_sigreturn);
@@ -835,137 +686,6 @@ PRE(old_select)
    }
 }
 
-PRE(sys_clone)
-{
-   UInt cloneflags;
-   Bool badarg = False;
-
-   PRINT("sys_clone ( %lx, %#lx, %#lx, %#lx, %#lx )",ARG1,ARG2,ARG3,ARG4,ARG5);
-   PRE_REG_READ2(int, "clone",
-                 unsigned long, flags,
-                 void *, child_stack);
-
-   if (ARG1 & VKI_CLONE_PARENT_SETTID) {
-      if (VG_(tdict).track_pre_reg_read) {
-         PRA3("clone", int *, parent_tidptr);
-      }
-      PRE_MEM_WRITE("clone(parent_tidptr)", ARG3, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG3, sizeof(Int), 
-                                             VKI_PROT_WRITE)) {
-         badarg = True;
-      }
-   }
-   if (ARG1 & VKI_CLONE_SETTLS) {
-      if (VG_(tdict).track_pre_reg_read) {
-         PRA4("clone", vki_modify_ldt_t *, tlsinfo);
-      }
-      PRE_MEM_READ("clone(tlsinfo)", ARG4, sizeof(vki_modify_ldt_t));
-      if (!VG_(am_is_valid_for_client)(ARG4, sizeof(vki_modify_ldt_t), 
-                                             VKI_PROT_READ)) {
-         badarg = True;
-      }
-   }
-   if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID)) {
-      if (VG_(tdict).track_pre_reg_read) {
-         PRA5("clone", int *, child_tidptr);
-      }
-      PRE_MEM_WRITE("clone(child_tidptr)", ARG5, sizeof(Int));
-      if (!VG_(am_is_valid_for_client)(ARG5, sizeof(Int), 
-                                             VKI_PROT_WRITE)) {
-         badarg = True;
-      }
-   }
-
-   if (badarg) {
-      SET_STATUS_Failure( VKI_EFAULT );
-      return;
-   }
-
-   cloneflags = ARG1;
-
-   if (!ML_(client_signal_OK)(ARG1 & VKI_CSIGNAL)) {
-      SET_STATUS_Failure( VKI_EINVAL );
-      return;
-   }
-
-   /* Be ultra-paranoid and filter out any clone-variants we don't understand:
-      - ??? specifies clone flags of 0x100011
-      - ??? specifies clone flags of 0x1200011.
-      - NPTL specifies clone flags of 0x7D0F00.
-      - The Quadrics Elan3 driver specifies clone flags of 0xF00.
-      - Newer Quadrics Elan3 drivers with NTPL support specify 0x410F00.
-      Everything else is rejected. 
-   */
-   if (
-        1 ||
-        /* 11 Nov 05: for the time being, disable this ultra-paranoia.
-           The switch below probably does a good enough job. */
-          (cloneflags == 0x100011 || cloneflags == 0x1200011
-                                  || cloneflags == 0x7D0F00
-                                  || cloneflags == 0x790F00
-                                  || cloneflags == 0x3D0F00
-                                  || cloneflags == 0x410F00
-                                  || cloneflags == 0xF00
-                                  || cloneflags == 0xF21)) {
-     /* OK */
-   }
-   else {
-      /* Nah.  We don't like it.  Go away. */
-      goto reject;
-   }
-
-   /* Only look at the flags we really care about */
-   switch (cloneflags & (VKI_CLONE_VM | VKI_CLONE_FS 
-                         | VKI_CLONE_FILES | VKI_CLONE_VFORK)) {
-   case VKI_CLONE_VM | VKI_CLONE_FS | VKI_CLONE_FILES:
-      /* thread creation */
-      SET_STATUS_from_SysRes(
-         do_clone(tid,
-                  ARG1,         /* flags */
-                  (Addr)ARG2,   /* child ESP */
-                  (Int *)ARG3,  /* parent_tidptr */
-                  (Int *)ARG5,  /* child_tidptr */
-                  (vki_modify_ldt_t *)ARG4)); /* set_tls */
-      break;
-
-   case VKI_CLONE_VFORK | VKI_CLONE_VM: /* vfork */
-      /* FALLTHROUGH - assume vfork == fork */
-      cloneflags &= ~(VKI_CLONE_VFORK | VKI_CLONE_VM);
-
-   case 0: /* plain fork */
-      SET_STATUS_from_SysRes(
-         ML_(do_fork_clone)(tid,
-                       cloneflags,      /* flags */
-                       (Int *)ARG3,     /* parent_tidptr */
-                       (Int *)ARG5));   /* child_tidptr */
-      break;
-
-   default:
-   reject:
-      /* should we just ENOSYS? */
-      VG_(message)(Vg_UserMsg, "\n");
-      VG_(message)(Vg_UserMsg, "Unsupported clone() flags: 0x%lx\n", ARG1);
-      VG_(message)(Vg_UserMsg, "\n");
-      VG_(message)(Vg_UserMsg, "The only supported clone() uses are:\n");
-      VG_(message)(Vg_UserMsg, " - via a threads library (LinuxThreads or NPTL)\n");
-      VG_(message)(Vg_UserMsg, " - via the implementation of fork or vfork\n");
-      VG_(message)(Vg_UserMsg, " - for the Quadrics Elan3 user-space driver\n");
-      VG_(unimplemented)
-         ("Valgrind does not support general clone().");
-   }
-
-   if (SUCCESS) {
-      if (ARG1 & VKI_CLONE_PARENT_SETTID)
-         POST_MEM_WRITE(ARG3, sizeof(Int));
-      if (ARG1 & (VKI_CLONE_CHILD_SETTID | VKI_CLONE_CHILD_CLEARTID))
-         POST_MEM_WRITE(ARG5, sizeof(Int));
-
-      /* Thread creation was successful; let the child have the chance
-         to run */
-      *flags |= SfYieldAfter;
-   }
-}
-
 PRE(sys_sigreturn)
 {
    /* See comments on PRE(sys_rt_sigreturn) in syswrap-amd64-linux.c for
@@ -1063,7 +783,7 @@ PRE(sys_set_thread_area)
    PRE_MEM_READ( "set_thread_area(u_info)", ARG1, sizeof(vki_modify_ldt_t) );
 
    /* "do" the syscall ourselves; the kernel never sees it */
-   SET_STATUS_from_SysRes( sys_set_thread_area( tid, (void *)ARG1 ) );
+   SET_STATUS_from_SysRes( ML_(x86_sys_set_thread_area)( tid, (void *)ARG1 ) );
 }
 
 PRE(sys_get_thread_area)
@@ -1553,7 +1273,7 @@ static SyscallTableEntry syscall_table[] = {
    GENX_(__NR_fsync,             sys_fsync),          // 118
    PLAX_(__NR_sigreturn,         sys_sigreturn),      // 119 ?/Linux
 
-   PLAX_(__NR_clone,             sys_clone),          // 120
+   LINX_(__NR_clone,             sys_clone),          // 120
 //zz    //   (__NR_setdomainname,     sys_setdomainname),  // 121 */*(?)
    GENXY(__NR_uname,             sys_newuname),       // 122
    PLAX_(__NR_modify_ldt,        sys_modify_ldt),     // 123
diff --git a/include/vki/vki-arm64-linux.h b/include/vki/vki-arm64-linux.h
index df34dd6..5a3b08f 100644
--- a/include/vki/vki-arm64-linux.h
+++ b/include/vki/vki-arm64-linux.h
@@ -586,7 +586,8 @@ struct vki_ucontext {
 //ZZ };
 //ZZ 
 //ZZ // [[Nb: for our convenience within Valgrind, use a more specific name]]
-//ZZ typedef struct vki_user_desc vki_modify_ldt_t;
+
+typedef char vki_modify_ldt_t;
 
 //----------------------------------------------------------------------
 // From linux-3.10.5/include/asm-generic/ipcbuf.h
diff --git a/include/vki/vki-mips32-linux.h b/include/vki/vki-mips32-linux.h
index 5be8e15..b6c9914 100644
--- a/include/vki/vki-mips32-linux.h
+++ b/include/vki/vki-mips32-linux.h
@@ -679,7 +679,7 @@ struct vki_ucontext {
 };
 
 // CAB: TODO
-typedef void vki_modify_ldt_t;
+typedef char vki_modify_ldt_t;
 
 //----------------------------------------------------------------------
 // From linux-2.6.35.5/include/asm-mips/ipcbuf.h
diff --git a/include/vki/vki-mips64-linux.h b/include/vki/vki-mips64-linux.h
index 26b8e9f..ca49b10 100644
--- a/include/vki/vki-mips64-linux.h
+++ b/include/vki/vki-mips64-linux.h
@@ -710,6 +710,7 @@ struct vki_ucontext {
        vki_sigset_t           uc_sigmask;  /* mask last for extensibility */
 };
 
+typedef char vki_modify_ldt_t;
 //----------------------------------------------------------------------
 // From linux-2.6.35.9/include/asm-mips/ipcbuf.h
 //----------------------------------------------------------------------
diff --git a/include/vki/vki-ppc32-linux.h b/include/vki/vki-ppc32-linux.h
index 70c2835..0fd3c79 100644
--- a/include/vki/vki-ppc32-linux.h
+++ b/include/vki/vki-ppc32-linux.h
@@ -811,10 +811,9 @@ struct vki_ucontext {
 //.. };
 //.. 
 //.. // [[Nb: for our convenience within Valgrind, use a more specific name]]
-//.. typedef struct vki_user_desc vki_modify_ldt_t;
 
 // CAB: TODO
-typedef void vki_modify_ldt_t;
+typedef char vki_modify_ldt_t;
 
 
 //----------------------------------------------------------------------
diff --git a/include/vki/vki-ppc64-linux.h b/include/vki/vki-ppc64-linux.h
index b410663..fd5cea6 100644
--- a/include/vki/vki-ppc64-linux.h
+++ b/include/vki/vki-ppc64-linux.h
@@ -685,6 +685,9 @@ struct vki_ucontext {
   struct vki_sigcontext uc_mcontext;  /* last for extensibility */
 };
 
+// CAB: TODO
+typedef char vki_modify_ldt_t;
+
 //----------------------------------------------------------------------
 // From linux-2.6.13/include/asm-ppc64/ipcbuf.h
 //----------------------------------------------------------------------
diff --git a/include/vki/vki-s390x-linux.h b/include/vki/vki-s390x-linux.h
index c3f6d00..1ef5cf7 100644
--- a/include/vki/vki-s390x-linux.h
+++ b/include/vki/vki-s390x-linux.h
@@ -822,6 +822,8 @@ struct vki_ucontext {
 	vki_sigset_t	      uc_sigmask; /* mask last for extensibility */
 };
 
+typedef char vki_modify_ldt_t;
+
 //----------------------------------------------------------------------
 // From linux-2.6.16.60/include/asm-s390/ipcbuf.h
 //----------------------------------------------------------------------