commit 8f9450a0b7a9e78267e8ae1ab1000ebca08e473e
Author: Torvald Riegel <triegel@redhat.com>
Date:   Sat Dec 24 00:40:46 2016 +0100

    Add compiler barriers around modifications of the robust mutex list.

    Any changes to the per-thread list of robust mutexes currently acquired as
    well as the pending-operations entry are not simply sequential code but
    basically concurrent with any actions taken by the kernel when it tries
    to clean up after a crash.  This is not quite like multi-thread concurrency
    but more like signal-handler concurrency.

    This patch fixes latent bugs by adding compiler barriers where necessary,
    ensuring that the kernel's crash handling sees consistent data.

    This is meant to be easy to backport, so we do not use C11-style signal
    fences yet.

            * nptl/descr.h (ENQUEUE_MUTEX_BOTH, DEQUEUE_MUTEX): Add compiler
            barriers and comments.
            * nptl/pthread_mutex_lock.c (__pthread_mutex_lock_full): Likewise.
            * nptl/pthread_mutex_timedlock.c (pthread_mutex_timedlock): Likewise.
            * nptl/pthread_mutex_unlock.c (__pthread_mutex_unlock_full): Likewise.

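As background, the protocol that the barriers below enforce can be shown in
isolation.  The following minimal sketch is illustrative only and is not part
of the patch; node, list_head, and op_pending are hypothetical stand-ins for
the robust-list fields.  It mirrors the ENQUEUE_MUTEX sequence: publish the
pending-operation marker, prepare the entry, link it in, then retract the
marker, with a compiler barrier between each step.

    #include <stddef.h>

    struct node { struct node *next; };

    static struct node *list_head;    /* list the kernel walks after a crash */
    static struct node **op_pending;  /* entry currently being (un)linked */

    /* Compiler-only barrier, as used throughout this patch: it emits no
       instruction and only keeps the compiler from moving memory accesses
       across it.  That suffices here because the other observer is the
       kernel, which runs in this thread's context after the thread dies.  */
    #define compiler_barrier() __asm ("" ::: "memory")

    static void
    enqueue (struct node *n)
    {
      /* Publish which entry may be half-linked if we die mid-operation.  */
      op_pending = &n->next;
      compiler_barrier ();

      /* Prepare the entry before making it reachable from the head.  */
      n->next = list_head;
      compiler_barrier ();
      list_head = n;

      /* Only once the entry is fully linked, retract the marker.  */
      compiler_barrier ();
      op_pending = NULL;
    }

Without the barriers, the compiler may legally reorder these plain stores, so
a crash at the wrong instant could leave the kernel following a link that was
never initialized.
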
Index: glibc-2.17-c758a686/nptl/descr.h
===================================================================
--- glibc-2.17-c758a686.orig/nptl/descr.h
+++ glibc-2.17-c758a686/nptl/descr.h
@@ -180,7 +180,16 @@ struct pthread
      but the pointer to the next/previous element of the list points
      in the middle of the object, the __next element.  Whenever
      casting to __pthread_list_t we need to adjust the pointer
-     first.  */
+     first.
+     These operations are effectively concurrent code in that the thread
+     can get killed at any point in time and the kernel takes over.  Thus,
+     the __next elements are a kind of concurrent list and we need to
+     enforce using compiler barriers that the individual operations happen
+     in such a way that the kernel always sees a consistent list.  The
+     backward links (ie, the __prev elements) are not used by the kernel.
+     FIXME We should use relaxed MO atomic operations here and signal fences
+     because this kind of concurrency is similar to synchronizing with a
+     signal handler.  */
 # define QUEUE_PTR_ADJUST (offsetof (__pthread_list_t, __next))
 
 # define ENQUEUE_MUTEX_BOTH(mutex, val)					      \
@@ -192,6 +201,8 @@ struct pthread
     mutex->__data.__list.__next = THREAD_GETMEM (THREAD_SELF,		      \
 						 robust_head.list);	      \
     mutex->__data.__list.__prev = (void *) &THREAD_SELF->robust_head;	      \
+    /* Ensure that the new list entry is ready before we insert it.  */	      \
+    __asm ("" ::: "memory");						      \
     THREAD_SETMEM (THREAD_SELF, robust_head.list,			      \
 		   (void *) (((uintptr_t) &mutex->__data.__list.__next)	      \
 			     | val));					      \
@@ -206,6 +217,9 @@ struct pthread
       ((char *) (((uintptr_t) mutex->__data.__list.__prev) & ~1ul)	      \
        - QUEUE_PTR_ADJUST);						      \
     prev->__next = mutex->__data.__list.__next;				      \
+    /* Ensure that we remove the entry from the list before we change the     \
+       __next pointer of the entry, which is read by the kernel.  */	      \
+    __asm ("" ::: "memory");						      \
     mutex->__data.__list.__prev = NULL;					      \
     mutex->__data.__list.__next = NULL;					      \
   } while (0)
@@ -220,6 +234,8 @@ struct pthread
   do {									      \
     mutex->__data.__list.__next						      \
       = THREAD_GETMEM (THREAD_SELF, robust_list.__next);		      \
+    /* Ensure that the new list entry is ready before we insert it.  */	      \
+    __asm ("" ::: "memory");						      \
     THREAD_SETMEM (THREAD_SELF, robust_list.__next,			      \
 		   (void *) (((uintptr_t) &mutex->__data.__list) | val));     \
   } while (0)
@@ -240,6 +256,9 @@ struct pthread
 	  }								      \
 									      \
 	runp->__next = next->__next;					      \
+	/* Ensure that we remove the entry from the list before we change the \
+	   __next pointer of the entry, which is read by the kernel.  */      \
+	    __asm ("" ::: "memory");					      \
 	mutex->__data.__list.__next = NULL;				      \
       }									      \
   } while (0)
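
The comment added above notes that the kernel only follows the forward
(__next) links.  Roughly, and much simplified, the kernel's exit-time cleanup
(exit_robust_list() in the kernel) behaves like the sketch below; the types
are user-space models of the robust-list ABI, and handle_death is a
hypothetical callback standing in for the kernel's futex handling.

    #include <stdint.h>
    #include <stddef.h>

    struct robust_list { struct robust_list *next; };

    struct robust_list_head
    {
      struct robust_list *list;              /* circular, head-terminated */
      long futex_offset;                     /* entry -> futex word offset */
      struct robust_list *list_op_pending;   /* half-done entry, or NULL */
    };

    /* Simplified model of the kernel's walk at thread exit: only the
       forward links are followed, so only those stores need ordering;
       the __prev links never matter to the kernel.  */
    static void
    walk_robust_list (struct robust_list_head *head,
                      void (*handle_death) (uint32_t *futex))
    {
      struct robust_list *entry = head->list;

      while (entry != (struct robust_list *) &head->list)
        {
          /* Fetch the link first; handling the entry may wake a waiter
             that immediately reuses it.  */
          struct robust_list *next = entry->next;
          if (entry != head->list_op_pending)
            handle_death ((uint32_t *) ((char *) entry + head->futex_offset));
          entry = next;
        }

      /* An operation interrupted half-way is recorded separately.  */
      if (head->list_op_pending != NULL)
        handle_death ((uint32_t *) ((char *) head->list_op_pending
                                    + head->futex_offset));
    }

Because this walk reads each __next exactly once and also inspects
list_op_pending, every store performed by ENQUEUE_MUTEX and DEQUEUE_MUTEX
must leave those fields consistent at every instruction boundary; that is
what the barriers above guarantee.
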
Index: glibc-2.17-c758a686/nptl/pthread_mutex_lock.c
===================================================================
--- glibc-2.17-c758a686.orig/nptl/pthread_mutex_lock.c
+++ glibc-2.17-c758a686/nptl/pthread_mutex_lock.c
@@ -181,6 +181,9 @@ __pthread_mutex_lock_full (pthread_mutex
     case PTHREAD_MUTEX_ROBUST_ADAPTIVE_NP:
       THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
 		     &mutex->__data.__list.__next);
+      /* We need to set op_pending before starting the operation.  Also
+	 see comments at ENQUEUE_MUTEX.  */
+      __asm ("" ::: "memory");
 
       oldval = mutex->__data.__lock;
       /* This is set to FUTEX_WAITERS iff we might have shared the
@@ -228,7 +231,12 @@ __pthread_mutex_lock_full (pthread_mutex
 	      /* But it is inconsistent unless marked otherwise.  */
 	      mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT;
 
+	      /* We must not enqueue the mutex before we have acquired it.
+		 Also see comments at ENQUEUE_MUTEX.  */
+	      __asm ("" ::: "memory");
 	      ENQUEUE_MUTEX (mutex);
+	      /* We need to clear op_pending after we enqueue the mutex.  */
+	      __asm ("" ::: "memory");
 	      THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
 	      /* Note that we deliberately exit here.  If we fall
@@ -250,6 +258,8 @@ __pthread_mutex_lock_full (pthread_mutex
 	      int kind = PTHREAD_MUTEX_TYPE (mutex);
 	      if (kind == PTHREAD_MUTEX_ROBUST_ERRORCHECK_NP)
 		{
+		  /* We do not need to ensure ordering wrt another memory
+		     access.  Also see comments at ENQUEUE_MUTEX. */
 		  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
 				 NULL);
 		  return EDEADLK;
@@ -257,6 +267,8 @@ __pthread_mutex_lock_full (pthread_mutex
 
 	      if (kind == PTHREAD_MUTEX_ROBUST_RECURSIVE_NP)
 		{
+		  /* We do not need to ensure ordering wrt another memory
+		     access.  */
 		  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
 				 NULL);
 
@@ -309,12 +321,19 @@ __pthread_mutex_lock_full (pthread_mutex
 	  mutex->__data.__count = 0;
 	  int private = PTHREAD_ROBUST_MUTEX_PSHARED (mutex);
 	  lll_unlock (mutex->__data.__lock, private);
+	  /* FIXME This violates the mutex destruction requirements.  See
+	     __pthread_mutex_unlock_full.  */
 	  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 	  return ENOTRECOVERABLE;
 	}
 
       mutex->__data.__count = 1;
+      /* We must not enqueue the mutex before we have acquired it.
+	 Also see comments at ENQUEUE_MUTEX.  */
+      __asm ("" ::: "memory");
       ENQUEUE_MUTEX (mutex);
+      /* We need to clear op_pending after we enqueue the mutex.  */
+      __asm ("" ::: "memory");
       THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
       break;
 
@@ -331,10 +350,15 @@ __pthread_mutex_lock_full (pthread_mutex
 	int robust = mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP;
 
 	if (robust)
-	  /* Note: robust PI futexes are signaled by setting bit 0.  */
-	  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
-			 (void *) (((uintptr_t) &mutex->__data.__list.__next)
-				   | 1));
+	  {
+	    /* Note: robust PI futexes are signaled by setting bit 0.  */
+	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
+			   (void *) (((uintptr_t) &mutex->__data.__list.__next)
+				     | 1));
+	    /* We need to set op_pending before starting the operation.  Also
+	       see comments at ENQUEUE_MUTEX.  */
+	    __asm ("" ::: "memory");
+	  }
 
 	oldval = mutex->__data.__lock;
 
@@ -343,12 +367,16 @@ __pthread_mutex_lock_full (pthread_mutex
 	  {
 	    if (kind == PTHREAD_MUTEX_ERRORCHECK_NP)
 	      {
+		/* We do not need to ensure ordering wrt another memory
+		   access.  */
 		THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 		return EDEADLK;
 	      }
 
 	    if (kind == PTHREAD_MUTEX_RECURSIVE_NP)
 	      {
+		/* We do not need to ensure ordering wrt another memory
+		   access.  */
 		THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
 		/* Just bump the counter.  */
@@ -411,7 +439,12 @@ __pthread_mutex_lock_full (pthread_mutex
 	    /* But it is inconsistent unless marked otherwise.  */
 	    mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT;
 
+	    /* We must not enqueue the mutex before we have acquired it.
+	       Also see comments at ENQUEUE_MUTEX.  */
+	    __asm ("" ::: "memory");
 	    ENQUEUE_MUTEX_PI (mutex);
+	    /* We need to clear op_pending after we enqueue the mutex.  */
+	    __asm ("" ::: "memory");
 	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
 	    /* Note that we deliberately exit here.  If we fall
@@ -439,6 +472,8 @@ __pthread_mutex_lock_full (pthread_mutex
 						  PTHREAD_ROBUST_MUTEX_PSHARED (mutex)),
 			      0, 0);
 
+	    /* To the kernel, this will be visible after the kernel has
+	       acquired the mutex in the syscall.  */
 	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 	    return ENOTRECOVERABLE;
 	  }
@@ -446,7 +481,12 @@ __pthread_mutex_lock_full (pthread_mutex
 	mutex->__data.__count = 1;
 	if (robust)
 	  {
+	    /* We must not enqueue the mutex before we have acquired it.
+	       Also see comments at ENQUEUE_MUTEX.  */
+	    __asm ("" ::: "memory");
 	    ENQUEUE_MUTEX_PI (mutex);
+	    /* We need to clear op_pending after we enqueue the mutex.  */
+	    __asm ("" ::: "memory");
 	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 	  }
       }
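
The recurring pattern in this file is: acquire the lock word, barrier,
ENQUEUE_MUTEX, barrier, clear op_pending.  The ordering matters because, to
the crash handler, list membership means ownership.  A condensed sketch
(hypothetical names; lock_word stands for __data.__lock, and the GCC __sync
builtin stands in for glibc's internal atomics, which predate C11):

    #include <stddef.h>

    #define compiler_barrier() __asm ("" ::: "memory")

    struct mutex { int lock_word; struct mutex *next; };

    static struct mutex *owned_list;   /* membership means "owned by me" */

    static int
    lock_and_enqueue (struct mutex *m, int tid)
    {
      /* Acquire the lock word by installing our TID.  */
      if (!__sync_bool_compare_and_swap (&m->lock_word, 0, tid))
        return 0;                  /* not acquired; must stay off the list */

      /* Do not let the compiler make the entry reachable before the CAS:
         otherwise a crash in between would make the kernel "recover" a
         mutex this thread never owned.  */
      compiler_barrier ();
      m->next = owned_list;
      compiler_barrier ();
      owned_list = m;
      return 1;
    }

The explicit barrier makes the compiler-level ordering independent of
whatever ordering the CAS itself happens to provide.
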
Index: glibc-2.17-c758a686/nptl/pthread_mutex_timedlock.c
===================================================================
--- glibc-2.17-c758a686.orig/nptl/pthread_mutex_timedlock.c
+++ glibc-2.17-c758a686/nptl/pthread_mutex_timedlock.c
@@ -140,6 +140,9 @@ pthread_mutex_timedlock (pthread_mutex_t
     case PTHREAD_MUTEX_ROBUST_ADAPTIVE_NP:
       THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
 		     &mutex->__data.__list.__next);
+      /* We need to set op_pending before starting the operation.  Also
+	 see comments at ENQUEUE_MUTEX.  */
+      __asm ("" ::: "memory");
 
       oldval = mutex->__data.__lock;
       /* This is set to FUTEX_WAITERS iff we might have shared the
@@ -177,7 +180,12 @@ pthread_mutex_timedlock (pthread_mutex_t
 	      /* But it is inconsistent unless marked otherwise.  */
 	      mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT;
 
+	      /* We must not enqueue the mutex before we have acquired it.
+		 Also see comments at ENQUEUE_MUTEX.  */
+	      __asm ("" ::: "memory");
 	      ENQUEUE_MUTEX (mutex);
+	      /* We need to clear op_pending after we enqueue the mutex.  */
+	      __asm ("" ::: "memory");
 	      THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
 	      /* Note that we deliberately exit here.  If we fall
@@ -193,6 +201,8 @@ pthread_mutex_timedlock (pthread_mutex_t
 	      int kind = PTHREAD_MUTEX_TYPE (mutex);
 	      if (kind == PTHREAD_MUTEX_ROBUST_ERRORCHECK_NP)
 		{
+		  /* We do not need to ensure ordering wrt another memory
+		     access.  Also see comments at ENQUEUE_MUTEX. */
 		  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
 				 NULL);
 		  return EDEADLK;
@@ -200,6 +210,8 @@ pthread_mutex_timedlock (pthread_mutex_t
 
 	      if (kind == PTHREAD_MUTEX_ROBUST_RECURSIVE_NP)
 		{
+		  /* We do not need to ensure ordering wrt another memory
+		     access.  */
 		  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
 				 NULL);
 
@@ -294,12 +306,19 @@ pthread_mutex_timedlock (pthread_mutex_t
 	  mutex->__data.__count = 0;
 	  int private = PTHREAD_ROBUST_MUTEX_PSHARED (mutex);
 	  lll_unlock (mutex->__data.__lock, private);
+	  /* FIXME This violates the mutex destruction requirements.  See
+	     __pthread_mutex_unlock_full.  */
 	  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 	  return ENOTRECOVERABLE;
 	}
 
       mutex->__data.__count = 1;
+      /* We must not enqueue the mutex before we have acquired it.
+	 Also see comments at ENQUEUE_MUTEX.  */
+      __asm ("" ::: "memory");
       ENQUEUE_MUTEX (mutex);
+      /* We need to clear op_pending after we enqueue the mutex.  */
+      __asm ("" ::: "memory");
       THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
       break;
 
@@ -316,10 +335,15 @@ pthread_mutex_timedlock (pthread_mutex_t
 	int robust = mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP;
 
 	if (robust)
-	  /* Note: robust PI futexes are signaled by setting bit 0.  */
-	  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
-			 (void *) (((uintptr_t) &mutex->__data.__list.__next)
-				   | 1));
+	  {
+	    /* Note: robust PI futexes are signaled by setting bit 0.  */
+	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
+			   (void *) (((uintptr_t) &mutex->__data.__list.__next)
+				     | 1));
+	    /* We need to set op_pending before starting the operation.  Also
+	       see comments at ENQUEUE_MUTEX.  */
+	    __asm ("" ::: "memory");
+	  }
 
 	oldval = mutex->__data.__lock;
 
@@ -328,12 +352,16 @@ pthread_mutex_timedlock (pthread_mutex_t
 	  {
 	    if (kind == PTHREAD_MUTEX_ERRORCHECK_NP)
 	      {
+		/* We do not need to ensure ordering wrt another memory
+		   access.  */
 		THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 		return EDEADLK;
 	      }
 
 	    if (kind == PTHREAD_MUTEX_RECURSIVE_NP)
 	      {
+		/* We do not need to ensure ordering wrt another memory
+		   access.  */
 		THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
 		/* Just bump the counter.  */
@@ -420,7 +448,12 @@ pthread_mutex_timedlock (pthread_mutex_t
 	    /* But it is inconsistent unless marked otherwise.  */
 	    mutex->__data.__owner = PTHREAD_MUTEX_INCONSISTENT;
 
+	    /* We must not enqueue the mutex before we have acquired it.
+	       Also see comments at ENQUEUE_MUTEX.  */
+	    __asm ("" ::: "memory");
 	    ENQUEUE_MUTEX_PI (mutex);
+	    /* We need to clear op_pending after we enqueue the mutex.  */
+	    __asm ("" ::: "memory");
 	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 
 	    /* Note that we deliberately exit here.  If we fall
@@ -443,6 +476,8 @@ pthread_mutex_timedlock (pthread_mutex_t
 						  PTHREAD_ROBUST_MUTEX_PSHARED (mutex)),
 			      0, 0);
 
+	    /* To the kernel, this will be visible after the kernel has
+	       acquired the mutex in the syscall.  */
 	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 	    return ENOTRECOVERABLE;
 	  }
@@ -450,7 +485,12 @@ pthread_mutex_timedlock (pthread_mutex_t
 	mutex->__data.__count = 1;
 	if (robust)
 	  {
+	    /* We must not enqueue the mutex before we have acquired it.
+	       Also see comments at ENQUEUE_MUTEX.  */
+	    __asm ("" ::: "memory");
 	    ENQUEUE_MUTEX_PI (mutex);
+	    /* We need to clear op_pending after we enqueue the mutex.  */
+	    __asm ("" ::: "memory");
 	    THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
 	  }
 	}
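
The commit message defers C11-style signal fences to keep the backport
simple.  Per the FIXME in descr.h, the eventual form would replace each plain
store plus __asm barrier with a relaxed-MO atomic access plus
atomic_signal_fence, which likewise constrains only the compiler.  A
hypothetical sketch of that shape (names are invented, not glibc's):

    #include <stdatomic.h>
    #include <stddef.h>

    /* atomic_signal_fence orders this thread's accesses against code
       running in the same thread's context (a signal handler, or the
       kernel's crash cleanup), much like __asm ("" ::: "memory"), and
       emits no hardware fence.  */
    static void *_Atomic head_list;
    static void *_Atomic pending_op;

    static void
    publish_entry (void *entry)
    {
      atomic_store_explicit (&pending_op, entry, memory_order_relaxed);
      atomic_signal_fence (memory_order_seq_cst);

      /* ... prepare the entry's forward link here ... */

      atomic_signal_fence (memory_order_seq_cst);
      atomic_store_explicit (&head_list, entry, memory_order_relaxed);

      atomic_signal_fence (memory_order_seq_cst);
      atomic_store_explicit (&pending_op, NULL, memory_order_relaxed);
    }
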
Index: glibc-2.17-c758a686/nptl/pthread_mutex_unlock.c
===================================================================
--- glibc-2.17-c758a686.orig/nptl/pthread_mutex_unlock.c
+++ glibc-2.17-c758a686/nptl/pthread_mutex_unlock.c
@@ -143,6 +143,9 @@ __pthread_mutex_unlock_full (pthread_mut
       /* Remove mutex from the list.  */
       THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
 		     &mutex->__data.__list.__next);
+      /* We must set op_pending before we dequeue the mutex.  Also see
+	 comments at ENQUEUE_MUTEX.  */
+      __asm ("" ::: "memory");
       DEQUEUE_MUTEX (mutex);
 
       mutex->__data.__owner = newowner;
@@ -159,6 +162,14 @@ __pthread_mutex_unlock_full (pthread_mut
 			     & FUTEX_WAITERS) != 0))
 	lll_futex_wake (&mutex->__data.__lock, 1, private);
 
+      /* We must clear op_pending after we release the mutex.
+	 FIXME However, this violates the mutex destruction requirements
+	 because another thread could acquire the mutex, destroy it, and
+	 reuse the memory for something else; then, if this thread crashes,
+	 and the memory happens to have a value equal to the TID, the kernel
+	 will believe it is still related to the mutex (which has been
+	 destroyed already) and will modify some other random object.  */
+      __asm ("" ::: "memory");
       THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
       break;
 
@@ -223,6 +234,9 @@ __pthread_mutex_unlock_full (pthread_mut
 	  THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
 			 (void *) (((uintptr_t) &mutex->__data.__list.__next)
 				   | 1));
+	  /* We must set op_pending before we dequeue the mutex.  Also see
+	     comments at ENQUEUE_MUTEX.  */
+	  __asm ("" ::: "memory");
 	  DEQUEUE_MUTEX (mutex);
 	}
 
@@ -247,6 +261,9 @@ __pthread_mutex_unlock_full (pthread_mut
 			    __lll_private_flag (FUTEX_UNLOCK_PI, private));
 	}
 
+      /* This happens after the kernel releases the mutex but violates the
+	 mutex destruction requirements; see comments in the code handling
+	 PTHREAD_MUTEX_ROBUST_NORMAL_NP.  */
       THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
       break;
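
For completeness, the user-visible side of the machinery this patch hardens:
if a thread dies while holding a robust mutex, the kernel's cleanup marks the
futex word, and the next locker receives EOWNERDEAD.  A minimal usage sketch
using the standard POSIX API (lock_robust is a hypothetical wrapper):

    #include <pthread.h>
    #include <errno.h>

    /* Lock a robust mutex, recovering if the previous owner died.  */
    static int
    lock_robust (pthread_mutex_t *m)
    {
      int err = pthread_mutex_lock (m);
      if (err == EOWNERDEAD)
        {
          /* The previous owner died while holding the mutex; repair the
             protected state here, then mark the mutex consistent again.  */
          pthread_mutex_consistent (m);
          err = 0;
        }
      return err;
    }

Such a mutex is created by calling pthread_mutexattr_setrobust with
PTHREAD_MUTEX_ROBUST on the attribute object passed to pthread_mutex_init.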