commit a6b81f605dfba8650ea1f80122f41eb8e6c73dc7
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Nov 2 18:33:07 2021 -0700

    Add LLL_MUTEX_READ_LOCK [BZ #28537]
    
    CAS instruction is expensive.  From the x86 CPU's point of view, getting
    a cache line for writing is more expensive than reading.  See Appendix
    A.2 Spinlock in:
    
    https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
    
    The full compare and swap will grab the cache line exclusive and cause
    excessive cache line bouncing.
    
    Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock
    loop if compare may fail to reduce cache line bouncing on contended locks.
    
    Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
    (cherry picked from commit d672a98a1af106bd68deb15576710cd61363f7a6)
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index a04e0158451c8fff..9f40928cc6b9a067 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -65,6 +65,11 @@ lll_mutex_lock_optimized (pthread_mutex_t *mutex)
 # define PTHREAD_MUTEX_VERSIONS 1
 #endif
 
+#ifndef LLL_MUTEX_READ_LOCK
+# define LLL_MUTEX_READ_LOCK(mutex) \
+  atomic_load_relaxed (&(mutex)->__data.__lock)
+#endif
+
 static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
      __attribute_noinline__;
 
@@ -142,6 +147,8 @@ PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
 		  break;
 		}
 	      atomic_spin_nop ();
+	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
+		continue;
 	    }
 	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);