25c7f1
2016-01-22  Torvald Riegel  <triegel@redhat.com>
25c7f1
25c7f1
	* beginend.cc (GTM::gtm_thread::serial_lock): Put on cacheline
25c7f1
	boundary.
25c7f1
	(htm_fastpath): Remove.
25c7f1
	(gtm_thread::begin_transaction): Fix HTM fastpath.
25c7f1
	(_ITM_commitTransaction): Adapt.
25c7f1
	(_ITM_commitTransactionEH): Adapt.
25c7f1
	* libitm/config/linux/rwlock.h (gtm_rwlock): Add htm_fastpath member
25c7f1
	and accessors.
25c7f1
	* libitm/config/posix/rwlock.h (gtm_rwlock): Likewise.
25c7f1
	* libitm/config/posix/rwlock.cc (gtm_rwlock::gtm_rwlock): Adapt.
25c7f1
	* libitm/libitm_i.h (htm_fastpath): Remove declaration.
25c7f1
	* libitm/method-serial.cc (htm_mg): Adapt.
25c7f1
	(gtm_thread::serialirr_mode): Adapt.
25c7f1
	* libitm/query.cc (_ITM_inTransaction, _ITM_getTransactionId): Adapt.
25c7f1
25c7f1
--- libitm/beginend.cc
25c7f1
+++ libitm/beginend.cc
25c7f1
@@ -32,7 +32,11 @@ using namespace GTM;
25c7f1
 extern __thread gtm_thread_tls _gtm_thr_tls;
25c7f1
 #endif
25c7f1
 
25c7f1
-gtm_rwlock GTM::gtm_thread::serial_lock;
25c7f1
+// Put this at the start of a cacheline so that serial_lock's writers and
25c7f1
+// htm_fastpath fields are on the same cacheline, so that HW transactions
25c7f1
+// only have to pay one cacheline capacity to monitor both.
25c7f1
+gtm_rwlock GTM::gtm_thread::serial_lock
25c7f1
+  __attribute__((aligned(HW_CACHELINE_SIZE)));
25c7f1
 gtm_thread *GTM::gtm_thread::list_of_threads = 0;
25c7f1
 unsigned GTM::gtm_thread::number_of_threads = 0;
25c7f1
 
25c7f1
@@ -54,9 +58,6 @@ static pthread_mutex_t global_tid_lock = PTHREAD_MUTEX_INITIALIZER;
25c7f1
 static pthread_key_t thr_release_key;
25c7f1
 static pthread_once_t thr_release_once = PTHREAD_ONCE_INIT;
25c7f1
 
25c7f1
-// See gtm_thread::begin_transaction.
25c7f1
-uint32_t GTM::htm_fastpath = 0;
25c7f1
-
25c7f1
 /* Allocate a transaction structure.  */
25c7f1
 void *
25c7f1
 GTM::gtm_thread::operator new (size_t s)
25c7f1
@@ -174,9 +175,11 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
25c7f1
   // lock's writer flag and thus abort if another thread is or becomes a
25c7f1
   // serial transaction.  Therefore, if the fastpath is enabled, then a
25c7f1
   // transaction is not executing as a HW transaction iff the serial lock is
25c7f1
-  // write-locked.  This allows us to use htm_fastpath and the serial lock's
25c7f1
-  // writer flag to reliable determine whether the current thread runs a HW
25c7f1
-  // transaction, and thus we do not need to maintain this information in
25c7f1
+  // write-locked.  Also, HW transactions monitor the fastpath control
25c7f1
+  // variable, so that they will only execute if dispatch_htm is still the
25c7f1
+  // current method group.  This allows us to use htm_fastpath and the serial
25c7f1
+  // lock's writers flag to reliable determine whether the current thread runs
25c7f1
+  // a HW transaction, and thus we do not need to maintain this information in
25c7f1
   // per-thread state.
25c7f1
   // If an uninstrumented code path is not available, we can still run
25c7f1
   // instrumented code from a HW transaction because the HTM fastpath kicks
25c7f1
@@ -187,9 +190,14 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
25c7f1
   // indeed in serial mode, and HW transactions should never need serial mode
25c7f1
   // for any internal changes (e.g., they never abort visibly to the STM code
25c7f1
   // and thus do not trigger the standard retry handling).
25c7f1
-  if (likely(htm_fastpath && (prop & pr_hasNoAbort)))
25c7f1
+  if (likely(serial_lock.get_htm_fastpath() && (prop & pr_hasNoAbort)))
25c7f1
     {
25c7f1
-      for (uint32_t t = htm_fastpath; t; t--)
25c7f1
+      // Note that the snapshot of htm_fastpath that we take here could be
25c7f1
+      // outdated, and a different method group than dispatch_htm may have
25c7f1
+      // been chosen in the meantime.  Therefore, take care not not touch
25c7f1
+      // anything besides the serial lock, which is independent of method
25c7f1
+      // groups.
25c7f1
+      for (uint32_t t = serial_lock.get_htm_fastpath(); t; t--)
25c7f1
 	{
25c7f1
 	  uint32_t ret = htm_begin();
25c7f1
 	  if (htm_begin_success(ret))
25c7f1
@@ -197,9 +205,11 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
25c7f1
 	      // We are executing a transaction now.
25c7f1
 	      // Monitor the writer flag in the serial-mode lock, and abort
25c7f1
 	      // if there is an active or waiting serial-mode transaction.
25c7f1
+	      // Also checks that htm_fastpath is still nonzero and thus
25c7f1
+	      // HW transactions are allowed to run.
25c7f1
 	      // Note that this can also happen due to an enclosing
25c7f1
 	      // serial-mode transaction; we handle this case below.
25c7f1
-	      if (unlikely(serial_lock.is_write_locked()))
25c7f1
+	      if (unlikely(serial_lock.htm_fastpath_disabled()))
25c7f1
 		htm_abort();
25c7f1
 	      else
25c7f1
 		// We do not need to set a_saveLiveVariables because of HTM.
25c7f1
@@ -210,9 +220,12 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb)
25c7f1
 	  // retrying the transaction will be successful.
25c7f1
 	  if (!htm_abort_should_retry(ret))
25c7f1
 	    break;
25c7f1
+	  // Check whether the HTM fastpath has been disabled.
25c7f1
+	  if (!serial_lock.get_htm_fastpath())
25c7f1
+	    break;
25c7f1
 	  // Wait until any concurrent serial-mode transactions have finished.
25c7f1
 	  // This is an empty critical section, but won't be elided.
25c7f1
-	  if (serial_lock.is_write_locked())
25c7f1
+	  if (serial_lock.htm_fastpath_disabled())
25c7f1
 	    {
25c7f1
 	      tx = gtm_thr();
25c7f1
 	      if (unlikely(tx == NULL))
25c7f1
@@ -618,7 +631,7 @@ _ITM_commitTransaction(void)
25c7f1
   // a serial-mode transaction.  If we are, then there will be no other
25c7f1
   // concurrent serial-mode transaction.
25c7f1
   // See gtm_thread::begin_transaction.
25c7f1
-  if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
25c7f1
+  if (likely(!gtm_thread::serial_lock.htm_fastpath_disabled()))
25c7f1
     {
25c7f1
       htm_commit();
25c7f1
       return;
25c7f1
@@ -634,7 +647,7 @@ _ITM_commitTransactionEH(void *exc_ptr)
25c7f1
 {
25c7f1
 #if defined(USE_HTM_FASTPATH)
25c7f1
   // See _ITM_commitTransaction.
25c7f1
-  if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
25c7f1
+  if (likely(!gtm_thread::serial_lock.htm_fastpath_disabled()))
25c7f1
     {
25c7f1
       htm_commit();
25c7f1
       return;
25c7f1
--- libitm/config/linux/rwlock.h
25c7f1
+++ libitm/config/linux/rwlock.h
25c7f1
@@ -39,16 +39,29 @@ struct gtm_thread;
25c7f1
 //
25c7f1
 // In this implementation, writers are given highest priority access but
25c7f1
 // read-to-write upgrades do not have a higher priority than writers.
25c7f1
+//
25c7f1
+// Do not change the layout of this class; it must remain a POD type with
25c7f1
+// standard layout, and the writers field must be first (i.e., so the
25c7f1
+// assembler code can assume that its address is equal to the address of the
25c7f1
+// respective instance of the class), and htm_fastpath must be second.
25c7f1
 
25c7f1
 class gtm_rwlock
25c7f1
 {
25c7f1
-  // TODO Put futexes on different cachelines?
25c7f1
   std::atomic<int> writers;       // Writers' futex.
25c7f1
+  // We put the HTM fastpath control variable here so that HTM fastpath
25c7f1
+  // transactions can check efficiently whether they are allowed to run.
25c7f1
+  // This must be accessed atomically because threads can load this value
25c7f1
+  // when they are neither a registered reader nor writer (i.e., when they
25c7f1
+  // attempt to execute the HTM fastpath).
25c7f1
+  std::atomic<uint32_t> htm_fastpath;
25c7f1
+  // TODO Put these futexes on different cachelines?  (writers and htm_fastpath
25c7f1
+  // should remain on the same cacheline.
25c7f1
   std::atomic<int> writer_readers;// A confirmed writer waits here for readers.
25c7f1
   std::atomic<int> readers;       // Readers wait here for writers (iff true).
25c7f1
 
25c7f1
  public:
25c7f1
-  gtm_rwlock() : writers(0), writer_readers(0), readers(0) {};
25c7f1
+  gtm_rwlock() : writers(0), htm_fastpath(0), writer_readers(0), readers(0)
25c7f1
+  { }
25c7f1
 
25c7f1
   void read_lock (gtm_thread *tx);
25c7f1
   void read_unlock (gtm_thread *tx);
25c7f1
@@ -59,12 +72,28 @@ class gtm_rwlock
25c7f1
   bool write_upgrade (gtm_thread *tx);
25c7f1
   void write_upgrade_finish (gtm_thread *tx);
25c7f1
 
25c7f1
-  // Returns true iff there is a concurrent active or waiting writer.
25c7f1
-  // This is primarily useful for simple HyTM approaches, and the value being
25c7f1
-  // checked is loaded with memory_order_relaxed.
25c7f1
-  bool is_write_locked()
25c7f1
+  // Returns true iff there is a concurrent active or waiting writer, or
25c7f1
+  // htm_fastpath is zero. This is primarily useful for simple HyTM
25c7f1
+  // approaches, and the values being checked are loaded with
25c7f1
+  // memory_order_relaxed.
25c7f1
+  bool htm_fastpath_disabled ()
25c7f1
+  {
25c7f1
+    return writers.load (memory_order_relaxed) != 0
25c7f1
+	|| htm_fastpath.load (memory_order_relaxed) == 0;
25c7f1
+  }
25c7f1
+
25c7f1
+  // This does not need to return an exact value, hence relaxed MO is
25c7f1
+  // sufficient.
25c7f1
+  uint32_t get_htm_fastpath ()
25c7f1
+  {
25c7f1
+    return htm_fastpath.load (memory_order_relaxed);
25c7f1
+  }
25c7f1
+  // This must only be called while having acquired the write lock, and other
25c7f1
+  // threads do not need to load an exact value; hence relaxed MO is
25c7f1
+  // sufficient.
25c7f1
+  void set_htm_fastpath (uint32_t val)
25c7f1
   {
25c7f1
-    return writers.load (memory_order_relaxed) != 0;
25c7f1
+    htm_fastpath.store (val, memory_order_relaxed);
25c7f1
   }
25c7f1
 
25c7f1
  protected:
25c7f1
--- libitm/config/posix/rwlock.h
25c7f1
+++ libitm/config/posix/rwlock.h
25c7f1
@@ -44,19 +44,32 @@ struct gtm_thread;
25c7f1
 //
25c7f1
 // In this implementation, writers are given highest priority access but
25c7f1
 // read-to-write upgrades do not have a higher priority than writers.
25c7f1
+//
25c7f1
+// Do not change the layout of this class; it must remain a POD type with
25c7f1
+// standard layout, and the summary field must be first (i.e., so the
25c7f1
+// assembler code can assume that its address is equal to the address of the
25c7f1
+// respective instance of the class), and htm_fastpath must be second.
25c7f1
 
25c7f1
 class gtm_rwlock
25c7f1
 {
25c7f1
-  pthread_mutex_t mutex;	        // Held if manipulating any field.
25c7f1
-  pthread_cond_t c_readers;	        // Readers wait here
25c7f1
-  pthread_cond_t c_writers;	        // Writers wait here for writers
25c7f1
-  pthread_cond_t c_confirmed_writers;	// Writers wait here for readers
25c7f1
-
25c7f1
   static const unsigned a_writer  = 1;	// An active writer.
25c7f1
   static const unsigned w_writer  = 2;	// The w_writers field != 0
25c7f1
   static const unsigned w_reader  = 4;  // The w_readers field != 0
25c7f1
 
25c7f1
   std::atomic<unsigned int> summary;	// Bitmask of the above.
25c7f1
+
25c7f1
+  // We put the HTM fastpath control variable here so that HTM fastpath
25c7f1
+  // transactions can check efficiently whether they are allowed to run.
25c7f1
+  // This must be accessed atomically because threads can load this value
25c7f1
+  // when they are neither a registered reader nor writer (i.e., when they
25c7f1
+  // attempt to execute the HTM fastpath).
25c7f1
+  std::atomic<uint32_t> htm_fastpath;
25c7f1
+
25c7f1
+  pthread_mutex_t mutex;	        // Held if manipulating any field.
25c7f1
+  pthread_cond_t c_readers;	        // Readers wait here
25c7f1
+  pthread_cond_t c_writers;	        // Writers wait here for writers
25c7f1
+  pthread_cond_t c_confirmed_writers;	// Writers wait here for readers
25c7f1
+
25c7f1
   unsigned int a_readers;	// Nr active readers as observed by a writer
25c7f1
   unsigned int w_readers;	// Nr waiting readers
25c7f1
   unsigned int w_writers;	// Nr waiting writers
25c7f1
@@ -74,12 +87,28 @@ class gtm_rwlock
25c7f1
   bool write_upgrade (gtm_thread *tx);
25c7f1
   void write_upgrade_finish (gtm_thread *tx);
25c7f1
 
25c7f1
-  // Returns true iff there is a concurrent active or waiting writer.
25c7f1
-  // This is primarily useful for simple HyTM approaches, and the value being
25c7f1
-  // checked is loaded with memory_order_relaxed.
25c7f1
-  bool is_write_locked()
25c7f1
+  // Returns true iff there is a concurrent active or waiting writer, or
25c7f1
+  // htm_fastpath is zero. This is primarily useful for simple HyTM
25c7f1
+  // approaches, and the values being checked are loaded with
25c7f1
+  // memory_order_relaxed.
25c7f1
+  bool htm_fastpath_disabled ()
25c7f1
+  {
25c7f1
+    return (summary.load (memory_order_relaxed) & (a_writer | w_writer))
25c7f1
+	|| htm_fastpath.load (memory_order_relaxed) == 0;
25c7f1
+  }
25c7f1
+
25c7f1
+  // This does not need to return an exact value, hence relaxed MO is
25c7f1
+  // sufficient.
25c7f1
+  uint32_t get_htm_fastpath ()
25c7f1
+  {
25c7f1
+    return htm_fastpath.load (memory_order_relaxed);
25c7f1
+  }
25c7f1
+  // This must only be called while having acquired the write lock, and other
25c7f1
+  // threads do not need to load an exact value; hence relaxed MO is
25c7f1
+  // sufficient.
25c7f1
+  void set_htm_fastpath (uint32_t val)
25c7f1
   {
25c7f1
-    return summary.load (memory_order_relaxed) & (a_writer | w_writer);
25c7f1
+    htm_fastpath.store (val, memory_order_relaxed);
25c7f1
   }
25c7f1
 
25c7f1
  protected:
25c7f1
--- libitm/config/posix/rwlock.cc
25c7f1
+++ libitm/config/posix/rwlock.cc
25c7f1
@@ -30,11 +30,12 @@ namespace GTM HIDDEN {
25c7f1
 // ??? Move this back to the header file when constexpr is implemented.
25c7f1
 
25c7f1
 gtm_rwlock::gtm_rwlock()
25c7f1
-  : mutex (PTHREAD_MUTEX_INITIALIZER),
25c7f1
+  : summary (0),
25c7f1
+    htm_fastpath (0),
25c7f1
+    mutex (PTHREAD_MUTEX_INITIALIZER),
25c7f1
     c_readers (PTHREAD_COND_INITIALIZER),
25c7f1
     c_writers (PTHREAD_COND_INITIALIZER),
25c7f1
     c_confirmed_writers (PTHREAD_COND_INITIALIZER),
25c7f1
-    summary (0),
25c7f1
     a_readers (0),
25c7f1
     w_readers (0),
25c7f1
     w_writers (0)
25c7f1
--- libitm/libitm_i.h
25c7f1
+++ libitm/libitm_i.h
25c7f1
@@ -336,10 +336,6 @@ extern abi_dispatch *dispatch_htm();
25c7f1
 
25c7f1
 extern gtm_cacheline_mask gtm_mask_stack(gtm_cacheline *, gtm_cacheline_mask);
25c7f1
 
25c7f1
-// Control variable for the HTM fastpath that uses serial mode as fallback.
25c7f1
-// Non-zero if the HTM fastpath is enabled. See gtm_thread::begin_transaction.
25c7f1
-extern uint32_t htm_fastpath;
25c7f1
-
25c7f1
 } // namespace GTM
25c7f1
 
25c7f1
 #endif // LIBITM_I_H
25c7f1
--- libitm/method-serial.cc
25c7f1
+++ libitm/method-serial.cc
25c7f1
@@ -222,13 +222,13 @@ struct htm_mg : public method_group
25c7f1
     // Enable the HTM fastpath if the HW is available.  The fastpath is
25c7f1
     // initially disabled.
25c7f1
 #ifdef USE_HTM_FASTPATH
25c7f1
-    htm_fastpath = htm_init();
25c7f1
+    gtm_thread::serial_lock.set_htm_fastpath(htm_init());
25c7f1
 #endif
25c7f1
   }
25c7f1
   virtual void fini()
25c7f1
   {
25c7f1
     // Disable the HTM fastpath.
25c7f1
-    htm_fastpath = 0;
25c7f1
+    gtm_thread::serial_lock.set_htm_fastpath(0);
25c7f1
   }
25c7f1
 };
25c7f1
 
25c7f1
@@ -288,7 +288,7 @@ GTM::gtm_thread::serialirr_mode ()
25c7f1
 #if defined(USE_HTM_FASTPATH)
25c7f1
   // HTM fastpath.  If we are executing a HW transaction, don't go serial but
25c7f1
   // continue.  See gtm_thread::begin_transaction.
25c7f1
-  if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked()))
25c7f1
+  if (likely(!gtm_thread::serial_lock.htm_fastpath_disabled()))
25c7f1
     return;
25c7f1
 #endif
25c7f1
 
25c7f1
--- libitm/query.cc
25c7f1
+++ libitm/query.cc
25c7f1
@@ -49,7 +49,7 @@ _ITM_inTransaction (void)
25c7f1
   // a transaction and thus we can't deduce this by looking at just the serial
25c7f1
   // lock.  This function isn't used in practice currently, so the easiest
25c7f1
   // way to handle it is to just abort.
25c7f1
-  if (htm_fastpath && htm_transaction_active())
25c7f1
+  if (gtm_thread::serial_lock.get_htm_fastpath() && htm_transaction_active())
25c7f1
     htm_abort();
25c7f1
 #endif
25c7f1
   struct gtm_thread *tx = gtm_thr();
25c7f1
@@ -69,7 +69,7 @@ _ITM_getTransactionId (void)
25c7f1
 {
25c7f1
 #if defined(USE_HTM_FASTPATH)
25c7f1
   // See ITM_inTransaction.
25c7f1
-  if (htm_fastpath && htm_transaction_active())
25c7f1
+  if (gtm_thread::serial_lock.get_htm_fastpath() && htm_transaction_active())
25c7f1
     htm_abort();
25c7f1
 #endif
25c7f1
   struct gtm_thread *tx = gtm_thr();