From 75a9d946d252ce70460144615ca17dbdf2e80fab Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <xhernandez@redhat.com>
Date: Fri, 7 Feb 2020 10:19:57 +0100
Subject: [PATCH 354/355] core: fix memory pool management races

Objects allocated from a per-thread memory pool keep a reference to it
so that the object can be returned to the pool once it's no longer
used. The object holding this reference can have a long life cycle that
could survive a glfs_fini() call.

This means that it's unsafe to destroy memory pools from glfs_fini().

Another side effect of destroying memory pools from glfs_fini() is that
the TLS variable that points to one of those pools cannot be reset for
all alive threads.  This means that any attempt to allocate memory from
those threads will access already free'd memory, which is very
dangerous.

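For context, the per-thread cleanup added by this patch relies on a
standard pthread property: a destructor registered with
pthread_key_create() runs at thread exit only when the key's value is
non-NULL. That's why gf_thread_needs_cleanup() stores a dummy non-NULL
value in free_key. A minimal standalone sketch of this behavior
(illustrative only, not part of the patch):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_key_t key;

    /* Runs at thread exit, but only for threads whose key value is
     * non-NULL at that moment. */
    static void
    cleanup(void *value)
    {
        printf("cleanup ran (value=%p)\n", value);
    }

    static void *
    worker(void *arg)
    {
        /* Storing a dummy non-NULL value is what guarantees that
         * cleanup() will be invoked when this thread terminates. */
        (void)pthread_setspecific(key, (void *)1);
        return NULL;
    }

    int
    main(void)
    {
        pthread_t tid;

        (void)pthread_key_create(&key, cleanup);
        (void)pthread_create(&tid, NULL, worker, NULL);
        (void)pthread_join(tid, NULL);
        return 0;
    }
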
To fix these issues, mem_pools_fini() no longer destroys the pool
lists. The pools are only destroyed at process termination.

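The final destruction at process termination uses a GCC/Clang
destructor function (the new mem_pools_postfini() below). A minimal
standalone sketch of that mechanism (illustrative only, not part of the
patch):

    #include <stdio.h>

    /* Invoked automatically after main() returns, on exit(), or when
     * the shared library is unloaded with dlclose(), making it a safe
     * place for last-chance global cleanup. */
    static __attribute__((destructor)) void
    on_process_exit(void)
    {
        printf("process-termination cleanup ran\n");
    }

    int
    main(void)
    {
        printf("main done\n");
        return 0;
    }
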
Upstream patch:
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/24099
> Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
> Fixes: bz#1801684
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>

Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
BUG: 1800703
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/192262
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 libglusterfs/src/globals.c            |  13 ++-
 libglusterfs/src/glusterfs/globals.h  |   3 +
 libglusterfs/src/glusterfs/mem-pool.h |  28 ++---
 libglusterfs/src/mem-pool.c           | 201 ++++++++++++++++++----------------
 libglusterfs/src/syncop.c             |   7 ++
 5 files changed, 146 insertions(+), 106 deletions(-)

diff --git a/libglusterfs/src/globals.c b/libglusterfs/src/globals.c
index 02098e6..e433ee8 100644
--- a/libglusterfs/src/globals.c
+++ b/libglusterfs/src/globals.c
@@ -319,7 +319,18 @@ glusterfs_cleanup(void *ptr)
         GF_FREE(thread_syncopctx.groups);
     }
 
-    mem_pool_thread_destructor();
+    mem_pool_thread_destructor(NULL);
+}
+
+void
+gf_thread_needs_cleanup(void)
+{
+    /* The value stored in free_key TLS is not really used for anything, but
+     * pthread implementation doesn't call the TLS destruction function unless
+     * it's != NULL. This function must be called whenever something is
+     * allocated for this thread so that glusterfs_cleanup() will be called
+     * and resources can be released. */
+    (void)pthread_setspecific(free_key, (void *)1);
 }
 
 static void
diff --git a/libglusterfs/src/glusterfs/globals.h b/libglusterfs/src/glusterfs/globals.h
index e218285..31717ed 100644
--- a/libglusterfs/src/glusterfs/globals.h
+++ b/libglusterfs/src/glusterfs/globals.h
@@ -181,6 +181,9 @@ glusterfs_leaseid_exist(void);
 int
 glusterfs_globals_init(glusterfs_ctx_t *ctx);
 
+void
+gf_thread_needs_cleanup(void);
+
 struct tvec_base *
 glusterfs_ctx_tw_get(glusterfs_ctx_t *ctx);
 void
diff --git a/libglusterfs/src/glusterfs/mem-pool.h b/libglusterfs/src/glusterfs/mem-pool.h
index be0a26d..97bf76c 100644
--- a/libglusterfs/src/glusterfs/mem-pool.h
+++ b/libglusterfs/src/glusterfs/mem-pool.h
@@ -245,24 +245,26 @@ typedef struct per_thread_pool {
 } per_thread_pool_t;
 
 typedef struct per_thread_pool_list {
-    /*
-     * These first two members are protected by the global pool lock.  When
-     * a thread first tries to use any pool, we create one of these.  We
-     * link it into the global list using thr_list so the pool-sweeper
-     * thread can find it, and use pthread_setspecific so this thread can
-     * find it.  When the per-thread destructor runs, we "poison" the pool
-     * list to prevent further allocations.  This also signals to the
-     * pool-sweeper thread that the list should be detached and freed after
-     * the next time it's swept.
-     */
+    /* thr_list is used to place the TLS pool_list into the active global list
+     * (pool_threads) or the inactive global list (pool_free_threads). It's
+     * protected by the global pool_lock. */
     struct list_head thr_list;
-    unsigned int poison;
+
+    /* This lock is used to update poison and the hot/cold lists of members
+     * of 'pools' array. */
+    pthread_spinlock_t lock;
+
+    /* This field is used to mark a pool_list as not being owned by any thread.
+     * This means that the sweeper thread won't be cleaning objects stored in
+     * its pools. mem_put() uses it to decide if the object being released is
+     * placed into its original pool_list or directly destroyed. */
+    bool poison;
+
     /*
      * There's really more than one pool, but the actual number is hidden
      * in the implementation code so we just make it a single-element array
      * here.
      */
-    pthread_spinlock_t lock;
     per_thread_pool_t pools[1];
 } per_thread_pool_list_t;
 
@@ -307,7 +309,7 @@ void
 mem_pool_destroy(struct mem_pool *pool);
 
 void
-mem_pool_thread_destructor(void);
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list);
 
 void
 gf_mem_acct_enable_set(void *ctx);
diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c
index d88041d..2b41c01 100644
--- a/libglusterfs/src/mem-pool.c
+++ b/libglusterfs/src/mem-pool.c
@@ -367,7 +367,6 @@ static __thread per_thread_pool_list_t *thread_pool_list = NULL;
 #define POOL_SWEEP_SECS 30
 
 typedef struct {
-    struct list_head death_row;
     pooled_obj_hdr_t *cold_lists[N_COLD_LISTS];
     unsigned int n_cold_lists;
 } sweep_state_t;
@@ -384,36 +383,33 @@ static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
 static unsigned int init_count = 0;
 static pthread_t sweeper_tid;
 
-gf_boolean_t
+static bool
 collect_garbage(sweep_state_t *state, per_thread_pool_list_t *pool_list)
 {
     unsigned int i;
     per_thread_pool_t *pt_pool;
-    gf_boolean_t poisoned;
 
     (void)pthread_spin_lock(&pool_list->lock);
 
-    poisoned = pool_list->poison != 0;
-    if (!poisoned) {
-        for (i = 0; i < NPOOLS; ++i) {
-            pt_pool = &pool_list->pools[i];
-            if (pt_pool->cold_list) {
-                if (state->n_cold_lists >= N_COLD_LISTS) {
-                    break;
-                }
-                state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
+    for (i = 0; i < NPOOLS; ++i) {
+        pt_pool = &pool_list->pools[i];
+        if (pt_pool->cold_list) {
+            if (state->n_cold_lists >= N_COLD_LISTS) {
+                (void)pthread_spin_unlock(&pool_list->lock);
+                return true;
             }
-            pt_pool->cold_list = pt_pool->hot_list;
-            pt_pool->hot_list = NULL;
+            state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
         }
+        pt_pool->cold_list = pt_pool->hot_list;
+        pt_pool->hot_list = NULL;
     }
 
     (void)pthread_spin_unlock(&pool_list->lock);
 
-    return poisoned;
+    return false;
 }
 
-void
+static void
 free_obj_list(pooled_obj_hdr_t *victim)
 {
     pooled_obj_hdr_t *next;
@@ -425,82 +421,96 @@ free_obj_list(pooled_obj_hdr_t *victim)
     }
 }
 
-void *
+static void *
 pool_sweeper(void *arg)
 {
     sweep_state_t state;
     per_thread_pool_list_t *pool_list;
-    per_thread_pool_list_t *next_pl;
-    per_thread_pool_t *pt_pool;
-    unsigned int i;
-    gf_boolean_t poisoned;
+    uint32_t i;
+    bool pending;
 
     /*
      * This is all a bit inelegant, but the point is to avoid doing
      * expensive things (like freeing thousands of objects) while holding a
-     * global lock.  Thus, we split each iteration into three passes, with
+     * global lock.  Thus, we split each iteration into two passes, with
      * only the first and fastest holding the lock.
      */
 
+    pending = true;
+
     for (;;) {
-        sleep(POOL_SWEEP_SECS);
+        /* If we know there's pending work to do (or it's the first run), we
+         * do collect garbage more often. */
+        sleep(pending ? POOL_SWEEP_SECS / 5 : POOL_SWEEP_SECS);
+
         (void)pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
-        INIT_LIST_HEAD(&state.death_row);
         state.n_cold_lists = 0;
+        pending = false;
 
         /* First pass: collect stuff that needs our attention. */
         (void)pthread_mutex_lock(&pool_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &pool_threads, thr_list)
+        list_for_each_entry(pool_list, &pool_threads, thr_list)
         {
-            (void)pthread_mutex_unlock(&pool_lock);
-            poisoned = collect_garbage(&state, pool_list);
-            (void)pthread_mutex_lock(&pool_lock);
-
-            if (poisoned) {
-                list_move(&pool_list->thr_list, &state.death_row);
+            if (collect_garbage(&state, pool_list)) {
+                pending = true;
             }
         }
         (void)pthread_mutex_unlock(&pool_lock);
 
-        /* Second pass: free dead pools. */
-        (void)pthread_mutex_lock(&pool_free_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &state.death_row, thr_list)
-        {
-            for (i = 0; i < NPOOLS; ++i) {
-                pt_pool = &pool_list->pools[i];
-                free_obj_list(pt_pool->cold_list);
-                free_obj_list(pt_pool->hot_list);
-                pt_pool->hot_list = pt_pool->cold_list = NULL;
-            }
-            list_del(&pool_list->thr_list);
-            list_add(&pool_list->thr_list, &pool_free_threads);
-        }
-        (void)pthread_mutex_unlock(&pool_free_lock);
-
-        /* Third pass: free cold objects from live pools. */
+        /* Second pass: free cold objects from live pools. */
         for (i = 0; i < state.n_cold_lists; ++i) {
            free_obj_list(state.cold_lists[i]);
        }
        (void)pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
    }
+
+    return NULL;
 }
 
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
-    per_thread_pool_list_t *pool_list = thread_pool_list;
-
-    /* The pool-sweeper thread will take it from here.
-     *
-     * We can change 'poison' here without taking locks because the change
-     * itself doesn't interact with other parts of the code and a simple write
-     * is already atomic from the point of view of the processor.
-     *
-     * This change can modify what mem_put() does, but both possibilities are
-     * fine until the sweeper thread kicks in. The real synchronization must be
-     * between mem_put() and the sweeper thread. */
+    per_thread_pool_t *pt_pool;
+    uint32_t i;
+
+    if (pool_list == NULL) {
+        pool_list = thread_pool_list;
+    }
+
+    /* The current thread is terminating. None of the allocated objects will
+     * be used again. We can directly destroy them here instead of delaying
+     * it until the next sweeper loop. */
     if (pool_list != NULL) {
-        pool_list->poison = 1;
+        /* Remove pool_list from the global list to avoid that sweeper
+         * could touch it. */
+        pthread_mutex_lock(&pool_lock);
+        list_del(&pool_list->thr_list);
+        pthread_mutex_unlock(&pool_lock);
+
+        /* We need to protect hot/cold changes from potential mem_put() calls
+         * that reference this pool_list. Once poison is set to true, we are
+         * sure that no one else will touch hot/cold lists. The only possible
+         * race is when at the same moment a mem_put() is adding a new item
+         * to the hot list. We protect from that by taking pool_list->lock.
+         * After that we don't need the lock to destroy the hot/cold lists. */
+        pthread_spin_lock(&pool_list->lock);
+        pool_list->poison = true;
+        pthread_spin_unlock(&pool_list->lock);
+
+        for (i = 0; i < NPOOLS; i++) {
+            pt_pool = &pool_list->pools[i];
+
+            free_obj_list(pt_pool->hot_list);
+            pt_pool->hot_list = NULL;
+
+            free_obj_list(pt_pool->cold_list);
+            pt_pool->cold_list = NULL;
+        }
+
+        pthread_mutex_lock(&pool_free_lock);
+        list_add(&pool_list->thr_list, &pool_free_threads);
+        pthread_mutex_unlock(&pool_free_lock);
+
         thread_pool_list = NULL;
     }
 }
@@ -528,6 +538,30 @@ mem_pools_preinit(void)
     init_done = GF_MEMPOOL_INIT_EARLY;
 }
 
+static __attribute__((destructor)) void
+mem_pools_postfini(void)
+{
+    per_thread_pool_list_t *pool_list, *next;
+
+    /* This is part of a process shutdown (or dlclose()) which means that
+     * most probably all threads should be stopped. However this is not the
+     * case for gluster and there are even legitimate situations in which we
+     * could have some threads alive. What is sure is that none of those
+     * threads should be using anything from this library, so destroying
+     * everything here should be fine and safe. */
+
+    list_for_each_entry_safe(pool_list, next, &pool_threads, thr_list)
+    {
+        mem_pool_thread_destructor(pool_list);
+    }
+
+    list_for_each_entry_safe(pool_list, next, &pool_free_threads, thr_list)
+    {
+        list_del(&pool_list->thr_list);
+        FREE(pool_list);
+    }
+}
+
 /* Call mem_pools_init() once threading has been configured completely. This
  * prevent the pool_sweeper thread from getting killed once the main() thread
  * exits during deamonizing. */
@@ -560,10 +594,6 @@ mem_pools_fini(void)
              */
             break;
         case 1: {
-            per_thread_pool_list_t *pool_list;
-            per_thread_pool_list_t *next_pl;
-            unsigned int i;
-
             /* if mem_pools_init() was not called, sweeper_tid will be invalid
              * and the functions will error out. That is not critical. In all
              * other cases, the sweeper_tid will be valid and the thread gets
@@ -571,32 +601,11 @@ mem_pools_fini(void)
             (void)pthread_cancel(sweeper_tid);
             (void)pthread_join(sweeper_tid, NULL);
 
-            /* At this point all threads should have already terminated, so
-             * it should be safe to destroy all pending per_thread_pool_list_t
-             * structures that are stored for each thread. */
-            mem_pool_thread_destructor();
-
-            /* free all objects from all pools */
-            list_for_each_entry_safe(pool_list, next_pl, &pool_threads,
-                                     thr_list)
-            {
-                for (i = 0; i < NPOOLS; ++i) {
-                    free_obj_list(pool_list->pools[i].hot_list);
-                    free_obj_list(pool_list->pools[i].cold_list);
-                    pool_list->pools[i].hot_list = NULL;
-                    pool_list->pools[i].cold_list = NULL;
-                }
-
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
-
-            list_for_each_entry_safe(pool_list, next_pl, &pool_free_threads,
-                                     thr_list)
-            {
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
+            /* There could be threads still running in some cases, so we can't
+             * destroy pool_lists in use. We can also not destroy unused
+             * pool_lists because some allocated objects may still be pointing
+             * to them. */
+            mem_pool_thread_destructor(NULL);
 
             init_done = GF_MEMPOOL_INIT_DESTROY;
             /* Fall through. */
@@ -617,7 +626,7 @@ mem_pools_fini(void)
 {
 }
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
 }
 
@@ -738,13 +747,21 @@ mem_get_pool_list(void)
         }
     }
 
+    /* There's no need to take pool_list->lock, because this is already an
+     * atomic operation and we don't need to synchronize it with any change
+     * in hot/cold lists. */
+    pool_list->poison = false;
+
     (void)pthread_mutex_lock(&pool_lock);
-    pool_list->poison = 0;
     list_add(&pool_list->thr_list, &pool_threads);
     (void)pthread_mutex_unlock(&pool_lock);
 
     thread_pool_list = pool_list;
 
+    /* Ensure that all memory objects associated to the new pool_list are
+     * destroyed when the thread terminates. */
+    gf_thread_needs_cleanup();
+
     return pool_list;
 }
 
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
index 2eb7b49..0de53c6 100644
--- a/libglusterfs/src/syncop.c
+++ b/libglusterfs/src/syncop.c
@@ -97,6 +97,13 @@ syncopctx_setfsgroups(int count, const void *groups)
 
     /* set/reset the ngrps, this is where reset of groups is handled */
     opctx->ngrps = count;
+
+    if ((opctx->valid & SYNCOPCTX_GROUPS) == 0) {
+        /* This is the first time we are storing groups into the TLS structure
+         * so we mark the current thread so that it will be properly cleaned
+         * up when the thread terminates. */
+        gf_thread_needs_cleanup();
+    }
     opctx->valid |= SYNCOPCTX_GROUPS;
 
 out:
-- 
1.8.3.1