From 75a9d946d252ce70460144615ca17dbdf2e80fab Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <xhernandez@redhat.com>
Date: Fri, 7 Feb 2020 10:19:57 +0100
Subject: [PATCH 354/355] core: fix memory pool management races

Objects allocated from a per-thread memory pool keep a reference to it
to be able to return the object to the pool when not used anymore. The
object holding this reference can have a long life cycle that could
survive a glfs_fini() call.
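
For illustration only (a simplified, hypothetical sketch, not code taken
from this patch), this is roughly why a pooled object pins its pool: the
object header records the per-thread pool list it came from so that a
mem_put()-like call can push it back there, possibly long after
glfs_fini() has run.

    /* Hypothetical, simplified structures; the real ones live in
     * libglusterfs/src/mem-pool.c. */
    struct per_thread_pool_list;

    typedef struct pooled_obj {
        struct per_thread_pool_list *pool_list; /* owner pool list */
        struct pooled_obj *next;                /* hot/cold list linkage */
    } pooled_obj_t;

    typedef struct per_thread_pool_list {
        pooled_obj_t *hot_list; /* objects returned by this thread */
    } per_thread_pool_list_t;

    /* mem_put()-like sketch: it dereferences obj->pool_list, so that
     * memory must still be valid whenever the object is released. */
    static void obj_put(pooled_obj_t *obj)
    {
        obj->next = obj->pool_list->hot_list;
        obj->pool_list->hot_list = obj;
    }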

This means that it's unsafe to destroy memory pools from glfs_fini().

Another side effect of destroying memory pools from glfs_fini() is that
the TLS variable that points to one of those pools cannot be reset for
all alive threads. This means that any attempt to allocate memory from
those threads will access already freed memory, which is very
dangerous.
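
Only the owning thread can reset its own TLS pointer, so cleanup has to
happen from a per-thread destructor. A self-contained sketch of the
pthread behavior this relies on (illustrative; mark_thread_needs_cleanup()
mirrors the gf_thread_needs_cleanup() helper added by this patch, and
the destructor only runs when the stored value is non-NULL):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_key_t cleanup_key;

    static void thread_cleanup(void *value)
    {
        /* Invoked at thread exit, but only if the key's value != NULL. */
        (void)value;
        printf("releasing per-thread resources\n");
    }

    static void mark_thread_needs_cleanup(void)
    {
        /* The value itself is unused; it only needs to be non-NULL so
         * that thread_cleanup() gets called when this thread exits. */
        (void)pthread_setspecific(cleanup_key, (void *)1);
    }

    static void *worker(void *arg)
    {
        (void)arg;
        mark_thread_needs_cleanup();
        return NULL; /* thread_cleanup() runs after this returns */
    }

    int main(void)
    {
        pthread_t tid;

        (void)pthread_key_create(&cleanup_key, thread_cleanup);
        (void)pthread_create(&tid, NULL, worker, NULL);
        (void)pthread_join(tid, NULL);

        return 0;
    }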

To fix these issues, mem_pools_fini() no longer destroys the pool
lists. The pools are only destroyed at process termination.
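
The process-termination part is handled by a GCC/clang destructor
function (mem_pools_postfini() in this patch). A minimal, self-contained
sketch of that mechanism, with illustrative names:

    #include <stdio.h>
    #include <stdlib.h>

    static void *leftover_pool;

    /* Runs automatically when the process exits (or the library is
     * unloaded), even though nothing calls it explicitly. */
    static __attribute__((destructor)) void pools_postfini(void)
    {
        free(leftover_pool);
        leftover_pool = NULL;
        fprintf(stderr, "pools destroyed at process termination\n");
    }

    int main(void)
    {
        leftover_pool = malloc(128); /* stands in for a per-thread pool list */
        return 0;                    /* no explicit cleanup needed */
    }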

Upstream patch:
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/24099
> Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
> Fixes: bz#1801684
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>

Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
BUG: 1800703
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/192262
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 libglusterfs/src/globals.c            |  13 ++-
 libglusterfs/src/glusterfs/globals.h  |   3 +
 libglusterfs/src/glusterfs/mem-pool.h |  28 ++---
 libglusterfs/src/mem-pool.c           | 201 ++++++++++++++++++----------------
 libglusterfs/src/syncop.c             |   7 ++
 5 files changed, 146 insertions(+), 106 deletions(-)

diff --git a/libglusterfs/src/globals.c b/libglusterfs/src/globals.c
index 02098e6..e433ee8 100644
--- a/libglusterfs/src/globals.c
+++ b/libglusterfs/src/globals.c
@@ -319,7 +319,18 @@ glusterfs_cleanup(void *ptr)
         GF_FREE(thread_syncopctx.groups);
     }
 
-    mem_pool_thread_destructor();
+    mem_pool_thread_destructor(NULL);
+}
+
+void
+gf_thread_needs_cleanup(void)
+{
+    /* The value stored in free_key TLS is not really used for anything, but
+     * pthread implementation doesn't call the TLS destruction function unless
+     * it's != NULL. This function must be called whenever something is
+     * allocated for this thread so that glusterfs_cleanup() will be called
+     * and resources can be released. */
+    (void)pthread_setspecific(free_key, (void *)1);
 }
 
 static void
diff --git a/libglusterfs/src/glusterfs/globals.h b/libglusterfs/src/glusterfs/globals.h
index e218285..31717ed 100644
--- a/libglusterfs/src/glusterfs/globals.h
+++ b/libglusterfs/src/glusterfs/globals.h
@@ -181,6 +181,9 @@ glusterfs_leaseid_exist(void);
 int
 glusterfs_globals_init(glusterfs_ctx_t *ctx);
 
+void
+gf_thread_needs_cleanup(void);
+
 struct tvec_base *
 glusterfs_ctx_tw_get(glusterfs_ctx_t *ctx);
 void
diff --git a/libglusterfs/src/glusterfs/mem-pool.h b/libglusterfs/src/glusterfs/mem-pool.h
index be0a26d..97bf76c 100644
--- a/libglusterfs/src/glusterfs/mem-pool.h
+++ b/libglusterfs/src/glusterfs/mem-pool.h
@@ -245,24 +245,26 @@ typedef struct per_thread_pool {
 } per_thread_pool_t;
 
 typedef struct per_thread_pool_list {
-    /*
-     * These first two members are protected by the global pool lock.  When
-     * a thread first tries to use any pool, we create one of these.  We
-     * link it into the global list using thr_list so the pool-sweeper
-     * thread can find it, and use pthread_setspecific so this thread can
-     * find it.  When the per-thread destructor runs, we "poison" the pool
-     * list to prevent further allocations.  This also signals to the
-     * pool-sweeper thread that the list should be detached and freed after
-     * the next time it's swept.
-     */
+    /* thr_list is used to place the TLS pool_list into the active global list
+     * (pool_threads) or the inactive global list (pool_free_threads). It's
+     * protected by the global pool_lock. */
     struct list_head thr_list;
-    unsigned int poison;
+
+    /* This lock is used to update poison and the hot/cold lists of members
+     * of 'pools' array. */
+    pthread_spinlock_t lock;
+
+    /* This field is used to mark a pool_list as not being owned by any thread.
+     * This means that the sweeper thread won't be cleaning objects stored in
+     * its pools. mem_put() uses it to decide if the object being released is
+     * placed into its original pool_list or directly destroyed. */
+    bool poison;
+
     /*
      * There's really more than one pool, but the actual number is hidden
      * in the implementation code so we just make it a single-element array
      * here.
      */
-    pthread_spinlock_t lock;
     per_thread_pool_t pools[1];
 } per_thread_pool_list_t;
 
@@ -307,7 +309,7 @@ void
 mem_pool_destroy(struct mem_pool *pool);
 
 void
-mem_pool_thread_destructor(void);
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list);
 
 void
 gf_mem_acct_enable_set(void *ctx);
diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c
index d88041d..2b41c01 100644
--- a/libglusterfs/src/mem-pool.c
+++ b/libglusterfs/src/mem-pool.c
@@ -367,7 +367,6 @@ static __thread per_thread_pool_list_t *thread_pool_list = NULL;
 #define POOL_SWEEP_SECS 30
 
 typedef struct {
-    struct list_head death_row;
     pooled_obj_hdr_t *cold_lists[N_COLD_LISTS];
     unsigned int n_cold_lists;
 } sweep_state_t;
@@ -384,36 +383,33 @@ static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
 static unsigned int init_count = 0;
 static pthread_t sweeper_tid;
 
-gf_boolean_t
+static bool
 collect_garbage(sweep_state_t *state, per_thread_pool_list_t *pool_list)
 {
     unsigned int i;
     per_thread_pool_t *pt_pool;
-    gf_boolean_t poisoned;
 
     (void)pthread_spin_lock(&pool_list->lock);
 
-    poisoned = pool_list->poison != 0;
-    if (!poisoned) {
-        for (i = 0; i < NPOOLS; ++i) {
-            pt_pool = &pool_list->pools[i];
-            if (pt_pool->cold_list) {
-                if (state->n_cold_lists >= N_COLD_LISTS) {
-                    break;
-                }
-                state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
+    for (i = 0; i < NPOOLS; ++i) {
+        pt_pool = &pool_list->pools[i];
+        if (pt_pool->cold_list) {
+            if (state->n_cold_lists >= N_COLD_LISTS) {
+                (void)pthread_spin_unlock(&pool_list->lock);
+                return true;
             }
-            pt_pool->cold_list = pt_pool->hot_list;
-            pt_pool->hot_list = NULL;
+            state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
         }
+        pt_pool->cold_list = pt_pool->hot_list;
+        pt_pool->hot_list = NULL;
     }
 
     (void)pthread_spin_unlock(&pool_list->lock);
 
-    return poisoned;
+    return false;
 }
 
-void
+static void
 free_obj_list(pooled_obj_hdr_t *victim)
 {
     pooled_obj_hdr_t *next;
@@ -425,82 +421,96 @@ free_obj_list(pooled_obj_hdr_t *victim)
     }
 }
 
-void *
+static void *
 pool_sweeper(void *arg)
 {
     sweep_state_t state;
     per_thread_pool_list_t *pool_list;
-    per_thread_pool_list_t *next_pl;
-    per_thread_pool_t *pt_pool;
-    unsigned int i;
-    gf_boolean_t poisoned;
+    uint32_t i;
+    bool pending;
 
     /*
      * This is all a bit inelegant, but the point is to avoid doing
      * expensive things (like freeing thousands of objects) while holding a
-     * global lock.  Thus, we split each iteration into three passes, with
+     * global lock.  Thus, we split each iteration into two passes, with
      * only the first and fastest holding the lock.
      */
 
+    pending = true;
+
     for (;;) {
-        sleep(POOL_SWEEP_SECS);
+        /* If we know there's pending work to do (or it's the first run), we
+         * do collect garbage more often. */
+        sleep(pending ? POOL_SWEEP_SECS / 5 : POOL_SWEEP_SECS);
+
         (void)pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
-        INIT_LIST_HEAD(&state.death_row);
         state.n_cold_lists = 0;
+        pending = false;
 
         /* First pass: collect stuff that needs our attention. */
         (void)pthread_mutex_lock(&pool_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &pool_threads, thr_list)
+        list_for_each_entry(pool_list, &pool_threads, thr_list)
         {
-            (void)pthread_mutex_unlock(&pool_lock);
-            poisoned = collect_garbage(&state, pool_list);
-            (void)pthread_mutex_lock(&pool_lock);
-
-            if (poisoned) {
-                list_move(&pool_list->thr_list, &state.death_row);
+            if (collect_garbage(&state, pool_list)) {
+                pending = true;
             }
         }
         (void)pthread_mutex_unlock(&pool_lock);
 
-        /* Second pass: free dead pools. */
-        (void)pthread_mutex_lock(&pool_free_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &state.death_row, thr_list)
-        {
-            for (i = 0; i < NPOOLS; ++i) {
-                pt_pool = &pool_list->pools[i];
-                free_obj_list(pt_pool->cold_list);
-                free_obj_list(pt_pool->hot_list);
-                pt_pool->hot_list = pt_pool->cold_list = NULL;
-            }
-            list_del(&pool_list->thr_list);
-            list_add(&pool_list->thr_list, &pool_free_threads);
-        }
-        (void)pthread_mutex_unlock(&pool_free_lock);
-
-        /* Third pass: free cold objects from live pools. */
+        /* Second pass: free cold objects from live pools. */
         for (i = 0; i < state.n_cold_lists; ++i) {
             free_obj_list(state.cold_lists[i]);
         }
         (void)pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
     }
+
+    return NULL;
 }
 
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
-    per_thread_pool_list_t *pool_list = thread_pool_list;
-
-    /* The pool-sweeper thread will take it from here.
-     *
-     * We can change 'poison' here without taking locks because the change
-     * itself doesn't interact with other parts of the code and a simple write
-     * is already atomic from the point of view of the processor.
-     *
-     * This change can modify what mem_put() does, but both possibilities are
-     * fine until the sweeper thread kicks in. The real synchronization must be
-     * between mem_put() and the sweeper thread. */
+    per_thread_pool_t *pt_pool;
+    uint32_t i;
+
+    if (pool_list == NULL) {
+        pool_list = thread_pool_list;
+    }
+
+    /* The current thread is terminating. None of the allocated objects will
+     * be used again. We can directly destroy them here instead of delaying
+     * it until the next sweeper loop. */
     if (pool_list != NULL) {
-        pool_list->poison = 1;
+        /* Remove pool_list from the global list to avoid that sweeper
+         * could touch it. */
+        pthread_mutex_lock(&pool_lock);
+        list_del(&pool_list->thr_list);
+        pthread_mutex_unlock(&pool_lock);
+
+        /* We need to protect hot/cold changes from potential mem_put() calls
+         * that reference this pool_list. Once poison is set to true, we are
+         * sure that no one else will touch hot/cold lists. The only possible
+         * race is when at the same moment a mem_put() is adding a new item
+         * to the hot list. We protect from that by taking pool_list->lock.
+         * After that we don't need the lock to destroy the hot/cold lists. */
+        pthread_spin_lock(&pool_list->lock);
+        pool_list->poison = true;
+        pthread_spin_unlock(&pool_list->lock);
+
+        for (i = 0; i < NPOOLS; i++) {
+            pt_pool = &pool_list->pools[i];
+
+            free_obj_list(pt_pool->hot_list);
+            pt_pool->hot_list = NULL;
+
+            free_obj_list(pt_pool->cold_list);
+            pt_pool->cold_list = NULL;
+        }
+
+        pthread_mutex_lock(&pool_free_lock);
+        list_add(&pool_list->thr_list, &pool_free_threads);
+        pthread_mutex_unlock(&pool_free_lock);
+
         thread_pool_list = NULL;
     }
 }
@@ -528,6 +538,30 @@ mem_pools_preinit(void)
     init_done = GF_MEMPOOL_INIT_EARLY;
 }
 
+static __attribute__((destructor)) void
+mem_pools_postfini(void)
+{
+    per_thread_pool_list_t *pool_list, *next;
+
+    /* This is part of a process shutdown (or dlclose()) which means that
+     * most probably all threads should be stopped. However this is not the
+     * case for gluster and there are even legitimate situations in which we
+     * could have some threads alive. What is sure is that none of those
+     * threads should be using anything from this library, so destroying
+     * everything here should be fine and safe. */
+
+    list_for_each_entry_safe(pool_list, next, &pool_threads, thr_list)
+    {
+        mem_pool_thread_destructor(pool_list);
+    }
+
+    list_for_each_entry_safe(pool_list, next, &pool_free_threads, thr_list)
+    {
+        list_del(&pool_list->thr_list);
+        FREE(pool_list);
+    }
+}
+
 /* Call mem_pools_init() once threading has been configured completely. This
  * prevent the pool_sweeper thread from getting killed once the main() thread
  * exits during deamonizing. */
@@ -560,10 +594,6 @@ mem_pools_fini(void)
              */
             break;
         case 1: {
-            per_thread_pool_list_t *pool_list;
-            per_thread_pool_list_t *next_pl;
-            unsigned int i;
-
             /* if mem_pools_init() was not called, sweeper_tid will be invalid
              * and the functions will error out. That is not critical. In all
              * other cases, the sweeper_tid will be valid and the thread gets
@@ -571,32 +601,11 @@ mem_pools_fini(void)
             (void)pthread_cancel(sweeper_tid);
             (void)pthread_join(sweeper_tid, NULL);
 
-            /* At this point all threads should have already terminated, so
-             * it should be safe to destroy all pending per_thread_pool_list_t
-             * structures that are stored for each thread. */
-            mem_pool_thread_destructor();
-
-            /* free all objects from all pools */
-            list_for_each_entry_safe(pool_list, next_pl, &pool_threads,
-                                     thr_list)
-            {
-                for (i = 0; i < NPOOLS; ++i) {
-                    free_obj_list(pool_list->pools[i].hot_list);
-                    free_obj_list(pool_list->pools[i].cold_list);
-                    pool_list->pools[i].hot_list = NULL;
-                    pool_list->pools[i].cold_list = NULL;
-                }
-
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
-
-            list_for_each_entry_safe(pool_list, next_pl, &pool_free_threads,
-                                     thr_list)
-            {
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
+            /* There could be threads still running in some cases, so we can't
+             * destroy pool_lists in use. We can also not destroy unused
+             * pool_lists because some allocated objects may still be pointing
+             * to them. */
+            mem_pool_thread_destructor(NULL);
 
             init_done = GF_MEMPOOL_INIT_DESTROY;
             /* Fall through. */
@@ -617,7 +626,7 @@ mem_pools_fini(void)
 {
 }
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
 }
 
@@ -738,13 +747,21 @@ mem_get_pool_list(void)
         }
     }
 
+    /* There's no need to take pool_list->lock, because this is already an
+     * atomic operation and we don't need to synchronize it with any change
+     * in hot/cold lists. */
+    pool_list->poison = false;
+
     (void)pthread_mutex_lock(&pool_lock);
-    pool_list->poison = 0;
     list_add(&pool_list->thr_list, &pool_threads);
     (void)pthread_mutex_unlock(&pool_lock);
 
     thread_pool_list = pool_list;
 
+    /* Ensure that all memory objects associated to the new pool_list are
+     * destroyed when the thread terminates. */
+    gf_thread_needs_cleanup();
+
     return pool_list;
 }
 
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
index 2eb7b49..0de53c6 100644
--- a/libglusterfs/src/syncop.c
+++ b/libglusterfs/src/syncop.c
@@ -97,6 +97,13 @@ syncopctx_setfsgroups(int count, const void *groups)
 
     /* set/reset the ngrps, this is where reset of groups is handled */
     opctx->ngrps = count;
+
+    if ((opctx->valid & SYNCOPCTX_GROUPS) == 0) {
+        /* This is the first time we are storing groups into the TLS structure
+         * so we mark the current thread so that it will be properly cleaned
+         * up when the thread terminates. */
+        gf_thread_needs_cleanup();
+    }
     opctx->valid |= SYNCOPCTX_GROUPS;
 
 out:
-- 
1.8.3.1