From 75a9d946d252ce70460144615ca17dbdf2e80fab Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <xhernandez@redhat.com>
Date: Fri, 7 Feb 2020 10:19:57 +0100
Subject: [PATCH 354/355] core: fix memory pool management races

Objects allocated from a per-thread memory pool keep a reference to it
so that the object can be returned to the pool once it is no longer
used. The object holding this reference can have a long life cycle that
may survive a glfs_fini() call.

This means that it's unsafe to destroy memory pools from glfs_fini().

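For illustration, the dependency that makes this unsafe looks roughly
like this (a simplified sketch with hypothetical names, not the real
pooled_obj_hdr_t / per_thread_pool_list_t definitions from mem-pool.c):

    /* Sketch only: stand-ins for the real mem-pool.c structures. */
    typedef struct obj_hdr obj_hdr_t;

    typedef struct pool_list {
        obj_hdr_t *free_list;   /* recycled objects of one thread */
    } pool_list_t;

    struct obj_hdr {
        pool_list_t *owner;     /* pool list this object returns to */
        obj_hdr_t *next;
    };

    /* A mem_put()-style release dereferences the owner pool list. If
     * glfs_fini() had already destroyed that pool list, both lines
     * below would touch freed memory. */
    static void
    put_obj(obj_hdr_t *obj)
    {
        obj->next = obj->owner->free_list;
        obj->owner->free_list = obj;
    }
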
Another side effect of destroying memory pools from glfs_fini() is that
the TLS variable that points to one of those pools cannot be reset for
all live threads. Any later attempt to allocate memory from those
threads will access already-freed memory, which is very dangerous.

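The per-thread cleanup path goes through a pthread TLS destructor, and
POSIX only runs such destructors at thread exit when the key's value is
non-NULL, which is what the new gf_thread_needs_cleanup() helper
ensures. A minimal standalone illustration of that behaviour
(illustrative only, not part of the patch):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_key_t key;

    static void
    cleanup(void *value)
    {
        /* Runs at thread exit, but only because the value is non-NULL. */
        printf("destructor ran (value=%p)\n", value);
    }

    static void *
    worker(void *arg)
    {
        /* Same trick as gf_thread_needs_cleanup(): store a dummy
         * non-NULL value so the destructor is guaranteed to fire. */
        (void)pthread_setspecific(key, (void *)1);
        return NULL;
    }

    int
    main(void)
    {
        pthread_t t;
        pthread_key_create(&key, cleanup);
        pthread_create(&t, NULL, worker, NULL);
        pthread_join(t, NULL);
        return 0;
    }
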
To fix these issues, mem_pools_fini() no longer destroys the pool
lists. The pools are now destroyed only at process termination.

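The destruction at process termination is done from a GCC/Clang
destructor function (mem_pools_postfini() in the diff below). A
standalone sketch of that mechanism (illustrative only):

    #include <stdio.h>

    /* GCC and Clang run functions marked __attribute__((destructor))
     * when the process exits or the shared library is dlclose()d. In
     * the patch, mem_pools_postfini() uses this to free whatever pool
     * lists are still alive. */
    static __attribute__((destructor)) void
    teardown(void)
    {
        printf("process-exit teardown\n");
    }

    int
    main(void)
    {
        return 0; /* teardown() still runs after main() returns */
    }
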
Upstream patch:
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/24099
> Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
> Fixes: bz#1801684
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>

Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
BUG: 1800703
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/192262
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 libglusterfs/src/globals.c            |  13 ++-
 libglusterfs/src/glusterfs/globals.h  |   3 +
 libglusterfs/src/glusterfs/mem-pool.h |  28 ++---
 libglusterfs/src/mem-pool.c           | 201 ++++++++++++++++++----------------
 libglusterfs/src/syncop.c             |   7 ++
 5 files changed, 146 insertions(+), 106 deletions(-)

diff --git a/libglusterfs/src/globals.c b/libglusterfs/src/globals.c
index 02098e6..e433ee8 100644
--- a/libglusterfs/src/globals.c
+++ b/libglusterfs/src/globals.c
@@ -319,7 +319,18 @@ glusterfs_cleanup(void *ptr)
         GF_FREE(thread_syncopctx.groups);
     }
 
-    mem_pool_thread_destructor();
+    mem_pool_thread_destructor(NULL);
+}
+
+void
+gf_thread_needs_cleanup(void)
+{
+    /* The value stored in free_key TLS is not really used for anything, but
+     * the pthread implementation doesn't call the TLS destructor unless it's
+     * != NULL. This function must be called whenever something is allocated
+     * for this thread so that glusterfs_cleanup() will be called and
+     * resources can be released. */
+    (void)pthread_setspecific(free_key, (void *)1);
 }
 
 static void
diff --git a/libglusterfs/src/glusterfs/globals.h b/libglusterfs/src/glusterfs/globals.h
index e218285..31717ed 100644
--- a/libglusterfs/src/glusterfs/globals.h
+++ b/libglusterfs/src/glusterfs/globals.h
@@ -181,6 +181,9 @@ glusterfs_leaseid_exist(void);
 int
 glusterfs_globals_init(glusterfs_ctx_t *ctx);
 
+void
+gf_thread_needs_cleanup(void);
+
 struct tvec_base *
 glusterfs_ctx_tw_get(glusterfs_ctx_t *ctx);
 void
diff --git a/libglusterfs/src/glusterfs/mem-pool.h b/libglusterfs/src/glusterfs/mem-pool.h
index be0a26d..97bf76c 100644
--- a/libglusterfs/src/glusterfs/mem-pool.h
+++ b/libglusterfs/src/glusterfs/mem-pool.h
@@ -245,24 +245,26 @@ typedef struct per_thread_pool {
 } per_thread_pool_t;
 
 typedef struct per_thread_pool_list {
-    /*
-     * These first two members are protected by the global pool lock.  When
-     * a thread first tries to use any pool, we create one of these.  We
-     * link it into the global list using thr_list so the pool-sweeper
-     * thread can find it, and use pthread_setspecific so this thread can
-     * find it.  When the per-thread destructor runs, we "poison" the pool
-     * list to prevent further allocations.  This also signals to the
-     * pool-sweeper thread that the list should be detached and freed after
-     * the next time it's swept.
-     */
+    /* thr_list is used to place the TLS pool_list into the active global list
+     * (pool_threads) or the inactive global list (pool_free_threads). It's
+     * protected by the global pool_lock. */
     struct list_head thr_list;
-    unsigned int poison;
+
+    /* This lock is used to update 'poison' and the hot/cold lists of the
+     * members of the 'pools' array. */
+    pthread_spinlock_t lock;
+
+    /* This field is used to mark a pool_list as not being owned by any thread.
+     * This means that the sweeper thread won't be cleaning objects stored in
+     * its pools. mem_put() uses it to decide if the object being released is
+     * placed into its original pool_list or directly destroyed. */
+    bool poison;
+
     /*
      * There's really more than one pool, but the actual number is hidden
      * in the implementation code so we just make it a single-element array
      * here.
      */
-    pthread_spinlock_t lock;
     per_thread_pool_t pools[1];
 } per_thread_pool_list_t;
 
@@ -307,7 +309,7 @@ void
 mem_pool_destroy(struct mem_pool *pool);
 
 void
-mem_pool_thread_destructor(void);
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list);
 
 void
 gf_mem_acct_enable_set(void *ctx);
diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c
index d88041d..2b41c01 100644
--- a/libglusterfs/src/mem-pool.c
+++ b/libglusterfs/src/mem-pool.c
@@ -367,7 +367,6 @@ static __thread per_thread_pool_list_t *thread_pool_list = NULL;
 #define POOL_SWEEP_SECS 30
 
 typedef struct {
-    struct list_head death_row;
     pooled_obj_hdr_t *cold_lists[N_COLD_LISTS];
     unsigned int n_cold_lists;
 } sweep_state_t;
@@ -384,36 +383,33 @@ static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
 static unsigned int init_count = 0;
 static pthread_t sweeper_tid;
 
-gf_boolean_t
+static bool
 collect_garbage(sweep_state_t *state, per_thread_pool_list_t *pool_list)
 {
     unsigned int i;
     per_thread_pool_t *pt_pool;
-    gf_boolean_t poisoned;
 
     (void)pthread_spin_lock(&pool_list->lock);
 
-    poisoned = pool_list->poison != 0;
-    if (!poisoned) {
-        for (i = 0; i < NPOOLS; ++i) {
-            pt_pool = &pool_list->pools[i];
-            if (pt_pool->cold_list) {
-                if (state->n_cold_lists >= N_COLD_LISTS) {
-                    break;
-                }
-                state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
+    for (i = 0; i < NPOOLS; ++i) {
+        pt_pool = &pool_list->pools[i];
+        if (pt_pool->cold_list) {
+            if (state->n_cold_lists >= N_COLD_LISTS) {
+                (void)pthread_spin_unlock(&pool_list->lock);
+                return true;
             }
-            pt_pool->cold_list = pt_pool->hot_list;
-            pt_pool->hot_list = NULL;
+            state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
         }
+        pt_pool->cold_list = pt_pool->hot_list;
+        pt_pool->hot_list = NULL;
    }
 
     (void)pthread_spin_unlock(&pool_list->lock);
 
-    return poisoned;
+    return false;
 }
 
-void
+static void
 free_obj_list(pooled_obj_hdr_t *victim)
 {
     pooled_obj_hdr_t *next;
@@ -425,82 +421,96 @@ free_obj_list(pooled_obj_hdr_t *victim)
     }
 }
 
-void *
+static void *
 pool_sweeper(void *arg)
 {
     sweep_state_t state;
     per_thread_pool_list_t *pool_list;
-    per_thread_pool_list_t *next_pl;
-    per_thread_pool_t *pt_pool;
-    unsigned int i;
-    gf_boolean_t poisoned;
+    uint32_t i;
+    bool pending;
 
     /*
      * This is all a bit inelegant, but the point is to avoid doing
      * expensive things (like freeing thousands of objects) while holding a
-     * global lock.  Thus, we split each iteration into three passes, with
+     * global lock.  Thus, we split each iteration into two passes, with
      * only the first and fastest holding the lock.
     */
 
+    pending = true;
+
    for (;;) {
-        sleep(POOL_SWEEP_SECS);
+        /* If we know there's pending work to do (or it's the first run), we
+         * collect garbage more often. */
+        sleep(pending ? POOL_SWEEP_SECS / 5 : POOL_SWEEP_SECS);
+
         (void)pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
-        INIT_LIST_HEAD(&state.death_row);
         state.n_cold_lists = 0;
+        pending = false;
 
         /* First pass: collect stuff that needs our attention. */
         (void)pthread_mutex_lock(&pool_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &pool_threads, thr_list)
+        list_for_each_entry(pool_list, &pool_threads, thr_list)
         {
-            (void)pthread_mutex_unlock(&pool_lock);
-            poisoned = collect_garbage(&state, pool_list);
-            (void)pthread_mutex_lock(&pool_lock);
-
-            if (poisoned) {
-                list_move(&pool_list->thr_list, &state.death_row);
+            if (collect_garbage(&state, pool_list)) {
+                pending = true;
            }
        }
         (void)pthread_mutex_unlock(&pool_lock);
 
-        /* Second pass: free dead pools. */
-        (void)pthread_mutex_lock(&pool_free_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &state.death_row, thr_list)
-        {
-            for (i = 0; i < NPOOLS; ++i) {
-                pt_pool = &pool_list->pools[i];
-                free_obj_list(pt_pool->cold_list);
-                free_obj_list(pt_pool->hot_list);
-                pt_pool->hot_list = pt_pool->cold_list = NULL;
-            }
-            list_del(&pool_list->thr_list);
-            list_add(&pool_list->thr_list, &pool_free_threads);
-        }
-        (void)pthread_mutex_unlock(&pool_free_lock);
-
-        /* Third pass: free cold objects from live pools. */
+        /* Second pass: free cold objects from live pools. */
         for (i = 0; i < state.n_cold_lists; ++i) {
             free_obj_list(state.cold_lists[i]);
         }
         (void)pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
     }
+
+    return NULL;
 }
 
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
-    per_thread_pool_list_t *pool_list = thread_pool_list;
-
-    /* The pool-sweeper thread will take it from here.
-     *
-     * We can change 'poison' here without taking locks because the change
-     * itself doesn't interact with other parts of the code and a simple write
-     * is already atomic from the point of view of the processor.
-     *
-     * This change can modify what mem_put() does, but both possibilities are
-     * fine until the sweeper thread kicks in. The real synchronization must be
-     * between mem_put() and the sweeper thread. */
+    per_thread_pool_t *pt_pool;
+    uint32_t i;
+
+    if (pool_list == NULL) {
+        pool_list = thread_pool_list;
+    }
+
+    /* The current thread is terminating. None of the allocated objects will
+     * be used again. We can directly destroy them here instead of delaying
+     * it until the next sweeper loop. */
     if (pool_list != NULL) {
-        pool_list->poison = 1;
+        /* Remove pool_list from the global list so that the sweeper
+         * cannot touch it. */
+        pthread_mutex_lock(&pool_lock);
+        list_del(&pool_list->thr_list);
+        pthread_mutex_unlock(&pool_lock);
+
+        /* We need to protect hot/cold changes from potential mem_put() calls
+         * that reference this pool_list. Once poison is set to true, we are
+         * sure that no one else will touch hot/cold lists. The only possible
+         * race is when at the same moment a mem_put() is adding a new item
+         * to the hot list. We protect from that by taking pool_list->lock.
+         * After that we don't need the lock to destroy the hot/cold lists. */
+        pthread_spin_lock(&pool_list->lock);
+        pool_list->poison = true;
+        pthread_spin_unlock(&pool_list->lock);
+
+        for (i = 0; i < NPOOLS; i++) {
+            pt_pool = &pool_list->pools[i];
+
+            free_obj_list(pt_pool->hot_list);
+            pt_pool->hot_list = NULL;
+
+            free_obj_list(pt_pool->cold_list);
+            pt_pool->cold_list = NULL;
+        }
+
+        pthread_mutex_lock(&pool_free_lock);
+        list_add(&pool_list->thr_list, &pool_free_threads);
+        pthread_mutex_unlock(&pool_free_lock);
+
         thread_pool_list = NULL;
     }
 }
@@ -528,6 +538,30 @@ mem_pools_preinit(void)
     init_done = GF_MEMPOOL_INIT_EARLY;
 }
 
+static __attribute__((destructor)) void
+mem_pools_postfini(void)
+{
+    per_thread_pool_list_t *pool_list, *next;
+
+    /* This is part of a process shutdown (or dlclose()) which means that
+     * most probably all threads should be stopped. However this is not the
+     * case for gluster and there are even legitimate situations in which we
+     * could have some threads alive. What is sure is that none of those
+     * threads should be using anything from this library, so destroying
+     * everything here should be fine and safe. */
+
+    list_for_each_entry_safe(pool_list, next, &pool_threads, thr_list)
+    {
+        mem_pool_thread_destructor(pool_list);
+    }
+
+    list_for_each_entry_safe(pool_list, next, &pool_free_threads, thr_list)
+    {
+        list_del(&pool_list->thr_list);
+        FREE(pool_list);
+    }
+}
+
 /* Call mem_pools_init() once threading has been configured completely. This
  * prevent the pool_sweeper thread from getting killed once the main() thread
  * exits during deamonizing. */
@@ -560,10 +594,6 @@ mem_pools_fini(void)
              */
             break;
         case 1: {
-            per_thread_pool_list_t *pool_list;
-            per_thread_pool_list_t *next_pl;
-            unsigned int i;
-
             /* if mem_pools_init() was not called, sweeper_tid will be invalid
              * and the functions will error out. That is not critical. In all
              * other cases, the sweeper_tid will be valid and the thread gets
@@ -571,32 +601,11 @@ mem_pools_fini(void)
             (void)pthread_cancel(sweeper_tid);
             (void)pthread_join(sweeper_tid, NULL);
 
-            /* At this point all threads should have already terminated, so
-             * it should be safe to destroy all pending per_thread_pool_list_t
-             * structures that are stored for each thread. */
-            mem_pool_thread_destructor();
-
-            /* free all objects from all pools */
-            list_for_each_entry_safe(pool_list, next_pl, &pool_threads,
-                                     thr_list)
-            {
-                for (i = 0; i < NPOOLS; ++i) {
-                    free_obj_list(pool_list->pools[i].hot_list);
-                    free_obj_list(pool_list->pools[i].cold_list);
-                    pool_list->pools[i].hot_list = NULL;
-                    pool_list->pools[i].cold_list = NULL;
-                }
-
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
-
-            list_for_each_entry_safe(pool_list, next_pl, &pool_free_threads,
-                                     thr_list)
-            {
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
+            /* There could still be threads running in some cases, so we
+             * can't destroy pool_lists that are in use. Nor can we destroy
+             * unused pool_lists, because some allocated objects may still
+             * be pointing to them. */
+            mem_pool_thread_destructor(NULL);
 
             init_done = GF_MEMPOOL_INIT_DESTROY;
             /* Fall through. */
@@ -617,7 +626,7 @@ mem_pools_fini(void)
 {
 }
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
 }
 
@@ -738,13 +747,21 @@ mem_get_pool_list(void)
         }
     }
 
+    /* There's no need to take pool_list->lock, because this is already an
+     * atomic operation and we don't need to synchronize it with any change
+     * in hot/cold lists. */
+    pool_list->poison = false;
+
     (void)pthread_mutex_lock(&pool_lock);
-    pool_list->poison = 0;
     list_add(&pool_list->thr_list, &pool_threads);
     (void)pthread_mutex_unlock(&pool_lock);
 
     thread_pool_list = pool_list;
 
+    /* Ensure that all memory objects associated with the new pool_list are
+     * destroyed when the thread terminates. */
+    gf_thread_needs_cleanup();
+
     return pool_list;
 }
 
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
index 2eb7b49..0de53c6 100644
--- a/libglusterfs/src/syncop.c
+++ b/libglusterfs/src/syncop.c
@@ -97,6 +97,13 @@ syncopctx_setfsgroups(int count, const void *groups)
 
     /* set/reset the ngrps, this is where reset of groups is handled */
     opctx->ngrps = count;
+
+    if ((opctx->valid & SYNCOPCTX_GROUPS) == 0) {
+        /* This is the first time we are storing groups into the TLS
+         * structure, so we mark the current thread to make sure it will be
+         * properly cleaned up when it terminates. */
+        gf_thread_needs_cleanup();
+    }
     opctx->valid |= SYNCOPCTX_GROUPS;
 
 out:
-- 
1.8.3.1