From 75a9d946d252ce70460144615ca17dbdf2e80fab Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <xhernandez@redhat.com>
Date: Fri, 7 Feb 2020 10:19:57 +0100
Subject: [PATCH 354/355] core: fix memory pool management races

Objects allocated from a per-thread memory pool keep a reference to it
so that the object can be returned to the pool once it's no longer
used. The object holding this reference can have a long life cycle
that could survive a glfs_fini() call.

This means that it's unsafe to destroy memory pools from glfs_fini().

Another side effect of destroying memory pools from glfs_fini() is that
the TLS variable that points to one of those pools cannot be reset for
all live threads. This means that any attempt to allocate memory from
those threads will access already freed memory, which is very
dangerous.

To fix these issues, mem_pools_fini() doesn't destroy pool lists
anymore. The pools are only destroyed at process termination.
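
The approach is roughly the following (a minimal standalone sketch of
the pattern, not the actual GlusterFS code; the names pool_list_t,
free_pools, cleanup_key, my_pools and the helper functions are
illustrative only): a pthread TLS destructor parks the terminating
thread's pool list on a global free list instead of freeing it, and
the lists are only really freed by a process-exit destructor. This
mirrors what mem_pool_thread_destructor() and mem_pools_postfini() do
in the diff below.

    /* Sketch only; build with: gcc -pthread sketch.c */
    #include <pthread.h>
    #include <stdlib.h>

    typedef struct pool_list {
        struct pool_list *next;
    } pool_list_t;

    static pthread_key_t cleanup_key;
    static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;
    static pool_list_t *free_pools = NULL; /* kept until process exit */
    static __thread pool_list_t *my_pools = NULL;

    /* TLS destructor: when a thread exits, its pool list is parked on a
     * global free list instead of being freed, so objects that still
     * reference it remain valid. */
    static void
    thread_cleanup(void *unused)
    {
        (void)unused;
        if (my_pools != NULL) {
            pthread_mutex_lock(&free_lock);
            my_pools->next = free_pools;
            free_pools = my_pools;
            pthread_mutex_unlock(&free_lock);
            my_pools = NULL;
        }
    }

    /* Process-exit destructor: only here are the pool lists actually
     * freed, once no thread can be using them anymore. */
    static __attribute__((destructor)) void
    pools_postfini(void)
    {
        pool_list_t *p;

        while ((p = free_pools) != NULL) {
            free_pools = p->next;
            free(p);
        }
    }

    static pool_list_t *
    get_pools(void)
    {
        if (my_pools == NULL) {
            my_pools = calloc(1, sizeof(*my_pools));
            /* pthread only runs the TLS destructor if the value stored
             * for the key is not NULL, so mark this thread as needing
             * cleanup. */
            (void)pthread_setspecific(cleanup_key, (void *)1);
        }
        return my_pools;
    }

    int
    main(void)
    {
        (void)pthread_key_create(&cleanup_key, thread_cleanup);
        (void)get_pools();    /* this thread allocates its pool list */
        thread_cleanup(NULL); /* ...and parks it, as glfs_fini() would */
        return 0;
    }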

Upstream patch:
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/24099
> Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
> Fixes: bz#1801684
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>

Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
BUG: 1800703
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/192262
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 libglusterfs/src/globals.c            |  13 ++-
 libglusterfs/src/glusterfs/globals.h  |   3 +
 libglusterfs/src/glusterfs/mem-pool.h |  28 ++---
 libglusterfs/src/mem-pool.c           | 201 ++++++++++++++++++----------------
 libglusterfs/src/syncop.c             |   7 ++
 5 files changed, 146 insertions(+), 106 deletions(-)

diff --git a/libglusterfs/src/globals.c b/libglusterfs/src/globals.c
index 02098e6..e433ee8 100644
--- a/libglusterfs/src/globals.c
+++ b/libglusterfs/src/globals.c
@@ -319,7 +319,18 @@ glusterfs_cleanup(void *ptr)
         GF_FREE(thread_syncopctx.groups);
     }
 
-    mem_pool_thread_destructor();
+    mem_pool_thread_destructor(NULL);
+}
+
+void
+gf_thread_needs_cleanup(void)
+{
+    /* The value stored in free_key TLS is not really used for anything, but
+     * pthread implementation doesn't call the TLS destruction function unless
+     * it's != NULL. This function must be called whenever something is
+     * allocated for this thread so that glusterfs_cleanup() will be called
+     * and resources can be released. */
+    (void)pthread_setspecific(free_key, (void *)1);
 }
 
 static void
diff --git a/libglusterfs/src/glusterfs/globals.h b/libglusterfs/src/glusterfs/globals.h
index e218285..31717ed 100644
--- a/libglusterfs/src/glusterfs/globals.h
+++ b/libglusterfs/src/glusterfs/globals.h
@@ -181,6 +181,9 @@ glusterfs_leaseid_exist(void);
 int
 glusterfs_globals_init(glusterfs_ctx_t *ctx);
 
+void
+gf_thread_needs_cleanup(void);
+
 struct tvec_base *
 glusterfs_ctx_tw_get(glusterfs_ctx_t *ctx);
 void
diff --git a/libglusterfs/src/glusterfs/mem-pool.h b/libglusterfs/src/glusterfs/mem-pool.h
index be0a26d..97bf76c 100644
--- a/libglusterfs/src/glusterfs/mem-pool.h
+++ b/libglusterfs/src/glusterfs/mem-pool.h
@@ -245,24 +245,26 @@ typedef struct per_thread_pool {
 } per_thread_pool_t;
 
 typedef struct per_thread_pool_list {
-    /*
-     * These first two members are protected by the global pool lock.  When
-     * a thread first tries to use any pool, we create one of these.  We
-     * link it into the global list using thr_list so the pool-sweeper
-     * thread can find it, and use pthread_setspecific so this thread can
-     * find it.  When the per-thread destructor runs, we "poison" the pool
-     * list to prevent further allocations.  This also signals to the
-     * pool-sweeper thread that the list should be detached and freed after
-     * the next time it's swept.
-     */
+    /* thr_list is used to place the TLS pool_list into the active global list
+     * (pool_threads) or the inactive global list (pool_free_threads). It's
+     * protected by the global pool_lock. */
     struct list_head thr_list;
-    unsigned int poison;
+
+    /* This lock is used to update poison and the hot/cold lists of members
+     * of 'pools' array. */
+    pthread_spinlock_t lock;
+
+    /* This field is used to mark a pool_list as not being owned by any thread.
+     * This means that the sweeper thread won't be cleaning objects stored in
+     * its pools. mem_put() uses it to decide if the object being released is
+     * placed into its original pool_list or directly destroyed. */
+    bool poison;
+
     /*
      * There's really more than one pool, but the actual number is hidden
      * in the implementation code so we just make it a single-element array
      * here.
      */
-    pthread_spinlock_t lock;
     per_thread_pool_t pools[1];
 } per_thread_pool_list_t;
 
@@ -307,7 +309,7 @@ void
 mem_pool_destroy(struct mem_pool *pool);
 
 void
-mem_pool_thread_destructor(void);
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list);
 
 void
 gf_mem_acct_enable_set(void *ctx);
diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c
index d88041d..2b41c01 100644
--- a/libglusterfs/src/mem-pool.c
+++ b/libglusterfs/src/mem-pool.c
@@ -367,7 +367,6 @@ static __thread per_thread_pool_list_t *thread_pool_list = NULL;
 #define POOL_SWEEP_SECS 30
 
 typedef struct {
-    struct list_head death_row;
     pooled_obj_hdr_t *cold_lists[N_COLD_LISTS];
     unsigned int n_cold_lists;
 } sweep_state_t;
@@ -384,36 +383,33 @@ static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
 static unsigned int init_count = 0;
 static pthread_t sweeper_tid;
 
-gf_boolean_t
+static bool
 collect_garbage(sweep_state_t *state, per_thread_pool_list_t *pool_list)
 {
     unsigned int i;
     per_thread_pool_t *pt_pool;
-    gf_boolean_t poisoned;
 
     (void)pthread_spin_lock(&pool_list->lock);
 
-    poisoned = pool_list->poison != 0;
-    if (!poisoned) {
-        for (i = 0; i < NPOOLS; ++i) {
-            pt_pool = &pool_list->pools[i];
-            if (pt_pool->cold_list) {
-                if (state->n_cold_lists >= N_COLD_LISTS) {
-                    break;
-                }
-                state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
+    for (i = 0; i < NPOOLS; ++i) {
+        pt_pool = &pool_list->pools[i];
+        if (pt_pool->cold_list) {
+            if (state->n_cold_lists >= N_COLD_LISTS) {
+                (void)pthread_spin_unlock(&pool_list->lock);
+                return true;
             }
-            pt_pool->cold_list = pt_pool->hot_list;
-            pt_pool->hot_list = NULL;
+            state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
         }
+        pt_pool->cold_list = pt_pool->hot_list;
+        pt_pool->hot_list = NULL;
     }
 
     (void)pthread_spin_unlock(&pool_list->lock);
 
-    return poisoned;
+    return false;
 }
 
-void
+static void
 free_obj_list(pooled_obj_hdr_t *victim)
 {
     pooled_obj_hdr_t *next;
@@ -425,82 +421,96 @@ free_obj_list(pooled_obj_hdr_t *victim)
     }
 }
 
-void *
+static void *
 pool_sweeper(void *arg)
 {
     sweep_state_t state;
     per_thread_pool_list_t *pool_list;
-    per_thread_pool_list_t *next_pl;
-    per_thread_pool_t *pt_pool;
-    unsigned int i;
-    gf_boolean_t poisoned;
+    uint32_t i;
+    bool pending;
 
     /*
      * This is all a bit inelegant, but the point is to avoid doing
      * expensive things (like freeing thousands of objects) while holding a
-     * global lock.  Thus, we split each iteration into three passes, with
+     * global lock.  Thus, we split each iteration into two passes, with
      * only the first and fastest holding the lock.
      */
 
+    pending = true;
+
     for (;;) {
-        sleep(POOL_SWEEP_SECS);
+        /* If we know there's pending work to do (or it's the first run), we
+         * do collect garbage more often. */
+        sleep(pending ? POOL_SWEEP_SECS / 5 : POOL_SWEEP_SECS);
+
         (void)pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
-        INIT_LIST_HEAD(&state.death_row);
         state.n_cold_lists = 0;
+        pending = false;
 
         /* First pass: collect stuff that needs our attention. */
         (void)pthread_mutex_lock(&pool_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &pool_threads, thr_list)
+        list_for_each_entry(pool_list, &pool_threads, thr_list)
         {
-            (void)pthread_mutex_unlock(&pool_lock);
-            poisoned = collect_garbage(&state, pool_list);
-            (void)pthread_mutex_lock(&pool_lock);
-
-            if (poisoned) {
-                list_move(&pool_list->thr_list, &state.death_row);
+            if (collect_garbage(&state, pool_list)) {
+                pending = true;
             }
         }
         (void)pthread_mutex_unlock(&pool_lock);
 
-        /* Second pass: free dead pools. */
-        (void)pthread_mutex_lock(&pool_free_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &state.death_row, thr_list)
-        {
-            for (i = 0; i < NPOOLS; ++i) {
-                pt_pool = &pool_list->pools[i];
-                free_obj_list(pt_pool->cold_list);
-                free_obj_list(pt_pool->hot_list);
-                pt_pool->hot_list = pt_pool->cold_list = NULL;
-            }
-            list_del(&pool_list->thr_list);
-            list_add(&pool_list->thr_list, &pool_free_threads);
-        }
-        (void)pthread_mutex_unlock(&pool_free_lock);
-
-        /* Third pass: free cold objects from live pools. */
+        /* Second pass: free cold objects from live pools. */
         for (i = 0; i < state.n_cold_lists; ++i) {
             free_obj_list(state.cold_lists[i]);
         }
         (void)pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
     }
+
+    return NULL;
 }
 
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
-    per_thread_pool_list_t *pool_list = thread_pool_list;
-
-    /* The pool-sweeper thread will take it from here.
-     *
-     * We can change 'poison' here without taking locks because the change
-     * itself doesn't interact with other parts of the code and a simple write
-     * is already atomic from the point of view of the processor.
-     *
-     * This change can modify what mem_put() does, but both possibilities are
-     * fine until the sweeper thread kicks in. The real synchronization must be
-     * between mem_put() and the sweeper thread. */
+    per_thread_pool_t *pt_pool;
+    uint32_t i;
+
+    if (pool_list == NULL) {
+        pool_list = thread_pool_list;
+    }
+
+    /* The current thread is terminating. None of the allocated objects will
+     * be used again. We can directly destroy them here instead of delaying
+     * it until the next sweeper loop. */
     if (pool_list != NULL) {
-        pool_list->poison = 1;
+        /* Remove pool_list from the global list to avoid that sweeper
+         * could touch it. */
+        pthread_mutex_lock(&pool_lock);
+        list_del(&pool_list->thr_list);
+        pthread_mutex_unlock(&pool_lock);
+
+        /* We need to protect hot/cold changes from potential mem_put() calls
+         * that reference this pool_list. Once poison is set to true, we are
+         * sure that no one else will touch hot/cold lists. The only possible
+         * race is when at the same moment a mem_put() is adding a new item
+         * to the hot list. We protect from that by taking pool_list->lock.
+         * After that we don't need the lock to destroy the hot/cold lists. */
+        pthread_spin_lock(&pool_list->lock);
+        pool_list->poison = true;
+        pthread_spin_unlock(&pool_list->lock);
+
+        for (i = 0; i < NPOOLS; i++) {
+            pt_pool = &pool_list->pools[i];
+
+            free_obj_list(pt_pool->hot_list);
+            pt_pool->hot_list = NULL;
+
+            free_obj_list(pt_pool->cold_list);
+            pt_pool->cold_list = NULL;
+        }
+
+        pthread_mutex_lock(&pool_free_lock);
+        list_add(&pool_list->thr_list, &pool_free_threads);
+        pthread_mutex_unlock(&pool_free_lock);
+
         thread_pool_list = NULL;
     }
 }
@@ -528,6 +538,30 @@ mem_pools_preinit(void)
     init_done = GF_MEMPOOL_INIT_EARLY;
 }
 
+static __attribute__((destructor)) void
+mem_pools_postfini(void)
+{
+    per_thread_pool_list_t *pool_list, *next;
+
+    /* This is part of a process shutdown (or dlclose()) which means that
+     * most probably all threads should be stopped. However this is not the
+     * case for gluster and there are even legitimate situations in which we
+     * could have some threads alive. What is sure is that none of those
+     * threads should be using anything from this library, so destroying
+     * everything here should be fine and safe. */
+
+    list_for_each_entry_safe(pool_list, next, &pool_threads, thr_list)
+    {
+        mem_pool_thread_destructor(pool_list);
+    }
+
+    list_for_each_entry_safe(pool_list, next, &pool_free_threads, thr_list)
+    {
+        list_del(&pool_list->thr_list);
+        FREE(pool_list);
+    }
+}
+
 /* Call mem_pools_init() once threading has been configured completely. This
  * prevent the pool_sweeper thread from getting killed once the main() thread
  * exits during deamonizing. */
@@ -560,10 +594,6 @@ mem_pools_fini(void)
              */
             break;
         case 1: {
-            per_thread_pool_list_t *pool_list;
-            per_thread_pool_list_t *next_pl;
-            unsigned int i;
-
             /* if mem_pools_init() was not called, sweeper_tid will be invalid
              * and the functions will error out. That is not critical. In all
             * other cases, the sweeper_tid will be valid and the thread gets
@@ -571,32 +601,11 @@ mem_pools_fini(void)
             (void)pthread_cancel(sweeper_tid);
             (void)pthread_join(sweeper_tid, NULL);
 
-            /* At this point all threads should have already terminated, so
-             * it should be safe to destroy all pending per_thread_pool_list_t
-             * structures that are stored for each thread. */
-            mem_pool_thread_destructor();
-
-            /* free all objects from all pools */
-            list_for_each_entry_safe(pool_list, next_pl, &pool_threads,
-                                     thr_list)
-            {
-                for (i = 0; i < NPOOLS; ++i) {
-                    free_obj_list(pool_list->pools[i].hot_list);
-                    free_obj_list(pool_list->pools[i].cold_list);
-                    pool_list->pools[i].hot_list = NULL;
-                    pool_list->pools[i].cold_list = NULL;
-                }
-
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
-
-            list_for_each_entry_safe(pool_list, next_pl, &pool_free_threads,
-                                     thr_list)
-            {
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
+            /* There could be threads still running in some cases, so we can't
+             * destroy pool_lists in use. We can also not destroy unused
+             * pool_lists because some allocated objects may still be pointing
+             * to them. */
+            mem_pool_thread_destructor(NULL);
 
             init_done = GF_MEMPOOL_INIT_DESTROY;
             /* Fall through. */
@@ -617,7 +626,7 @@ mem_pools_fini(void)
 {
 }
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
 }
 
@@ -738,13 +747,21 @@ mem_get_pool_list(void)
         }
     }
 
+    /* There's no need to take pool_list->lock, because this is already an
+     * atomic operation and we don't need to synchronize it with any change
+     * in hot/cold lists. */
+    pool_list->poison = false;
+
     (void)pthread_mutex_lock(&pool_lock);
-    pool_list->poison = 0;
     list_add(&pool_list->thr_list, &pool_threads);
     (void)pthread_mutex_unlock(&pool_lock);
 
     thread_pool_list = pool_list;
 
+    /* Ensure that all memory objects associated to the new pool_list are
+     * destroyed when the thread terminates. */
+    gf_thread_needs_cleanup();
+
     return pool_list;
 }
 
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
index 2eb7b49..0de53c6 100644
--- a/libglusterfs/src/syncop.c
+++ b/libglusterfs/src/syncop.c
@@ -97,6 +97,13 @@ syncopctx_setfsgroups(int count, const void *groups)
 
     /* set/reset the ngrps, this is where reset of groups is handled */
     opctx->ngrps = count;
+
+    if ((opctx->valid & SYNCOPCTX_GROUPS) == 0) {
+        /* This is the first time we are storing groups into the TLS structure
+         * so we mark the current thread so that it will be properly cleaned
+         * up when the thread terminates. */
+        gf_thread_needs_cleanup();
+    }
     opctx->valid |= SYNCOPCTX_GROUPS;
 
 out:
-- 
1.8.3.1