Blob Blame History Raw
From 75a9d946d252ce70460144615ca17dbdf2e80fab Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <xhernandez@redhat.com>
Date: Fri, 7 Feb 2020 10:19:57 +0100
Subject: [PATCH 354/355] core: fix memory pool management races

Objects allocated from a per-thread memory pool keep a reference to it
to be able to return the object to the pool when not used anymore. The
object holding this reference can have a long life cycle that could
survive a glfs_fini() call.

This means that it's unsafe to destroy memory pools from glfs_fini().

Another side effect of destroying memory pools from glfs_fini() is that
the TLS variable that points to one of those pools cannot be reset for
all alive threads.  This means that any attempt to allocate memory from
those threads will access already free'd memory, which is very
dangerous.

To fix these issues, mem_pools_fini() doesn't destroy pool lists
anymore. Only at process termination the pools are destroyed.

Upatream patch:
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/24099
> Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
> Fixes: bz#1801684
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>

Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
BUG: 1800703
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/192262
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 libglusterfs/src/globals.c            |  13 ++-
 libglusterfs/src/glusterfs/globals.h  |   3 +
 libglusterfs/src/glusterfs/mem-pool.h |  28 ++---
 libglusterfs/src/mem-pool.c           | 201 ++++++++++++++++++----------------
 libglusterfs/src/syncop.c             |   7 ++
 5 files changed, 146 insertions(+), 106 deletions(-)

diff --git a/libglusterfs/src/globals.c b/libglusterfs/src/globals.c
index 02098e6..e433ee8 100644
--- a/libglusterfs/src/globals.c
+++ b/libglusterfs/src/globals.c
@@ -319,7 +319,18 @@ glusterfs_cleanup(void *ptr)
         GF_FREE(thread_syncopctx.groups);
     }
 
-    mem_pool_thread_destructor();
+    mem_pool_thread_destructor(NULL);
+}
+
+void
+gf_thread_needs_cleanup(void)
+{
+    /* The value stored in free_key TLS is not really used for anything, but
+     * pthread implementation doesn't call the TLS destruction function unless
+     * it's != NULL. This function must be called whenever something is
+     * allocated for this thread so that glusterfs_cleanup() will be called
+     * and resources can be released. */
+    (void)pthread_setspecific(free_key, (void *)1);
 }
 
 static void
diff --git a/libglusterfs/src/glusterfs/globals.h b/libglusterfs/src/glusterfs/globals.h
index e218285..31717ed 100644
--- a/libglusterfs/src/glusterfs/globals.h
+++ b/libglusterfs/src/glusterfs/globals.h
@@ -181,6 +181,9 @@ glusterfs_leaseid_exist(void);
 int
 glusterfs_globals_init(glusterfs_ctx_t *ctx);
 
+void
+gf_thread_needs_cleanup(void);
+
 struct tvec_base *
 glusterfs_ctx_tw_get(glusterfs_ctx_t *ctx);
 void
diff --git a/libglusterfs/src/glusterfs/mem-pool.h b/libglusterfs/src/glusterfs/mem-pool.h
index be0a26d..97bf76c 100644
--- a/libglusterfs/src/glusterfs/mem-pool.h
+++ b/libglusterfs/src/glusterfs/mem-pool.h
@@ -245,24 +245,26 @@ typedef struct per_thread_pool {
 } per_thread_pool_t;
 
 typedef struct per_thread_pool_list {
-    /*
-     * These first two members are protected by the global pool lock.  When
-     * a thread first tries to use any pool, we create one of these.  We
-     * link it into the global list using thr_list so the pool-sweeper
-     * thread can find it, and use pthread_setspecific so this thread can
-     * find it.  When the per-thread destructor runs, we "poison" the pool
-     * list to prevent further allocations.  This also signals to the
-     * pool-sweeper thread that the list should be detached and freed after
-     * the next time it's swept.
-     */
+    /* thr_list is used to place the TLS pool_list into the active global list
+     * (pool_threads) or the inactive global list (pool_free_threads). It's
+     * protected by the global pool_lock. */
     struct list_head thr_list;
-    unsigned int poison;
+
+    /* This lock is used to update poison and the hot/cold lists of members
+     * of 'pools' array. */
+    pthread_spinlock_t lock;
+
+    /* This field is used to mark a pool_list as not being owned by any thread.
+     * This means that the sweeper thread won't be cleaning objects stored in
+     * its pools. mem_put() uses it to decide if the object being released is
+     * placed into its original pool_list or directly destroyed. */
+    bool poison;
+
     /*
      * There's really more than one pool, but the actual number is hidden
      * in the implementation code so we just make it a single-element array
      * here.
      */
-    pthread_spinlock_t lock;
     per_thread_pool_t pools[1];
 } per_thread_pool_list_t;
 
@@ -307,7 +309,7 @@ void
 mem_pool_destroy(struct mem_pool *pool);
 
 void
-mem_pool_thread_destructor(void);
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list);
 
 void
 gf_mem_acct_enable_set(void *ctx);
diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c
index d88041d..2b41c01 100644
--- a/libglusterfs/src/mem-pool.c
+++ b/libglusterfs/src/mem-pool.c
@@ -367,7 +367,6 @@ static __thread per_thread_pool_list_t *thread_pool_list = NULL;
 #define POOL_SWEEP_SECS 30
 
 typedef struct {
-    struct list_head death_row;
     pooled_obj_hdr_t *cold_lists[N_COLD_LISTS];
     unsigned int n_cold_lists;
 } sweep_state_t;
@@ -384,36 +383,33 @@ static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
 static unsigned int init_count = 0;
 static pthread_t sweeper_tid;
 
-gf_boolean_t
+static bool
 collect_garbage(sweep_state_t *state, per_thread_pool_list_t *pool_list)
 {
     unsigned int i;
     per_thread_pool_t *pt_pool;
-    gf_boolean_t poisoned;
 
     (void)pthread_spin_lock(&pool_list->lock);
 
-    poisoned = pool_list->poison != 0;
-    if (!poisoned) {
-        for (i = 0; i < NPOOLS; ++i) {
-            pt_pool = &pool_list->pools[i];
-            if (pt_pool->cold_list) {
-                if (state->n_cold_lists >= N_COLD_LISTS) {
-                    break;
-                }
-                state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
+    for (i = 0; i < NPOOLS; ++i) {
+        pt_pool = &pool_list->pools[i];
+        if (pt_pool->cold_list) {
+            if (state->n_cold_lists >= N_COLD_LISTS) {
+                (void)pthread_spin_unlock(&pool_list->lock);
+                return true;
             }
-            pt_pool->cold_list = pt_pool->hot_list;
-            pt_pool->hot_list = NULL;
+            state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
         }
+        pt_pool->cold_list = pt_pool->hot_list;
+        pt_pool->hot_list = NULL;
     }
 
     (void)pthread_spin_unlock(&pool_list->lock);
 
-    return poisoned;
+    return false;
 }
 
-void
+static void
 free_obj_list(pooled_obj_hdr_t *victim)
 {
     pooled_obj_hdr_t *next;
@@ -425,82 +421,96 @@ free_obj_list(pooled_obj_hdr_t *victim)
     }
 }
 
-void *
+static void *
 pool_sweeper(void *arg)
 {
     sweep_state_t state;
     per_thread_pool_list_t *pool_list;
-    per_thread_pool_list_t *next_pl;
-    per_thread_pool_t *pt_pool;
-    unsigned int i;
-    gf_boolean_t poisoned;
+    uint32_t i;
+    bool pending;
 
     /*
      * This is all a bit inelegant, but the point is to avoid doing
      * expensive things (like freeing thousands of objects) while holding a
-     * global lock.  Thus, we split each iteration into three passes, with
+     * global lock.  Thus, we split each iteration into two passes, with
      * only the first and fastest holding the lock.
      */
 
+    pending = true;
+
     for (;;) {
-        sleep(POOL_SWEEP_SECS);
+        /* If we know there's pending work to do (or it's the first run), we
+         * do collect garbage more often. */
+        sleep(pending ? POOL_SWEEP_SECS / 5 : POOL_SWEEP_SECS);
+
         (void)pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
-        INIT_LIST_HEAD(&state.death_row);
         state.n_cold_lists = 0;
+        pending = false;
 
         /* First pass: collect stuff that needs our attention. */
         (void)pthread_mutex_lock(&pool_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &pool_threads, thr_list)
+        list_for_each_entry(pool_list, &pool_threads, thr_list)
         {
-            (void)pthread_mutex_unlock(&pool_lock);
-            poisoned = collect_garbage(&state, pool_list);
-            (void)pthread_mutex_lock(&pool_lock);
-
-            if (poisoned) {
-                list_move(&pool_list->thr_list, &state.death_row);
+            if (collect_garbage(&state, pool_list)) {
+                pending = true;
             }
         }
         (void)pthread_mutex_unlock(&pool_lock);
 
-        /* Second pass: free dead pools. */
-        (void)pthread_mutex_lock(&pool_free_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &state.death_row, thr_list)
-        {
-            for (i = 0; i < NPOOLS; ++i) {
-                pt_pool = &pool_list->pools[i];
-                free_obj_list(pt_pool->cold_list);
-                free_obj_list(pt_pool->hot_list);
-                pt_pool->hot_list = pt_pool->cold_list = NULL;
-            }
-            list_del(&pool_list->thr_list);
-            list_add(&pool_list->thr_list, &pool_free_threads);
-        }
-        (void)pthread_mutex_unlock(&pool_free_lock);
-
-        /* Third pass: free cold objects from live pools. */
+        /* Second pass: free cold objects from live pools. */
         for (i = 0; i < state.n_cold_lists; ++i) {
             free_obj_list(state.cold_lists[i]);
         }
         (void)pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
     }
+
+    return NULL;
 }
 
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
-    per_thread_pool_list_t *pool_list = thread_pool_list;
-
-    /* The pool-sweeper thread will take it from here.
-     *
-     * We can change 'poison' here without taking locks because the change
-     * itself doesn't interact with other parts of the code and a simple write
-     * is already atomic from the point of view of the processor.
-     *
-     * This change can modify what mem_put() does, but both possibilities are
-     * fine until the sweeper thread kicks in. The real synchronization must be
-     * between mem_put() and the sweeper thread. */
+    per_thread_pool_t *pt_pool;
+    uint32_t i;
+
+    if (pool_list == NULL) {
+        pool_list = thread_pool_list;
+    }
+
+    /* The current thread is terminating. None of the allocated objects will
+     * be used again. We can directly destroy them here instead of delaying
+     * it until the next sweeper loop. */
     if (pool_list != NULL) {
-        pool_list->poison = 1;
+        /* Remove pool_list from the global list to avoid that sweeper
+         * could touch it. */
+        pthread_mutex_lock(&pool_lock);
+        list_del(&pool_list->thr_list);
+        pthread_mutex_unlock(&pool_lock);
+
+        /* We need to protect hot/cold changes from potential mem_put() calls
+         * that reference this pool_list. Once poison is set to true, we are
+         * sure that no one else will touch hot/cold lists. The only possible
+         * race is when at the same moment a mem_put() is adding a new item
+         * to the hot list. We protect from that by taking pool_list->lock.
+         * After that we don't need the lock to destroy the hot/cold lists. */
+        pthread_spin_lock(&pool_list->lock);
+        pool_list->poison = true;
+        pthread_spin_unlock(&pool_list->lock);
+
+        for (i = 0; i < NPOOLS; i++) {
+            pt_pool = &pool_list->pools[i];
+
+            free_obj_list(pt_pool->hot_list);
+            pt_pool->hot_list = NULL;
+
+            free_obj_list(pt_pool->cold_list);
+            pt_pool->cold_list = NULL;
+        }
+
+        pthread_mutex_lock(&pool_free_lock);
+        list_add(&pool_list->thr_list, &pool_free_threads);
+        pthread_mutex_unlock(&pool_free_lock);
+
         thread_pool_list = NULL;
     }
 }
@@ -528,6 +538,30 @@ mem_pools_preinit(void)
     init_done = GF_MEMPOOL_INIT_EARLY;
 }
 
+static __attribute__((destructor)) void
+mem_pools_postfini(void)
+{
+    per_thread_pool_list_t *pool_list, *next;
+
+    /* This is part of a process shutdown (or dlclose()) which means that
+     * most probably all threads should be stopped. However this is not the
+     * case for gluster and there are even legitimate situations in which we
+     * could have some threads alive. What is sure is that none of those
+     * threads should be using anything from this library, so destroying
+     * everything here should be fine and safe. */
+
+    list_for_each_entry_safe(pool_list, next, &pool_threads, thr_list)
+    {
+        mem_pool_thread_destructor(pool_list);
+    }
+
+    list_for_each_entry_safe(pool_list, next, &pool_free_threads, thr_list)
+    {
+        list_del(&pool_list->thr_list);
+        FREE(pool_list);
+    }
+}
+
 /* Call mem_pools_init() once threading has been configured completely. This
  * prevent the pool_sweeper thread from getting killed once the main() thread
  * exits during deamonizing. */
@@ -560,10 +594,6 @@ mem_pools_fini(void)
              */
             break;
         case 1: {
-            per_thread_pool_list_t *pool_list;
-            per_thread_pool_list_t *next_pl;
-            unsigned int i;
-
             /* if mem_pools_init() was not called, sweeper_tid will be invalid
              * and the functions will error out. That is not critical. In all
              * other cases, the sweeper_tid will be valid and the thread gets
@@ -571,32 +601,11 @@ mem_pools_fini(void)
             (void)pthread_cancel(sweeper_tid);
             (void)pthread_join(sweeper_tid, NULL);
 
-            /* At this point all threads should have already terminated, so
-             * it should be safe to destroy all pending per_thread_pool_list_t
-             * structures that are stored for each thread. */
-            mem_pool_thread_destructor();
-
-            /* free all objects from all pools */
-            list_for_each_entry_safe(pool_list, next_pl, &pool_threads,
-                                     thr_list)
-            {
-                for (i = 0; i < NPOOLS; ++i) {
-                    free_obj_list(pool_list->pools[i].hot_list);
-                    free_obj_list(pool_list->pools[i].cold_list);
-                    pool_list->pools[i].hot_list = NULL;
-                    pool_list->pools[i].cold_list = NULL;
-                }
-
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
-
-            list_for_each_entry_safe(pool_list, next_pl, &pool_free_threads,
-                                     thr_list)
-            {
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
+            /* There could be threads still running in some cases, so we can't
+             * destroy pool_lists in use. We can also not destroy unused
+             * pool_lists because some allocated objects may still be pointing
+             * to them. */
+            mem_pool_thread_destructor(NULL);
 
             init_done = GF_MEMPOOL_INIT_DESTROY;
             /* Fall through. */
@@ -617,7 +626,7 @@ mem_pools_fini(void)
 {
 }
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
 }
 
@@ -738,13 +747,21 @@ mem_get_pool_list(void)
         }
     }
 
+    /* There's no need to take pool_list->lock, because this is already an
+     * atomic operation and we don't need to synchronize it with any change
+     * in hot/cold lists. */
+    pool_list->poison = false;
+
     (void)pthread_mutex_lock(&pool_lock);
-    pool_list->poison = 0;
     list_add(&pool_list->thr_list, &pool_threads);
     (void)pthread_mutex_unlock(&pool_lock);
 
     thread_pool_list = pool_list;
 
+    /* Ensure that all memory objects associated to the new pool_list are
+     * destroyed when the thread terminates. */
+    gf_thread_needs_cleanup();
+
     return pool_list;
 }
 
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
index 2eb7b49..0de53c6 100644
--- a/libglusterfs/src/syncop.c
+++ b/libglusterfs/src/syncop.c
@@ -97,6 +97,13 @@ syncopctx_setfsgroups(int count, const void *groups)
 
     /* set/reset the ngrps, this is where reset of groups is handled */
     opctx->ngrps = count;
+
+    if ((opctx->valid & SYNCOPCTX_GROUPS) == 0) {
+        /* This is the first time we are storing groups into the TLS structure
+         * so we mark the current thread so that it will be properly cleaned
+         * up when the thread terminates. */
+        gf_thread_needs_cleanup();
+    }
     opctx->valid |= SYNCOPCTX_GROUPS;
 
 out:
-- 
1.8.3.1