From 24fa9d19e113ab8dd219aaa270451921d513a0d8 Mon Sep 17 00:00:00 2001
From: Jeff Darcy <jdarcy@redhat.com>
Date: Fri, 14 Oct 2016 10:04:07 -0400
Subject: [PATCH 322/361] libglusterfs: make memory pools more thread-friendly

Early multiplexing tests revealed *massive* contention on certain
pools' global locks - especially for dictionaries and secondarily for
call stubs.  For the thread counts that multiplexing can create, a
more lock-free solution is clearly needed.  Also, the current mem-pool
implementation does a poor job releasing memory back to the system,
artificially inflating memory usage to match whatever the worst case
was since the process started.  This is bad in general, but especially
so for multiplexing where there are more pools and a major point of
the whole exercise is to reduce memory consumption.
The basic ideas for the new design are these:

  There is one pool, globally, for each power-of-two size range.
  Every attempt to create a new pool within this range will instead
  add a reference to the existing pool.

  Instead of adding pools for each translator within each multiplexed
  brick (potentially infinite and quite possibly thousands), we
  allocate one set of size-based pools per *thread* (hundreds at
  worst).

  Each per-thread pool is divided into hot and cold lists.  Every
  allocation first attempts to use the hot list, then the cold list.
  When objects are freed, they always go on the hot list.

  There is one global "pool sweeper" thread, which periodically
  reclaims everything in each pool's cold list and then "demotes" the
  current hot list to be the new cold list.

  For normal allocation activity, only a per-thread lock need be
  taken, and even that only to guard against very rare contention from
  the pool sweeper.  When threads start and stop, a global lock must
  be taken to add them to the pool sweeper's list.  Lock contention is
  therefore extremely low, and the hot/cold lists also provide good
  locality.
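
To make the hot/cold split concrete, the per-thread allocation fast
path amounts to roughly the following (a simplified sketch of
mem_get_from_pool() from the new mem-pool.c below; the statistics
counters are omitted and the caller is assumed to hold the per-thread
spinlock):

    static pooled_obj_hdr_t *
    mem_get_from_pool (per_thread_pool_t *pt_pool)
    {
            pooled_obj_hdr_t *obj;

            /* Hot list first: objects freed on this thread since the
             * last sweep. */
            obj = pt_pool->hot_list;
            if (obj) {
                    pt_pool->hot_list = obj->next;
                    return obj;
            }

            /* Cold list next: objects that survived the previous sweep
             * and will be released to the system on the next one. */
            obj = pt_pool->cold_list;
            if (obj) {
                    pt_pool->cold_list = obj->next;
                    return obj;
            }

            /* Both lists empty: fall back to the system allocator. */
            return malloc (1 << pt_pool->parent->power_of_two);
    }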

A more complete explanation (of a similar earlier design) can be found
here:

 http://www.gluster.org/pipermail/gluster-devel/2016-October/051160.html

mainline:
> BUG: 1385758
> Reviewed-on: https://review.gluster.org/15645
> Smoke: Gluster Build System <jenkins@build.gluster.org>
> NetBSD-regression: NetBSD Build System <jenkins@build.gluster.org>
> CentOS-regression: Gluster Build System <jenkins@build.gluster.org>
> Reviewed-by: Xavier Hernandez <xhernandez@datalab.es>
> Reviewed-by: Shyamsundar Ranganathan <srangana@redhat.com>
(cherry picked from commit 2d539668aa608ba885e5a4e1aed5e188f83f4a2f)

BUG: 1417815
Change-Id: I5bc8a1ba57cfb553998f979a498886e0d006e665
Signed-off-by: Jeff Darcy <jdarcy@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/101303
Tested-by: Milind Changire <mchangir@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 glusterfsd/src/glusterfsd.c     |   7 +
 libglusterfs/src/mem-pool.c     | 515 ++++++++++++++++++++++++----------------
 libglusterfs/src/mem-pool.h     |  67 ++++--
 libglusterfs/src/statedump.c    |   6 +-
 tests/basic/quota-anon-fd-nfs.t |   9 +
 5 files changed, 377 insertions(+), 227 deletions(-)

diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c
index e16c943..485da4e 100644
--- a/glusterfsd/src/glusterfsd.c
+++ b/glusterfsd/src/glusterfsd.c
@@ -2429,6 +2429,13 @@ main (int argc, char *argv[])
         if (ret)
                 goto out;
 
+        /*
+         * If we do this before daemonize, the pool-sweeper thread dies with
+         * the parent, but we want to do it as soon as possible after that in
+         * case something else depends on pool allocations.
+         */
+        mem_pools_init ();
+
 #ifdef GF_LINUX_HOST_OS
 #ifdef HAVE_LINUX_OOM_H
         ret = set_oom_score_adj (ctx);
diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c
index 88fbdf5..2b3208c 100644
--- a/libglusterfs/src/mem-pool.c
+++ b/libglusterfs/src/mem-pool.c
@@ -338,8 +338,20 @@ free:
 }
 
 
-/* Based on the mem-type that is used for the allocation, GF_FREE can be
+/*
+ * Based on the mem-type that is used for the allocation, GF_FREE can be
  * called, or something more intelligent for the structure can be done.
+ *
+ * NOTE: this will not work for allocations from a memory pool.  It never did,
+ * because those allocations never set the type in the first place.  Any caller
+ * that relies on knowing whether a particular type was allocated via a pool or
+ * not is *BROKEN*, or will be any time either this module or the module
+ * "owning" the type changes.  The proper way to handle this, assuming the
+ * caller is not smart enough to call a type-specific free function themselves,
+ * would be to create a callback interface where destructors for specific types
+ * can be registered so that code *here* (GF_FREE, mem_put, etc.) can do the
+ * right thing.  That allows type-specific behavior without creating the kind
+ * of fragile coupling that we have now.
  */
 int
 gf_get_mem_type (void *ptr)
@@ -358,78 +370,201 @@ gf_get_mem_type (void *ptr)
 }
 
 
+#define POOL_SMALLEST   7       /* i.e. 128 */
+#define POOL_LARGEST    20      /* i.e. 1048576 */
+#define NPOOLS          (POOL_LARGEST - POOL_SMALLEST + 1)
+#define N_COLD_LISTS    1024
+#define POOL_SWEEP_SECS 30
 
-struct mem_pool *
-mem_pool_new_fn (unsigned long sizeof_type,
-                 unsigned long count, char *name)
+static pthread_key_t            pool_key;
+static pthread_mutex_t          pool_lock       = PTHREAD_MUTEX_INITIALIZER;
+static struct list_head         pool_threads;
+static pthread_mutex_t          pool_free_lock  = PTHREAD_MUTEX_INITIALIZER;
+static struct list_head         pool_free_threads;
+static struct mem_pool          pools[NPOOLS];
+static size_t                   pool_list_size;
+static unsigned long            sweep_times;
+static unsigned long            sweep_usecs;
+static unsigned long            frees_to_system;
+
+typedef struct {
+        struct list_head        death_row;
+        pooled_obj_hdr_t        *cold_lists[N_COLD_LISTS];
+        unsigned int            n_cold_lists;
+} sweep_state_t;
+
+
+void
+collect_garbage (sweep_state_t *state, per_thread_pool_list_t *pool_list)
 {
-        struct mem_pool  *mem_pool = NULL;
-        unsigned long     padded_sizeof_type = 0;
-        GF_UNUSED void             *pool = NULL;
-        GF_UNUSED int               i = 0;
-        int               ret = 0;
-        GF_UNUSED struct list_head *list = NULL;
-        glusterfs_ctx_t  *ctx = NULL;
-
-        if (!sizeof_type || !count) {
-                gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, EINVAL,
-                                  LG_MSG_INVALID_ARG, "invalid argument");
-                return NULL;
+        unsigned int            i;
+        per_thread_pool_t       *pt_pool;
+
+        if (pool_list->poison) {
+                list_del (&pool_list->thr_list);
+                list_add (&pool_list->thr_list, &state->death_row);
+                return;
         }
-        padded_sizeof_type = sizeof_type + GF_MEM_POOL_PAD_BOUNDARY;
 
-        mem_pool = GF_CALLOC (sizeof (*mem_pool), 1, gf_common_mt_mem_pool);
-        if (!mem_pool)
-                return NULL;
+        if (state->n_cold_lists >= N_COLD_LISTS) {
+                return;
+        }
 
-        ret = gf_asprintf (&mem_pool->name, "%s:%s", THIS->name, name);
-        if (ret < 0)
-                return NULL;
+        (void) pthread_spin_lock (&pool_list->lock);
+        for (i = 0; i < NPOOLS; ++i) {
+                pt_pool = &pool_list->pools[i];
+                if (pt_pool->cold_list) {
+                        state->cold_lists[state->n_cold_lists++]
+                                = pt_pool->cold_list;
+                }
+                pt_pool->cold_list = pt_pool->hot_list;
+                pt_pool->hot_list = NULL;
+                if (state->n_cold_lists >= N_COLD_LISTS) {
+                        /* We'll just catch up on a future pass. */
+                        break;
+                }
+        }
+        (void) pthread_spin_unlock (&pool_list->lock);
+}
 
-        if (!mem_pool->name) {
-                GF_FREE (mem_pool);
-                return NULL;
+
+void
+free_obj_list (pooled_obj_hdr_t *victim)
+{
+        pooled_obj_hdr_t        *next;
+
+        while (victim) {
+                next = victim->next;
+                free (victim);
+                victim = next;
+                ++frees_to_system;
         }
+}
 
-        LOCK_INIT (&mem_pool->lock);
-        INIT_LIST_HEAD (&mem_pool->list);
-        INIT_LIST_HEAD (&mem_pool->global_list);
+void *
+pool_sweeper (void *arg)
+{
+        sweep_state_t           state;
+        per_thread_pool_list_t  *pool_list;
+        per_thread_pool_list_t  *next_pl;
+        per_thread_pool_t       *pt_pool;
+        unsigned int            i;
+        struct timeval          begin_time;
+        struct timeval          end_time;
+        struct timeval          elapsed;
 
-        mem_pool->padded_sizeof_type = padded_sizeof_type;
-        mem_pool->real_sizeof_type = sizeof_type;
+        /*
+         * This is all a bit inelegant, but the point is to avoid doing
+         * expensive things (like freeing thousands of objects) while holding a
+         * global lock.  Thus, we split each iteration into three passes, with
+         * only the first and fastest holding the lock.
+         */
 
-#ifndef DEBUG
-        mem_pool->cold_count = count;
-        pool = GF_CALLOC (count, padded_sizeof_type, gf_common_mt_long);
-        if (!pool) {
-                GF_FREE (mem_pool->name);
-                GF_FREE (mem_pool);
-                return NULL;
+        for (;;) {
+                sleep (POOL_SWEEP_SECS);
+                INIT_LIST_HEAD (&state.death_row);
+                state.n_cold_lists = 0;
+
+                /* First pass: collect stuff that needs our attention. */
+                (void) gettimeofday (&begin_time, NULL);
+                (void) pthread_mutex_lock (&pool_lock);
+                list_for_each_entry_safe (pool_list, next_pl,
+                                          &pool_threads, thr_list) {
+                        collect_garbage (&state, pool_list);
+                }
+                (void) pthread_mutex_unlock (&pool_lock);
+                (void) gettimeofday (&end_time, NULL);
+                timersub (&end_time, &begin_time, &elapsed);
+                sweep_usecs += elapsed.tv_sec * 1000000 + elapsed.tv_usec;
+                sweep_times += 1;
+
+                /* Second pass: free dead pools. */
+                (void) pthread_mutex_lock (&pool_free_lock);
+                list_for_each_entry_safe (pool_list, next_pl,
+                                          &state.death_row, thr_list) {
+                        for (i = 0; i < NPOOLS; ++i) {
+                                pt_pool = &pool_list->pools[i];
+                                free_obj_list (pt_pool->cold_list);
+                                free_obj_list (pt_pool->hot_list);
+                                pt_pool->hot_list = pt_pool->cold_list = NULL;
+                        }
+                        list_del (&pool_list->thr_list);
+                        list_add (&pool_list->thr_list, &pool_free_threads);
+                }
+                (void) pthread_mutex_unlock (&pool_free_lock);
+
+                /* Third pass: free cold objects from live pools. */
+                for (i = 0; i < state.n_cold_lists; ++i) {
+                        free_obj_list (state.cold_lists[i]);
+                }
+        }
+}
+
+
+void
+pool_destructor (void *arg)
+{
+        per_thread_pool_list_t  *pool_list      = arg;
+
+        /* The pool-sweeper thread will take it from here. */
+        pool_list->poison = 1;
+}
+
+
+static __attribute__((constructor)) void
+mem_pools_preinit (void)
+{
+#if !defined(GF_DISABLE_MEMPOOL)
+        unsigned int    i;
+
+        /* Use a pthread_key destructor to clean up when a thread exits. */
+        if (pthread_key_create (&pool_key, pool_destructor) != 0) {
+                gf_log ("mem-pool", GF_LOG_CRITICAL,
+                        "failed to initialize mem-pool key");
         }
 
-        for (i = 0; i < count; i++) {
-                list = pool + (i * (padded_sizeof_type));
-                INIT_LIST_HEAD (list);
-                list_add_tail (list, &mem_pool->list);
+        INIT_LIST_HEAD (&pool_threads);
+        INIT_LIST_HEAD (&pool_free_threads);
+
+        for (i = 0; i < NPOOLS; ++i) {
+                pools[i].power_of_two = POOL_SMALLEST + i;
         }
 
-        mem_pool->pool = pool;
-        mem_pool->pool_end = pool + (count * (padded_sizeof_type));
+        pool_list_size = sizeof (per_thread_pool_list_t)
+                       + sizeof (per_thread_pool_t) * (NPOOLS - 1);
 #endif
+}
 
-        /* add this pool to the global list */
-        ctx = THIS->ctx;
-        if (!ctx)
-                goto out;
+void
+mem_pools_init (void)
+{
+        pthread_t       kid;
 
-        LOCK (&ctx->lock);
-        {
-                list_add (&mem_pool->global_list, &ctx->mempool_list);
+        (void) pthread_create (&kid, NULL, pool_sweeper, NULL);
+        (void) pthread_detach (kid);
+}
+ 
+struct mem_pool *
+mem_pool_new_fn (unsigned long sizeof_type,
+                 unsigned long count, char *name)
+{
+        unsigned int            i;
+
+        if (!sizeof_type) {
+                gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, EINVAL,
+                                  LG_MSG_INVALID_ARG, "invalid argument");
+                return NULL;
+        }
+
+        for (i = 0; i < NPOOLS; ++i) {
+                if (sizeof_type <= AVAILABLE_SIZE(pools[i].power_of_two)) {
+                        return &pools[i];
+                }
         }
-        UNLOCK (&ctx->lock);
 
-out:
-        return mem_pool;
+        gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, EINVAL,
+                          LG_MSG_INVALID_ARG, "invalid argument");
+        return NULL;
 }
 
 void*
@@ -445,117 +580,132 @@ mem_get0 (struct mem_pool *mem_pool)
 
         ptr = mem_get(mem_pool);
 
-        if (ptr)
-                memset(ptr, 0, mem_pool->real_sizeof_type);
+        if (ptr) {
+                memset (ptr, 0, AVAILABLE_SIZE(mem_pool->power_of_two));
+        }
 
         return ptr;
 }
 
-void *
-mem_get (struct mem_pool *mem_pool)
+
+per_thread_pool_list_t *
+mem_get_pool_list (void)
 {
-        struct list_head *list = NULL;
-        void             *ptr = NULL;
-        int             *in_use = NULL;
-        struct mem_pool **pool_ptr = NULL;
+        per_thread_pool_list_t  *pool_list;
+        unsigned int            i;
 
-        if (!mem_pool) {
-                gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, EINVAL,
-                                  LG_MSG_INVALID_ARG, "invalid argument");
-                return NULL;
+        pool_list = pthread_getspecific (pool_key);
+        if (pool_list) {
+                return pool_list;
         }
 
-        LOCK (&mem_pool->lock);
-        {
-                mem_pool->alloc_count++;
-                if (mem_pool->cold_count) {
-                        list = mem_pool->list.next;
-                        list_del (list);
+        (void) pthread_mutex_lock (&pool_free_lock);
+        if (!list_empty (&pool_free_threads)) {
+                pool_list = list_entry (pool_free_threads.next,
+                                        per_thread_pool_list_t, thr_list);
+                list_del (&pool_list->thr_list);
+        }
+        (void) pthread_mutex_unlock (&pool_free_lock);
 
-                        mem_pool->hot_count++;
-                        mem_pool->cold_count--;
+        if (!pool_list) {
+                pool_list = GF_CALLOC (pool_list_size, 1,
+                                       gf_common_mt_mem_pool);
+                if (!pool_list) {
+                        return NULL;
+                }
 
-                        if (mem_pool->max_alloc < mem_pool->hot_count)
-                                mem_pool->max_alloc = mem_pool->hot_count;
+                INIT_LIST_HEAD (&pool_list->thr_list);
+                (void) pthread_spin_init (&pool_list->lock,
+                                          PTHREAD_PROCESS_PRIVATE);
+                for (i = 0; i < NPOOLS; ++i) {
+                        pool_list->pools[i].parent = &pools[i];
+                        pool_list->pools[i].hot_list = NULL;
+                        pool_list->pools[i].cold_list = NULL;
+                }
+        }
 
-                        ptr = list;
-                        in_use = (ptr + GF_MEM_POOL_LIST_BOUNDARY +
-                                  GF_MEM_POOL_PTR);
-                        *in_use = 1;
+        (void) pthread_mutex_lock (&pool_lock);
+        pool_list->poison = 0;
+        list_add (&pool_list->thr_list, &pool_threads);
+        (void) pthread_mutex_unlock (&pool_lock);
 
-                        goto fwd_addr_out;
-                }
+        (void) pthread_setspecific (pool_key, pool_list);
+        return pool_list;
+}
 
-                /* This is a problem area. If we've run out of
-                 * chunks in our slab above, we need to allocate
-                 * enough memory to service this request.
-                 * The problem is, these individual chunks will fail
-                 * the first address range check in __is_member. Now, since
-                 * we're not allocating a full second slab, we wont have
-                 * enough info perform the range check in __is_member.
-                 *
-                 * I am working around this by performing a regular allocation
-                 * , just the way the caller would've done when not using the
-                 * mem-pool. That also means, we're not padding the size with
-                 * the list_head structure because, this will not be added to
-                 * the list of chunks that belong to the mem-pool allocated
-                 * initially.
-                 *
-                 * This is the best we can do without adding functionality for
-                 * managing multiple slabs. That does not interest us at present
-                 * because it is too much work knowing that a better slab
-                 * allocator is coming RSN.
-                 */
-                mem_pool->pool_misses++;
-                mem_pool->curr_stdalloc++;
-                if (mem_pool->max_stdalloc < mem_pool->curr_stdalloc)
-                        mem_pool->max_stdalloc = mem_pool->curr_stdalloc;
-                ptr = GF_CALLOC (1, mem_pool->padded_sizeof_type,
-                                 gf_common_mt_mem_pool);
-
-                /* Memory coming from the heap need not be transformed from a
-                 * chunkhead to a usable pointer since it is not coming from
-                 * the pool.
-                 */
+pooled_obj_hdr_t *
+mem_get_from_pool (per_thread_pool_t *pt_pool)
+{
+        pooled_obj_hdr_t        *retval;
+
+        retval = pt_pool->hot_list;
+        if (retval) {
+                (void) __sync_fetch_and_add (&pt_pool->parent->allocs_hot, 1);
+                pt_pool->hot_list = retval->next;
+                return retval;
         }
-fwd_addr_out:
-        pool_ptr = mem_pool_from_ptr (ptr);
-        *pool_ptr = (struct mem_pool *)mem_pool;
-        ptr = mem_pool_chunkhead2ptr (ptr);
-        UNLOCK (&mem_pool->lock);
 
-        return ptr;
+        retval = pt_pool->cold_list;
+        if (retval) {
+                (void) __sync_fetch_and_add (&pt_pool->parent->allocs_cold, 1);
+                pt_pool->cold_list = retval->next;
+                return retval;
+        }
+
+        (void) __sync_fetch_and_add (&pt_pool->parent->allocs_stdc, 1);
+        return malloc (1 << pt_pool->parent->power_of_two);
 }
 
 
-static int
-__is_member (struct mem_pool *pool, void *ptr)
+void *
+mem_get (struct mem_pool *mem_pool)
 {
-        if (!pool || !ptr) {
+#if defined(GF_DISABLE_MEMPOOL)
+        return GF_CALLOC (1, mem_pool->real_sizeof_type,
+                          gf_common_mt_mem_pool);
+#else
+        per_thread_pool_list_t  *pool_list;
+        per_thread_pool_t       *pt_pool;
+        pooled_obj_hdr_t        *retval;
+
+        if (!mem_pool) {
                 gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, EINVAL,
                                   LG_MSG_INVALID_ARG, "invalid argument");
-                return -1;
+                return NULL;
         }
 
-        if (ptr < pool->pool || ptr >= pool->pool_end)
-                return 0;
+        pool_list = mem_get_pool_list ();
+        if (!pool_list || pool_list->poison) {
+                return NULL;
+        }
 
-        if ((mem_pool_ptr2chunkhead (ptr) - pool->pool)
-            % pool->padded_sizeof_type)
-                return -1;
+        (void) pthread_spin_lock (&pool_list->lock);
+        pt_pool = &pool_list->pools[mem_pool->power_of_two-POOL_SMALLEST];
+        retval = mem_get_from_pool (pt_pool);
+        (void) pthread_spin_unlock (&pool_list->lock);
 
-        return 1;
-}
+        if (!retval) {
+                return NULL;
+        }
 
+        retval->magic = GF_MEM_HEADER_MAGIC;
+        retval->next = NULL;
+        retval->pool_list = pool_list;;
+        retval->power_of_two = mem_pool->power_of_two;
+
+        return retval + 1;
+}
+#endif /* GF_DISABLE_MEMPOOL */
 
 void
 mem_put (void *ptr)
 {
-        struct list_head *list = NULL;
-        int    *in_use = NULL;
-        void   *head = NULL;
-        struct mem_pool **tmp = NULL;
-        struct mem_pool *pool = NULL;
+#if defined(GF_DISABLE_MEMPOOL)
+        GF_FREE (ptr);
+#else
+        pooled_obj_hdr_t        *hdr;
+        per_thread_pool_list_t  *pool_list;
+        per_thread_pool_t       *pt_pool;
 
         if (!ptr) {
                 gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, EINVAL,
@@ -563,71 +713,21 @@ mem_put (void *ptr)
                 return;
         }
 
-        list = head = mem_pool_ptr2chunkhead (ptr);
-        tmp = mem_pool_from_ptr (head);
-        if (!tmp) {
-                gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, 0,
-                                  LG_MSG_PTR_HEADER_CORRUPTED,
-                                  "ptr header is corrupted");
-                return;
-        }
-
-        pool = *tmp;
-        if (!pool) {
-                gf_msg_callingfn ("mem-pool", GF_LOG_ERROR, 0,
-                                  LG_MSG_MEMPOOL_PTR_NULL,
-                                  "mem-pool ptr is NULL");
+        hdr = ((pooled_obj_hdr_t *)ptr) - 1;
+        if (hdr->magic != GF_MEM_HEADER_MAGIC) {
+                /* Not one of ours; don't touch it. */
                 return;
         }
-        LOCK (&pool->lock);
-        {
-
-                switch (__is_member (pool, ptr))
-                {
-                case 1:
-                        in_use = (head + GF_MEM_POOL_LIST_BOUNDARY +
-                                  GF_MEM_POOL_PTR);
-                        if (!is_mem_chunk_in_use(in_use)) {
-                                gf_msg_callingfn ("mem-pool", GF_LOG_CRITICAL,
-                                                  0,
-                                                  LG_MSG_MEMPOOL_INVALID_FREE,
-                                                  "mem_put called on freed ptr"
-                                                  " %p of mem pool %p", ptr,
-                                                  pool);
-                                break;
-                        }
-                        pool->hot_count--;
-                        pool->cold_count++;
-                        *in_use = 0;
-                        list_add (list, &pool->list);
-                        break;
-                case -1:
-                        /* For some reason, the address given is within
-                         * the address range of the mem-pool but does not align
-                         * with the expected start of a chunk that includes
-                         * the list headers also. Sounds like a problem in
-                         * layers of clouds up above us. ;)
-                         */
-                        abort ();
-                        break;
-                case 0:
-                        /* The address is outside the range of the mem-pool. We
-                         * assume here that this address was allocated at a
-                         * point when the mem-pool was out of chunks in mem_get
-                         * or the programmer has made a mistake by calling the
-                         * wrong de-allocation interface. We do
-                         * not have enough info to distinguish between the two
-                         * situations.
-                         */
-                        pool->curr_stdalloc--;
-                        GF_FREE (list);
-                        break;
-                default:
-                        /* log error */
-                        break;
-                }
-        }
-        UNLOCK (&pool->lock);
+        pool_list = hdr->pool_list;
+        pt_pool = &pool_list->pools[hdr->power_of_two-POOL_SMALLEST];
+
+        (void) pthread_spin_lock (&pool_list->lock);
+        hdr->magic = GF_MEM_INVALID_MAGIC;
+        hdr->next = pt_pool->hot_list;
+        pt_pool->hot_list = hdr;
+        (void) __sync_fetch_and_add (&pt_pool->parent->frees_to_list, 1);
+        (void) pthread_spin_unlock (&pool_list->lock);
+#endif /* GF_DISABLE_MEMPOOL */
 }
 
 void
@@ -636,16 +736,11 @@ mem_pool_destroy (struct mem_pool *pool)
         if (!pool)
                 return;
 
-        gf_msg (THIS->name, GF_LOG_INFO, 0, LG_MSG_MEM_POOL_DESTROY, "size=%lu "
-                "max=%d total=%"PRIu64, pool->padded_sizeof_type,
-                pool->max_alloc, pool->alloc_count);
-
-        list_del (&pool->global_list);
-
-        LOCK_DESTROY (&pool->lock);
-        GF_FREE (pool->name);
-        GF_FREE (pool->pool);
-        GF_FREE (pool);
-
-        return;
+        /*
+         * Pools are now permanent, so this does nothing.  Yes, this means we
+         * can keep allocating from a pool after calling mem_destroy on it, but
+         * that's kind of OK.  All of the objects *in* the pool will eventually
+         * be freed via the pool-sweeper thread, and this way we don't have to
+         * add a lot of reference-counting complexity.
+         */
 }
diff --git a/libglusterfs/src/mem-pool.h b/libglusterfs/src/mem-pool.h
index 6cff7be..0dc1863 100644
--- a/libglusterfs/src/mem-pool.h
+++ b/libglusterfs/src/mem-pool.h
@@ -209,24 +209,61 @@ out:
         return dup_mem;
 }
 
+typedef struct pooled_obj_hdr {
+        unsigned long                   magic;
+        struct pooled_obj_hdr           *next;
+        struct per_thread_pool_list     *pool_list;
+        unsigned int                    power_of_two;
+} pooled_obj_hdr_t;
+
+#define AVAILABLE_SIZE(p2)      ((1 << (p2)) - sizeof(pooled_obj_hdr_t))
+
+typedef struct per_thread_pool {
+        /* This never changes, so doesn't need a lock. */
+        struct mem_pool         *parent;
+        /* Everything else is protected by our own lock. */
+        pooled_obj_hdr_t        *hot_list;
+        pooled_obj_hdr_t        *cold_list;
+} per_thread_pool_t;
+
+typedef struct per_thread_pool_list {
+        /*
+         * These first two members are protected by the global pool lock.  When
+         * a thread first tries to use any pool, we create one of these.  We
+         * link it into the global list using thr_list so the pool-sweeper
+         * thread can find it, and use pthread_setspecific so this thread can
+         * find it.  When the per-thread destructor runs, we "poison" the pool
+         * list to prevent further allocations.  This also signals to the
+         * pool-sweeper thread that the list should be detached and freed after
+         * the next time it's swept.
+         */
+        struct list_head        thr_list;
+        unsigned int            poison;
+        /*
+         * There's really more than one pool, but the actual number is hidden
+         * in the implementation code so we just make it a single-element array
+         * here.
+         */
+        pthread_spinlock_t      lock;
+        per_thread_pool_t       pools[1];
+} per_thread_pool_list_t;
+
 struct mem_pool {
-        struct list_head  list;
-        int               hot_count;
-        int               cold_count;
-        gf_lock_t         lock;
-        unsigned long     padded_sizeof_type;
-        void             *pool;
-        void             *pool_end;
-        int               real_sizeof_type;
-        uint64_t          alloc_count;
-        uint64_t          pool_misses;
-        int               max_alloc;
-        int               curr_stdalloc;
-        int               max_stdalloc;
-        char             *name;
-        struct list_head  global_list;
+        unsigned int            power_of_two;
+        /*
+         * Updates to these are *not* protected by a global lock, so races
+         * could occur and the numbers might be slightly off.  Don't expect
+         * them to line up exactly.  It's the general trends that matter, and
+         * it's not worth the locked-bus-cycle overhead to make these precise.
+         */
+        unsigned long           allocs_hot;
+        unsigned long           allocs_cold;
+        unsigned long           allocs_stdc;
+        unsigned long           frees_to_list;
 };
 
+void mem_pools_init (void);
+
 struct mem_pool *
 mem_pool_new_fn (unsigned long sizeof_type, unsigned long count, char *name);
 
diff --git a/libglusterfs/src/statedump.c b/libglusterfs/src/statedump.c
index a292857..bb8043a 100644
--- a/libglusterfs/src/statedump.c
+++ b/libglusterfs/src/statedump.c
@@ -376,6 +376,7 @@ gf_proc_dump_mem_info_to_dict (dict_t *dict)
 void
 gf_proc_dump_mempool_info (glusterfs_ctx_t *ctx)
 {
+#if defined(OLD_MEM_POOLS)
         struct mem_pool *pool = NULL;
 
         gf_proc_dump_add_section ("mempool");
@@ -394,11 +395,13 @@ gf_proc_dump_mempool_info (glusterfs_ctx_t *ctx)
                 gf_proc_dump_write ("cur-stdalloc", "%d", pool->curr_stdalloc);
                 gf_proc_dump_write ("max-stdalloc", "%d", pool->max_stdalloc);
         }
+#endif
 }
 
 void
 gf_proc_dump_mempool_info_to_dict (glusterfs_ctx_t *ctx, dict_t *dict)
 {
+#if defined(OLD_MEM_POOLS)
         struct mem_pool *pool = NULL;
         char            key[GF_DUMP_MAX_BUF_LEN] = {0,};
         int             count = 0;
@@ -458,8 +461,7 @@ gf_proc_dump_mempool_info_to_dict (glusterfs_ctx_t *ctx, dict_t *dict)
                 count++;
         }
         ret = dict_set_int32 (dict, "mempool-count", count);
-
-        return;
+#endif
 }
 
 void gf_proc_dump_latency_info (xlator_t *xl);
diff --git a/tests/basic/quota-anon-fd-nfs.t b/tests/basic/quota-anon-fd-nfs.t
index c6b0155..ea07b52 100755
--- a/tests/basic/quota-anon-fd-nfs.t
+++ b/tests/basic/quota-anon-fd-nfs.t
@@ -97,6 +97,15 @@ $CLI volume statedump $V0 all
 
 EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0
 
+# This is ugly, but there seems to be a latent race between other actions and
+# stopping the volume.  The visible symptom is that "umount -l" (run from
+# gf_umount_lazy in glusterd) hangs.  This happens pretty consistently with the
+# new mem-pool code, though it's not really anything to do with memory pools -
+# just with changed timing.  Adding the sleep here makes it work consistently.
+#
+# If anyone else wants to debug the race condition, feel free.
+sleep 3
+
 TEST $CLI volume stop $V0
 EXPECT "1" get_aux
 
-- 
1.8.3.1