From 2975f68e139169ee2d2259cfbbb2a15b54dc3724 Mon Sep 17 00:00:00 2001
From: William Brown <firstyear@redhat.com>
Date: Wed, 26 Jul 2017 11:01:49 +1000
Subject: [PATCH] Ticket 49330 - Improve ndn cache performance 1.3.6

Backport from 1.3.7 master.

Bug Description:  Normalised DN's are a costly process to update
and maintain. As a result, a normalised DN cache was created. Yet
it was never able to perform well. In some datasets with large sets
of dn attr types, the NDN cache actively hurt performance.

The issue stemmed from 3 major issues in the design of the NDN
cache.

First, it is a global cache which means it exists behind
a rwlock. This causes delay as threads wait behind the lock
to access or update the cache (especially on a miss).

Second, the cache was limited to 4073 buckets. Despite the fact
that a prime number on a hash causes a skew in distribution,
this was in an NSPR hash - which does not grow dynamically,
rather devolving a bucket to a linked list. AS a result, once you
passed ~3000 your lookup performance would degrade rapidly to O(1)

Finally, the cache's lru policy did not evict least used - it
evicted the 10,000 least used. So if you tuned your cache
to match the NSPR map, every inclusion that would trigger a
delete of old values would effectively empty your cache. ON bigger
set sizes, this has to walk the map (at O(1)) to clean 10,000
elements.

Premature optimisation strikes again ....

Fix Description:  Throw it out. Rewrite. We now use a hash
algo that has proper distribution across a set. The hash
sizes slots to a power of two. Finally, each thread has
a private cache rather than shared which completely eliminates
a lock contention and even NUMA performance issues.

Interestingly this fix should have improvements for DB
imports, memberof and refint performance and more.

Some testing has shown in simple search workloads a 10%
improvement in throughput, and on complex searches a 47x
improvement.

https://pagure.io/389-ds-base/issue/49330

Author: wibrown

Review by: lkrispen, tbordaz
---
 ldap/servers/slapd/back-ldbm/monitor.c |  11 +-
 ldap/servers/slapd/dn.c                | 809 +++++++++++++++++++++------------
 ldap/servers/slapd/slapi-private.h     |   2 +-
 3 files changed, 527 insertions(+), 295 deletions(-)

diff --git a/ldap/servers/slapd/back-ldbm/monitor.c b/ldap/servers/slapd/back-ldbm/monitor.c
index c58b069..aa7d709 100644
--- a/ldap/servers/slapd/back-ldbm/monitor.c
+++ b/ldap/servers/slapd/back-ldbm/monitor.c
@@ -43,6 +43,9 @@ int ldbm_back_monitor_instance_search(Slapi_PBlock *pb, Slapi_Entry *e,
     PRUint64 hits, tries;
     long nentries, maxentries, count;
     size_t size, maxsize;
+    size_t thread_size;
+    size_t evicts;
+    size_t slots;
 /* NPCTE fix for bugid 544365, esc 0. <P.R> <04-Jul-2001> */
     struct stat astat;
 /* end of NPCTE fix for bugid 544365 */
@@ -118,7 +121,7 @@ int ldbm_back_monitor_instance_search(Slapi_PBlock *pb, Slapi_Entry *e,
     }
     /* normalized dn cache stats */
     if(ndn_cache_started()){
-        ndn_cache_get_stats(&hits, &tries, &size, &maxsize, &count);
+        ndn_cache_get_stats(&hits, &tries, &size, &maxsize, &thread_size, &evicts, &slots, &count);
         sprintf(buf, "%" PRIu64, tries);
         MSET("normalizedDnCacheTries");
         sprintf(buf, "%" PRIu64, hits);
@@ -127,6 +130,8 @@ int ldbm_back_monitor_instance_search(Slapi_PBlock *pb, Slapi_Entry *e,
         MSET("normalizedDnCacheMisses");
         sprintf(buf, "%lu", (unsigned long)(100.0*(double)hits / (double)(tries > 0 ? tries : 1)));
         MSET("normalizedDnCacheHitRatio");
+        sprintf(buf, "%"PRIu64, evicts);
+        MSET("NormalizedDnCacheEvictions");
         sprintf(buf, "%lu", (long unsigned int)size);
         MSET("currentNormalizedDnCacheSize");
         if(maxsize == 0){
@@ -135,6 +140,10 @@ int ldbm_back_monitor_instance_search(Slapi_PBlock *pb, Slapi_Entry *e,
         	sprintf(buf, "%lu", (long unsigned int)maxsize);
         }
         MSET("maxNormalizedDnCacheSize");
+        sprintf(buf, "%"PRIu64, thread_size);
+        MSET("NormalizedDnCacheThreadSize");
+        sprintf(buf, "%"PRIu64, slots);
+        MSET("NormalizedDnCacheThreadSlots");
         sprintf(buf, "%ld", count);
         MSET("currentNormalizedDnCacheCount");
     }
diff --git a/ldap/servers/slapd/dn.c b/ldap/servers/slapd/dn.c
index fa3909f..9cb3e7b 100644
--- a/ldap/servers/slapd/dn.c
+++ b/ldap/servers/slapd/dn.c
@@ -22,6 +22,24 @@
 #include "slap.h"
 #include <plhash.h>
 
+#include <inttypes.h>
+#include <stddef.h> /* for size_t */
+
+#if defined(HAVE_SYS_ENDIAN_H)
+#include <sys/endian.h>
+#elif defined(HAVE_ENDIAN_H)
+#include <endian.h>
+#else
+#error platform header for endian detection not found.
+#endif
+
+/* See: http://sourceforge.net/p/predef/wiki/Endianness/ */
+#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN
+#define _le64toh(x) ((uint64_t)(x))
+#else
+#define _le64toh(x) le64toh(x)
+#endif
+
 #undef SDN_DEBUG
 
 static void add_rdn_av( char *avstart, char *avend, int *rdn_av_countp,
@@ -33,52 +51,89 @@ static void rdn_av_swap( struct berval *av1, struct berval *av2, int escape );
 static int does_cn_uses_dn_syntax_in_dns(char *type, char *dn);
 
 /* normalized dn cache related definitions*/
-struct
-ndn_cache_lru
-{
-    struct ndn_cache_lru *prev;
-    struct ndn_cache_lru *next;
-    char *key;
-};
-
-struct
-ndn_cache_ctx
-{
-    struct ndn_cache_lru *head;
-    struct ndn_cache_lru *tail;
+struct ndn_cache_stats {
     Slapi_Counter *cache_hits;
     Slapi_Counter *cache_tries;
-    Slapi_Counter *cache_misses;
-    size_t cache_size;
-    size_t cache_max_size;
-    long cache_count;
+    Slapi_Counter *cache_count;
+    Slapi_Counter *cache_size;
+    Slapi_Counter *cache_evicts;
+    size_t max_size;
+    size_t thread_max_size;
+    size_t slots;
 };
 
-struct
-ndn_hash_val
-{
+struct ndn_cache_value {
+    size_t size;
+    size_t slot;
+    char *dn;
     char *ndn;
-    size_t len;
-    int size;
-    struct ndn_cache_lru *lru_node; /* used to speed up lru shuffling */
+    struct ndn_cache_value *next;
+    struct ndn_cache_value *prev;
+    struct ndn_cache_value *child;
+};
+
+/*
+ * This uses a similar alloc trick to IDList to keep
+ * The amount of derefs small.
+ */
+struct ndn_cache {
+    /*
+     * We keep per thread stats and flush them occasionally
+     */
+    size_t max_size;
+    /* Need to track this because we need to provide diffs to counter */
+    size_t last_count;
+    size_t count;
+    /* Number of ops */
+    size_t tries;
+    /* hit vs miss. in theroy miss == tries - hits.*/
+    size_t hits;
+    /* How many values we kicked out */
+    size_t evicts;
+    /* Need to track this because we need to provide diffs to counter */
+    size_t last_size;
+    size_t size;
+
+    size_t slots;
+    /*
+     * This is used by siphash to prevent hash bugket attacks
+     */
+    char key[16];
+
+    struct ndn_cache_value *head;
+    struct ndn_cache_value *tail;
+    struct ndn_cache_value *table[1];
 };
 
-#define NDN_FLUSH_COUNT 10000 /* number of DN's to remove when cache fills up */
-#define NDN_MIN_COUNT 1000 /* the minimum number of DN's to keep in the cache */
-#define NDN_CACHE_BUCKETS 2053 /* prime number */
+/*
+ * This means we need 1 MB minimum per thread
+ * 
+ */
+#define NDN_CACHE_MINIMUM_CAPACITY 1048576
+/*
+ * This helps us define the number of hashtable slots
+ * to create. We assume an average DN is 64 chars long
+ * This way we end up we a ht entry of:
+ * 8 bytes: from the table pointing to us.
+ * 8 bytes: next ptr
+ * 8 bytes: prev ptr
+ * 8 bytes + 64: dn
+ * 8 bytes + 64: ndn itself.
+ * This gives us 168 bytes. In theory this means
+ * 6241 entries, but we have to clamp this to a power of
+ * two, so we have 8192 slots. In reality, dns may be
+ * shorter *and* the dn may be the same as the ndn
+ * so we *may* store more ndns that this. Again, a good reason
+ * to round the ht size up!
+ */
+#define NDN_ENTRY_AVG_SIZE 168
+/*
+ * After how many operations do we sync our per-thread stats.
+ */
+#define NDN_STAT_COMMIT_FREQUENCY 256
 
-static PLHashNumber ndn_hash_string(const void *key);
 static int ndn_cache_lookup(char *dn, size_t dn_len, char **result, char **udn, int *rc);
-static void ndn_cache_update_lru(struct ndn_cache_lru **node);
 static void ndn_cache_add(char *dn, size_t dn_len, char *ndn, size_t ndn_len);
-static void ndn_cache_delete(char *dn);
-static void ndn_cache_flush(void);
-static void ndn_cache_free(void);
-static int ndn_started = 0;
-static PRLock *lru_lock = NULL;
-static Slapi_RWLock *ndn_cache_lock = NULL;
-static struct ndn_cache_ctx *ndn_cache = NULL;
-static PLHashTable *ndn_cache_hashtable = NULL;
 
 #define ISBLANK(c)	((c) == ' ')
 #define ISBLANKSTR(s)	(((*(s)) == '2') && (*((s)+1) == '0'))
@@ -2768,166 +2823,408 @@ slapi_sdn_get_size(const Slapi_DN *sdn)
  *
  */
 
+/* <MIT License>
+ Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ </MIT License>
+
+ Original location:
+    https://github.com/majek/csiphash/
+
+ Solution inspired by code from:
+    Samuel Neves (supercop/crypto_auth/siphash24/little)
+    djb (supercop/crypto_auth/siphash24/little2)
+    Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
+*/
+
+#define ROTATE(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))
+
+#define HALF_ROUND(a, b, c, d, s, t) \
+    a += b;                          \
+    c += d;                          \
+    b = ROTATE(b, s) ^ a;            \
+    d = ROTATE(d, t) ^ c;            \
+    a = ROTATE(a, 32);
+
+#define ROUND(v0, v1, v2, v3)           \
+    HALF_ROUND(v0, v1, v2, v3, 13, 16); \
+    HALF_ROUND(v2, v1, v0, v3, 17, 21)
+
+#define cROUND(v0, v1, v2, v3) \
+    ROUND(v0, v1, v2, v3)
+
+#define dROUND(v0, v1, v2, v3) \
+    ROUND(v0, v1, v2, v3);     \
+    ROUND(v0, v1, v2, v3);     \
+    ROUND(v0, v1, v2, v3)
+
+
+static uint64_t
+sds_siphash13(const void *src, size_t src_sz, const char key[16])
+{
+    const uint64_t *_key = (uint64_t *)key;
+    uint64_t k0 = _le64toh(_key[0]);
+    uint64_t k1 = _le64toh(_key[1]);
+    uint64_t b = (uint64_t)src_sz << 56;
+    const uint64_t *in = (uint64_t *)src;
+
+    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
+    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
+    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
+    uint64_t v3 = k1 ^ 0x7465646279746573ULL;
+
+    while (src_sz >= 8) {
+        uint64_t mi = _le64toh(*in);
+        in += 1;
+        src_sz -= 8;
+        v3 ^= mi;
+        // cround
+        cROUND(v0, v1, v2, v3);
+        v0 ^= mi;
+    }
+
+    uint64_t t = 0;
+    uint8_t *pt = (uint8_t *)&t;
+    uint8_t *m = (uint8_t *)in;
+
+    switch (src_sz) {
+    case 7:
+        pt[6] = m[6]; /* FALLTHRU */
+    case 6:
+        pt[5] = m[5]; /* FALLTHRU */
+    case 5:
+        pt[4] = m[4]; /* FALLTHRU */
+    case 4:
+        *((uint32_t *)&pt[0]) = *((uint32_t *)&m[0]);
+        break;
+    case 3:
+        pt[2] = m[2]; /* FALLTHRU */
+    case 2:
+        pt[1] = m[1]; /* FALLTHRU */
+    case 1:
+        pt[0] = m[0]; /* FALLTHRU */
+    }
+    b |= _le64toh(t);
+
+    v3 ^= b;
+    // cround
+    cROUND(v0, v1, v2, v3);
+    v0 ^= b;
+    v2 ^= 0xff;
+    // dround
+    dROUND(v0, v1, v2, v3);
+    return (v0 ^ v1) ^ (v2 ^ v3);
+}
+
+static pthread_key_t ndn_cache_key;
+static pthread_once_t ndn_cache_key_once = PTHREAD_ONCE_INIT;
+static struct ndn_cache_stats t_cache_stats = {0};
 /*
- *  Hashing function using Bernstein's method
+ * WARNING: For some reason we try to use the NDN cache *before*
+ * we have a chance to configure it. As a result, we need to rely
+ * on a trick in the way we start, that we start in one thread
+ * so we can manipulate ints as though they were atomics, then
+ * we start in *one* thread, so it's set, then when threads
+ * fork the get barriers, so we can go from there. However we *CANNOT*
+ * change this at runtime without expensive atomics per op, so lets
+ * not bother until we improve libglobs to be COW.
  */
-static PLHashNumber
-ndn_hash_string(const void *key)
-{
-    PLHashNumber hash = 5381;
-    unsigned char *x = (unsigned char *)key;
-    int c;
+static int32_t ndn_enabled = 0;
+
+static struct ndn_cache *
+ndn_thread_cache_create(size_t thread_max_size, size_t slots) {
+    size_t t_cache_size = sizeof(struct ndn_cache) + (slots * sizeof(struct ndn_cache_value *));
+    struct ndn_cache *t_cache = (struct ndn_cache *)slapi_ch_calloc(1, t_cache_size);
+
+    t_cache->max_size = thread_max_size;
+    t_cache->slots = slots;
 
-    while ((c = *x++)){
-        hash = ((hash << 5) + hash) ^ c;
+    return t_cache;
+}
+
+static void
+ndn_thread_cache_commit_status(struct ndn_cache *t_cache) {
+    /*
+     * Every so often we commit these atomically. We do this infrequently
+     * to avoid the costly atomics.
+     */
+    if (t_cache->tries % NDN_STAT_COMMIT_FREQUENCY == 0) {
+        /* We can just add tries and hits. */
+        slapi_counter_add(t_cache_stats.cache_evicts, t_cache->evicts);
+        slapi_counter_add(t_cache_stats.cache_tries, t_cache->tries);
+        slapi_counter_add(t_cache_stats.cache_hits, t_cache->hits);
+        t_cache->hits = 0;
+        t_cache->tries = 0;
+        t_cache->evicts = 0;
+        /* Count and size need diff */
+        int64_t diff = (t_cache->size - t_cache->last_size);
+        if (diff > 0) {
+            // We have more ....
+            slapi_counter_add(t_cache_stats.cache_size, (uint64_t)diff);
+        } else if (diff < 0) {
+            slapi_counter_subtract(t_cache_stats.cache_size, (uint64_t)llabs(diff));
+        }
+        t_cache->last_size = t_cache->size;
+
+        diff = (t_cache->count - t_cache->last_count);
+        if (diff > 0) {
+            // We have more ....
+            slapi_counter_add(t_cache_stats.cache_count, (uint64_t)diff);
+        } else if (diff < 0) {
+            slapi_counter_subtract(t_cache_stats.cache_count, (uint64_t)llabs(diff));
+        }
+        t_cache->last_count = t_cache->count;
+
+    }
+}
+
+static void
+ndn_thread_cache_value_destroy(struct ndn_cache *t_cache, struct ndn_cache_value *v) {
+    /* Update stats */
+    t_cache->size = t_cache->size - v->size;
+    t_cache->count--;
+    t_cache->evicts++;
+
+    if (v == t_cache->head) {
+        t_cache->head = v->prev;
+    }
+    if (v == t_cache->tail) {
+        t_cache->tail = v->next;
+    }
+
+    /* Cut the node out. */
+    if (v->next != NULL) {
+        v->next->prev = v->prev;
+    }
+    if (v->prev != NULL) {
+        v->prev->next = v->next;
+    }
+    /* Set the pointer in the table to NULL */
+    /* Now see if we were in a list */
+    struct ndn_cache_value *slot_node = t_cache->table[v->slot];
+    if (slot_node == v) {
+        t_cache->table[v->slot] = v->child;
+    } else {
+        struct ndn_cache_value *former_slot_node = NULL;
+        do {
+            former_slot_node = slot_node;
+            slot_node = slot_node->child;
+        } while(slot_node != v);
+        /* Okay, now slot_node is us, and former is our parent */
+        former_slot_node->child = v->child;
+    }
+
+    slapi_ch_free((void **)&(v->dn));
+    slapi_ch_free((void **)&(v->ndn));
+    slapi_ch_free((void **)&v);
+}
+
+static void
+ndn_thread_cache_destroy(void *v_cache) {
+    struct ndn_cache *t_cache = (struct ndn_cache *)v_cache;
+    /*
+     * FREE ALL THE NODES!!!
+     */
+    struct ndn_cache_value *node = t_cache->tail;
+    struct ndn_cache_value *next_node = NULL;
+    while (node) {
+        next_node = node->next;
+        ndn_thread_cache_value_destroy(t_cache, node);
+        node = next_node;
+    }
+    slapi_ch_free((void **)&t_cache);
+}
+
+static void
+ndn_cache_key_init() {
+    if (pthread_key_create(&ndn_cache_key, ndn_thread_cache_destroy) != 0) {
+        /* Log a scary warning? */
+        slapi_log_err(SLAPI_LOG_ERR, "ndn_cache_init", "Failed to create pthread key, aborting.\n");
     }
-    return hash;
 }
 
 void
 ndn_cache_init()
 {
-    if(!config_get_ndn_cache_enabled() || ndn_started){
+    ndn_enabled = config_get_ndn_cache_enabled();
+    if (ndn_enabled == 0) {
+        /*
+         * Don't configure the keys or anything, need a restart
+         * to enable. We'll just never use ndn cache in this
+         * run.
+         */
         return;
     }
-    ndn_cache_hashtable = PL_NewHashTable( NDN_CACHE_BUCKETS, ndn_hash_string, PL_CompareStrings, PL_CompareValues, 0, 0);
-    ndn_cache = (struct ndn_cache_ctx *)slapi_ch_malloc(sizeof(struct ndn_cache_ctx));
-    ndn_cache->cache_max_size = config_get_ndn_cache_size();
-    ndn_cache->cache_hits = slapi_counter_new();
-    ndn_cache->cache_tries = slapi_counter_new();
-    ndn_cache->cache_misses = slapi_counter_new();
-    ndn_cache->cache_count = 0;
-    ndn_cache->cache_size = sizeof(struct ndn_cache_ctx) + sizeof(PLHashTable) + sizeof(PLHashTable);
-    ndn_cache->head = NULL;
-    ndn_cache->tail = NULL;
-    ndn_started = 1;
-    if ( NULL == ( lru_lock = PR_NewLock()) ||  NULL == ( ndn_cache_lock = slapi_new_rwlock())) {
-        ndn_cache_destroy();
-        slapi_log_err(SLAPI_LOG_ERR, "ndn_cache_init", "Failed to create locks.  Disabling cache.\n" );
+
+    /* Create the pthread key */
+    (void)pthread_once(&ndn_cache_key_once, ndn_cache_key_init);
+
+    /* Create the global stats. */
+    t_cache_stats.max_size = config_get_ndn_cache_size();
+    t_cache_stats.cache_evicts = slapi_counter_new();
+    t_cache_stats.cache_tries = slapi_counter_new();
+    t_cache_stats.cache_hits = slapi_counter_new();
+    t_cache_stats.cache_count = slapi_counter_new();
+    t_cache_stats.cache_size = slapi_counter_new();
+    /* Get thread numbers and calc the per thread size */
+    int32_t maxthreads = (int32_t)config_get_threadnumber();
+    size_t tentative_size = t_cache_stats.max_size / maxthreads;
+    if (tentative_size < NDN_CACHE_MINIMUM_CAPACITY) {
+        tentative_size = NDN_CACHE_MINIMUM_CAPACITY;
+        t_cache_stats.max_size = NDN_CACHE_MINIMUM_CAPACITY * maxthreads;
+    }
+    t_cache_stats.thread_max_size = tentative_size;
+
+    /*
+     * Slots *must* be a power of two, even if the number of entries
+     * we store will be *less* than this.
+     */
+    size_t possible_elements = tentative_size / NDN_ENTRY_AVG_SIZE;
+    /*
+     * So this is like 1048576 / 168, so we get 6241. Now we need to
+     * shift this to get the number of bits.
+     */
+    size_t shifts = 0;
+    while (possible_elements > 0) {
+        shifts++;
+        possible_elements = possible_elements >> 1;
     }
+    /*
+     * So now we can use this to make the slot count.
+     */
+    t_cache_stats.slots = 1 << shifts;
+    /* Done? */
+    return;
 }
 
 void
 ndn_cache_destroy()
 {
-    if(!ndn_started){
+    if (ndn_enabled == 0) {
         return;
     }
-    if(lru_lock){
-        PR_DestroyLock(lru_lock);
-        lru_lock = NULL;
-    }
-    if(ndn_cache_lock){
-        slapi_destroy_rwlock(ndn_cache_lock);
-        ndn_cache_lock = NULL;
-    }
-    if(ndn_cache_hashtable){
-        ndn_cache_free();
-        PL_HashTableDestroy(ndn_cache_hashtable);
-        ndn_cache_hashtable = NULL;
-    }
-    config_set_ndn_cache_enabled(CONFIG_NDN_CACHE, "off", NULL, 1 );
-    slapi_counter_destroy(&ndn_cache->cache_hits);
-    slapi_counter_destroy(&ndn_cache->cache_tries);
-    slapi_counter_destroy(&ndn_cache->cache_misses);
-    slapi_ch_free((void **)&ndn_cache);
-
-    ndn_started = 0;
+    slapi_counter_destroy(&(t_cache_stats.cache_tries));
+    slapi_counter_destroy(&(t_cache_stats.cache_hits));
+    slapi_counter_destroy(&(t_cache_stats.cache_count));
+    slapi_counter_destroy(&(t_cache_stats.cache_size));
+    slapi_counter_destroy(&(t_cache_stats.cache_evicts));
 }
 
 int
 ndn_cache_started()
 {
-    return ndn_started;
+    return ndn_enabled;
 }
 
 /*
  *  Look up this dn in the ndn cache
  */
 static int
-ndn_cache_lookup(char *dn, size_t dn_len, char **result, char **udn, int *rc)
+ndn_cache_lookup(char *dn, size_t dn_len, char **ndn, char **udn, int *rc)
 {
-    struct ndn_hash_val *ndn_ht_val = NULL;
-    char *ndn, *key;
-    int rv = 0;
-
-    if(NULL == udn){
-        return rv;
+    if (ndn_enabled == 0 || NULL == udn) {
+        return 0;
     }
     *udn = NULL;
-    if(ndn_started == 0){
-        return rv;
-    }
-    if(dn_len == 0){
-        *result = dn;
+
+    if (dn_len == 0) {
+        *ndn = dn;
         *rc = 0;
         return 1;
     }
-    slapi_counter_increment(ndn_cache->cache_tries);
-    slapi_rwlock_rdlock(ndn_cache_lock);
-    ndn_ht_val = (struct ndn_hash_val *)PL_HashTableLookupConst(ndn_cache_hashtable, dn);
-    if(ndn_ht_val){
-        ndn_cache_update_lru(&ndn_ht_val->lru_node);
-        slapi_counter_increment(ndn_cache->cache_hits);
-        if ((ndn_ht_val->len != dn_len) || 
-            /* even if the lengths match, dn may not be normalized yet.
-             * (e.g., 'cn="o=ABC",o=XYZ' vs. 'cn=o\3DABC,o=XYZ') */
-            (memcmp(dn, ndn_ht_val->ndn, dn_len))){
-            *rc = 1; /* free result */
-            ndn = slapi_ch_malloc(ndn_ht_val->len + 1);
-            memcpy(ndn, ndn_ht_val->ndn, ndn_ht_val->len);
-            ndn[ndn_ht_val->len] = '\0';
-            *result = ndn;
-        } else {
-            /* the dn was already normalized, just return the dn as the result */
-            *result = dn;
-            *rc = 0;
-        }
-        rv = 1;
-    } else {
-        /* copy/preserve the udn, so we can use it as the key when we add dn's to the hashtable */
-        key = slapi_ch_malloc(dn_len + 1);
-        memcpy(key, dn, dn_len);
-        key[dn_len] = '\0';
-        *udn = key;
+
+    struct ndn_cache *t_cache = pthread_getspecific(ndn_cache_key);
+    if (t_cache == NULL) {
+        t_cache = ndn_thread_cache_create(t_cache_stats.thread_max_size, t_cache_stats.slots);
+        pthread_setspecific(ndn_cache_key, t_cache);
+        /* If we have no cache, we can't look up ... */
+        return 0;
     }
-    slapi_rwlock_unlock(ndn_cache_lock);
 
-    return rv;
-}
+    t_cache->tries++;
 
-/*
- *  Move this lru node to the top of the list
- */
-static void
-ndn_cache_update_lru(struct ndn_cache_lru **node)
-{
-    struct ndn_cache_lru *prev, *next, *curr_node = *node;
+    /*
+     * Hash our DN ...
+     */
+    uint64_t dn_hash = sds_siphash13(dn, dn_len, t_cache->key);
+    /* Where should it be? */
+    size_t expect_slot = dn_hash % t_cache->slots;
 
-    if(curr_node == NULL){
-        return;
-    }
-    PR_Lock(lru_lock);
-    if(curr_node->prev == NULL){
-        /* already the top node */
-        PR_Unlock(lru_lock);
-        return;
-    }
-    prev = curr_node->prev;
-    next = curr_node->next;
-    if(next){
-        next->prev = prev;
-        prev->next = next;
-    } else {
-        /* this was the tail, so reset the tail */
-        ndn_cache->tail = prev;
-        prev->next = NULL;
+    /*
+     * Is it there?
+     */
+    if (t_cache->table[expect_slot] != NULL) {
+        /*
+         * Check it really matches, could be collision.
+         */
+        struct ndn_cache_value *node = t_cache->table[expect_slot];
+        while (node != NULL) {
+            if (strcmp(dn, node->dn) == 0) {
+                /*
+                 * Update LRU
+                 * Are we already the tail? If so, we can just skip.
+                 * remember, this means in a set of 1, we will always be tail
+                 */
+                if (t_cache->tail != node) {
+                    /*
+                     * Okay, we are *not* the tail. We could be anywhere between
+                     * tail -> ... -> x -> head
+                     * or even, we are the head ourself.
+                     */
+                    if (t_cache->head == node) {
+                        /* We are the head, update head to our predecessor */
+                        t_cache->head = node->prev;
+                        /* Remember, the head has no next. */
+                        t_cache->head->next = NULL;
+                    } else {
+                        /* Right, we aren't the head, so we have a next node. */
+                        node->next->prev = node->prev;
+                    }
+                    /* Because we must be in the middle somewhere, we can assume next and prev exist. */
+                    node->prev->next = node->next;
+                    /*
+                     * Tail can't be NULL if we have a value in the cache, so we can
+                     * just deref this.
+                     */
+                    node->next = t_cache->tail;
+                    t_cache->tail->prev = node;
+                    t_cache->tail = node;
+                    node->prev = NULL;
+                }
+                /* Update that we have a hit.*/
+                t_cache->hits++;
+                /* Cope the NDN to the caller. */
+                *ndn = slapi_ch_strdup(node->ndn);
+                /* Indicate to the caller to free this. */
+                *rc = 1;
+                ndn_thread_cache_commit_status(t_cache);
+                return 1;
+            }
+            node = node->child;
+        }
     }
-    curr_node->prev = NULL;
-    curr_node->next = ndn_cache->head;
-    ndn_cache->head->prev = curr_node;
-    ndn_cache->head = curr_node;
-    PR_Unlock(lru_lock);
+    /* If we miss, we need to duplicate dn to udn here. */
+    *udn = slapi_ch_strdup(dn);
+    *rc = 0;
+    ndn_thread_cache_commit_status(t_cache);
+    return 0;
 }
 
 /*
@@ -2936,176 +3233,102 @@ ndn_cache_update_lru(struct ndn_cache_lru **node)
 static void
 ndn_cache_add(char *dn, size_t dn_len, char *ndn, size_t ndn_len)
 {
-    struct ndn_hash_val *ht_entry;
-    struct ndn_cache_lru *new_node = NULL;
-    PLHashEntry *he;
-    int size;
-
-    if(ndn_started == 0 || dn_len == 0){
+    if (ndn_enabled == 0) {
         return;
     }
-    if(strlen(ndn) > ndn_len){
+    if (dn_len == 0) {
+        return;
+    }
+    if (strlen(ndn) > ndn_len) {
         /* we need to null terminate the ndn */
         *(ndn + ndn_len) = '\0';
     }
     /*
      *  Calculate the approximate memory footprint of the hash entry, key, and lru entry.
      */
-    size = (dn_len * 2) + ndn_len + sizeof(PLHashEntry) + sizeof(struct ndn_hash_val) + sizeof(struct ndn_cache_lru);
+    struct ndn_cache_value *new_value = (struct ndn_cache_value *)slapi_ch_calloc(1, sizeof(struct ndn_cache_value));
+    new_value->size = sizeof(struct ndn_cache_value) + dn_len + ndn_len;
+    /* DN is alloc for us */
+    new_value->dn = dn;
+    /* But we need to copy ndn */
+    new_value->ndn = slapi_ch_strdup(ndn);
+
     /*
-     *  Create our LRU node
+     * Get our local cache out.
      */
-    new_node = (struct ndn_cache_lru *)slapi_ch_malloc(sizeof(struct ndn_cache_lru));
-    if(new_node == NULL){
-        slapi_log_err(SLAPI_LOG_ERR, "ndn_cache_add", "Failed to allocate new lru node.\n");
-        return;
+    struct ndn_cache *t_cache = pthread_getspecific(ndn_cache_key);
+    if (t_cache == NULL) {
+        t_cache = ndn_thread_cache_create(t_cache_stats.thread_max_size, t_cache_stats.slots);
+        pthread_setspecific(ndn_cache_key, t_cache);
     }
-    new_node->prev = NULL;
-    new_node->key = dn; /* dn has already been allocated */
     /*
-     *  Its possible this dn was added to the hash by another thread.
+     * Hash the DN
      */
-    slapi_rwlock_wrlock(ndn_cache_lock);
-    ht_entry = (struct ndn_hash_val *)PL_HashTableLookupConst(ndn_cache_hashtable, dn);
-    if(ht_entry){
-        /* already exists, free the node and return */
-        slapi_rwlock_unlock(ndn_cache_lock);
-        slapi_ch_free_string(&new_node->key);
-        slapi_ch_free((void **)&new_node);
-        return;
-    }
+    uint64_t dn_hash = sds_siphash13(new_value->dn, dn_len, t_cache->key);
     /*
-     *  Create the hash entry
+     * Get the insert slot: This works because the number spaces of dn_hash is
+     * a 64bit int, and slots is a power of two. As a result, we end up with
+     * even distribution of the values.
      */
-    ht_entry = (struct ndn_hash_val *)slapi_ch_malloc(sizeof(struct ndn_hash_val));
-    if(ht_entry == NULL){
-        slapi_rwlock_unlock(ndn_cache_lock);
-        slapi_log_err(SLAPI_LOG_ERR, "ndn_cache_add", "Failed to allocate new hash entry.\n");
-        slapi_ch_free_string(&new_node->key);
-        slapi_ch_free((void **)&new_node);
-        return;
-    }
-    ht_entry->ndn = slapi_ch_malloc(ndn_len + 1);
-    memcpy(ht_entry->ndn, ndn, ndn_len);
-    ht_entry->ndn[ndn_len] = '\0';
-    ht_entry->len = ndn_len;
-    ht_entry->size = size;
-    ht_entry->lru_node = new_node;
+    size_t insert_slot = dn_hash % t_cache->slots;
+    /* Track this for free */
+    new_value->slot = insert_slot;
+
     /*
-     *  Check if our cache is full
+     * Okay, check if we have space, else we need to trim nodes from
+     * the LRU
      */
-    PR_Lock(lru_lock); /* grab the lru lock now, as ndn_cache_flush needs it */
-    if(ndn_cache->cache_max_size != 0 && ((ndn_cache->cache_size + size) > ndn_cache->cache_max_size)){
-        ndn_cache_flush();
+    while (t_cache->head && (t_cache->size + new_value->size) > t_cache->max_size) {
+        struct ndn_cache_value *trim_node = t_cache->head;
+        ndn_thread_cache_value_destroy(t_cache, trim_node);
     }
+
     /*
-     * Set the ndn cache lru nodes
+     * Add it!
      */
-    if(ndn_cache->head == NULL && ndn_cache->tail == NULL){
-        /* this is the first node */
-        ndn_cache->head = new_node;
-        ndn_cache->tail = new_node;
-        new_node->next = NULL;
+    if (t_cache->table[insert_slot] == NULL) {
+        t_cache->table[insert_slot] = new_value;
     } else {
-        new_node->next = ndn_cache->head;
-        if(ndn_cache->head)
-            ndn_cache->head->prev = new_node;
+        /*
+         * Hash collision! We need to replace the bucket then ....
+         * insert at the head of the slot to make this simpler.
+         */
+        new_value->child = t_cache->table[insert_slot];
+        t_cache->table[insert_slot] = new_value;
     }
-    ndn_cache->head = new_node;
-    PR_Unlock(lru_lock);
+
     /*
-     *  Add the new object to the hashtable, and update our stats
+     * Finally, stick this onto the tail because it's the newest.
      */
-    he = PL_HashTableAdd(ndn_cache_hashtable, new_node->key, (void *)ht_entry);
-    if(he == NULL){
-        slapi_log_err(SLAPI_LOG_ERR, "ndn_cache_add", "Failed to add new entry to hash(%s)\n",dn);
-    } else {
-        ndn_cache->cache_count++;
-        ndn_cache->cache_size += size;
+    if (t_cache->head == NULL) {
+        t_cache->head = new_value;
     }
-    slapi_rwlock_unlock(ndn_cache_lock);
-}
-
-/*
- *  cache is full, remove the least used dn's.  lru_lock/ndn_cache write lock are already taken
- */
-static void
-ndn_cache_flush(void)
-{
-    struct ndn_cache_lru *node, *next, *flush_node;
-    int i;
-
-    node = ndn_cache->tail;
-    for(i = 0; node && i < NDN_FLUSH_COUNT && ndn_cache->cache_count > NDN_MIN_COUNT; i++){
-        flush_node = node;
-        /* update the lru */
-        next = node->prev;
-        next->next = NULL;
-        ndn_cache->tail = next;
-        node = next;
-        /* now update the hash */
-        ndn_cache->cache_count--;
-        ndn_cache_delete(flush_node->key);
-        slapi_ch_free_string(&flush_node->key);
-        slapi_ch_free((void **)&flush_node);
+    if (t_cache->tail != NULL) {
+        new_value->next = t_cache->tail;
+        t_cache->tail->prev = new_value;
     }
+    t_cache->tail = new_value;
 
-    slapi_log_err(SLAPI_LOG_CACHE, "ndn_cache_flush","Flushed cache.\n");
-}
-
-static void
-ndn_cache_free(void)
-{
-    struct ndn_cache_lru *node, *next, *flush_node;
-
-    if(!ndn_cache){
-        return;
-    }
-
-    node = ndn_cache->tail;
-    while(node && ndn_cache->cache_count){
-        flush_node = node;
-        /* update the lru */
-        next = node->prev;
-        if(next){
-            next->next = NULL;
-        }
-        ndn_cache->tail = next;
-        node = next;
-        /* now update the hash */
-        ndn_cache->cache_count--;
-        ndn_cache_delete(flush_node->key);
-        slapi_ch_free_string(&flush_node->key);
-        slapi_ch_free((void **)&flush_node);
-    }
-}
-
-/* this is already "write" locked from ndn_cache_add */
-static void
-ndn_cache_delete(char *dn)
-{
-    struct ndn_hash_val *ht_entry;
+    /*
+     * And update the stats.
+     */
+    t_cache->size = t_cache->size + new_value->size;
+    t_cache->count++;
 
-    ht_entry = (struct ndn_hash_val *)PL_HashTableLookupConst(ndn_cache_hashtable, dn);
-    if(ht_entry){
-        ndn_cache->cache_size -= ht_entry->size;
-        slapi_ch_free_string(&ht_entry->ndn);
-        slapi_ch_free((void **)&ht_entry);
-        PL_HashTableRemove(ndn_cache_hashtable, dn);
-    }
 }
 
 /* stats for monitor */
 void
-ndn_cache_get_stats(PRUint64 *hits, PRUint64 *tries, size_t *size, size_t *max_size, long *count)
-{
-    slapi_rwlock_rdlock(ndn_cache_lock);
-    *hits = slapi_counter_get_value(ndn_cache->cache_hits);
-    *tries = slapi_counter_get_value(ndn_cache->cache_tries);
-    *size = ndn_cache->cache_size;
-    *max_size = ndn_cache->cache_max_size;
-    *count = ndn_cache->cache_count;
-    slapi_rwlock_unlock(ndn_cache_lock);
+ndn_cache_get_stats(PRUint64 *hits, PRUint64 *tries, size_t *size, size_t *max_size, size_t *thread_size, size_t *evicts, size_t *slots, long *count)
+{
+    *max_size = t_cache_stats.max_size;
+    *thread_size = t_cache_stats.thread_max_size;
+    *slots = t_cache_stats.slots;
+    *evicts = slapi_counter_get_value(t_cache_stats.cache_evicts);
+    *hits = slapi_counter_get_value(t_cache_stats.cache_hits);
+    *tries = slapi_counter_get_value(t_cache_stats.cache_tries);
+    *size = slapi_counter_get_value(t_cache_stats.cache_size);
+    *count = slapi_counter_get_value(t_cache_stats.cache_count);
 }
 
 /* Common ancestor sdn is allocated.
diff --git a/ldap/servers/slapd/slapi-private.h b/ldap/servers/slapd/slapi-private.h
index 3910dbe..68b59f3 100644
--- a/ldap/servers/slapd/slapi-private.h
+++ b/ldap/servers/slapd/slapi-private.h
@@ -380,7 +380,7 @@ char *slapi_dn_normalize_case_original( char *dn );
 void ndn_cache_init(void);
 void ndn_cache_destroy(void);
 int ndn_cache_started(void);
-void ndn_cache_get_stats(PRUint64 *hits, PRUint64 *tries, size_t *size, size_t *max_size, long *count);
+void ndn_cache_get_stats(PRUint64 *hits, PRUint64 *tries, size_t *size, size_t *max_size, size_t *thread_size, size_t *evicts, size_t *slots, long *count);
 #define NDN_DEFAULT_SIZE 20971520 /* 20mb - size of normalized dn cache */
 
 /* filter.c */
-- 
2.9.4