Blame SOURCES/0020-Ticket-49401-improve-valueset-sorted-performance-on-.patch

96373c
From a59b2f4129565dbfa1b63899dd550e9c22b02923 Mon Sep 17 00:00:00 2001
40cd75
From: Mohammad Nweider <nweiderm@amazon.com>
40cd75
Date: Wed, 18 Oct 2017 13:02:15 +0000
40cd75
Subject: [PATCH] Ticket 49401 - improve valueset sorted performance on delete
40cd75
40cd75
Bug Description:  valueset sorted maintains a list of syntax sorted
40cd75
references to the attributes of the entry. During addition these are
40cd75
modified and added properly, so they stay sorted.
40cd75
40cd75
However, in the past to maintain the sorted property, during a delete
40cd75
we would need to remove the vs->sorted array, and recreate it via qsort,
40cd75
40cd75
While this was an improvement from past (where we would removed
40cd75
vs->sorted during an attr delete), it still has performance implications
40cd75
on very very large datasets, IE 50,000 member groups with
40cd75
addition/deletion, large entry caches and replication.
40cd75
40cd75
Fix Description:  Implement a new algorithm that is able to maintain
40cd75
existing sort data in a near linear time.
40cd75
40cd75
https://pagure.io/389-ds-base/issue/49401
40cd75
40cd75
Author: nweiderm, wibrown
40cd75
40cd75
Review by: wibrown, lkrispen, tbordaz (Thanks nweiderm!)
40cd75
40cd75
(cherry picked from commit a43a8efc7907116146b505ac40f18fac71f474b0)
40cd75
---
40cd75
 ldap/servers/slapd/valueset.c | 171 +++++++++++++++++++++++++-----------------
96373c
 1 file changed, 103 insertions(+), 68 deletions(-)
40cd75
40cd75
diff --git a/ldap/servers/slapd/valueset.c b/ldap/servers/slapd/valueset.c
96373c
index d2c67d2fb..1c1bc150a 100644
40cd75
--- a/ldap/servers/slapd/valueset.c
40cd75
+++ b/ldap/servers/slapd/valueset.c
96373c
@@ -677,100 +677,136 @@ valueset_array_purge(const Slapi_Attr *a, Slapi_ValueSet *vs, const CSN *csn)
40cd75
     size_t i = 0;
40cd75
     size_t j = 0;
40cd75
     int nextValue = 0;
40cd75
+    int nv = 0;
40cd75
     int numValues = 0;
40cd75
+    Slapi_Value **va2 = NULL;
40cd75
+    int *sorted2 = NULL;
40cd75
 
40cd75
     /* Loop over all the values freeing the old ones. */
96373c
-    for (i = 0; i < vs->num; i++) {
96373c
+    for(i = 0; i < vs->num; i++)
96373c
+    {
96373c
         /* If we have the sorted array, find the va array ref by it. */
96373c
         if (vs->sorted) {
96373c
             j = vs->sorted[i];
40cd75
         } else {
40cd75
             j = i;
40cd75
         }
96373c
-        csnset_purge(&(vs->va[j]->v_csnset), csn);
40cd75
-        if (vs->va[j]->v_csnset == NULL) {
40cd75
-            slapi_value_free(&vs->va[j]);
40cd75
-            vs->va[j] = NULL;
40cd75
-        } else if (vs->va[j] != NULL) {
40cd75
-            /* This value survived, we should count it. */
40cd75
-            numValues++;
40cd75
+        if (vs->va[j]) {
40cd75
+            csnset_purge(&(vs->va[j]->v_csnset),csn);
40cd75
+            if (vs->va[j]->v_csnset == NULL) {
40cd75
+                slapi_value_free(&vs->va[j]);
40cd75
+                /* Set the removed value to NULL so we know later to skip it */
40cd75
+                vs->va[j] = NULL;
40cd75
+                if (vs->sorted) {
40cd75
+                    /* Mark the value in sorted for removal */
40cd75
+                    vs->sorted[i] = -1;
40cd75
+                }
40cd75
+            } else {
40cd75
+                /* This value survived, we should count it. */
40cd75
+                numValues++;
40cd75
+            }
40cd75
         }
40cd75
     }
40cd75
 
40cd75
-    /* Now compact the value/sorted list. */
40cd75
+    /* Compact vs->va and vs->sorted only when there're
40cd75
+     * remaining values ie: numValues is greater than 0 */
40cd75
     /*
40cd75
-     * Because we want to preserve the sorted array, this is complicated.
40cd75
+     * Algorithm explination: We start with a pair of arrays - the attrs, and the sorted array that provides
40cd75
+     * a lookup into it:
40cd75
+     *
40cd75
+     * va: [d e a c b] sorted: [2 4 3 0 1]
40cd75
+     *
40cd75
+     * When we remove the element b, we NULL it, and we have to mark the place where it "was" as a -1 to
40cd75
+     * flag it's removal.
40cd75
+     *
40cd75
+     * va: [d e a c NULL] sorted: [2 -1 3 0 1]
40cd75
+     *
40cd75
+     * Now a second va is created with the reduced allocation,
40cd75
+     *
40cd75
+     * va2: [ X X X X ] ....
96373c
+     *
96373c
+     * Now we loop over sorted, skipping -1 that we find. In a new counter we create new sorted
96373c
+     * references, and move the values compacting them in the process.
96373c
+     * va: [d e a c NULL]
96373c
+     * va2: [a x x x]
96373c
+     * sorted: [_0 -1 3 0 1]
96373c
+     *
96373c
+     * Looping a few more times would yield:
40cd75
      *
40cd75
-     *  We have an array of values:
40cd75
-     *  [ b, a, c, NULL, e, NULL, NULL, d]
40cd75
-     *  And an array of indicies that are sorted.
40cd75
-     *  [ 1, 0, 2, 7, 4, 3, 5, 6 ]
40cd75
-     *  Were we to iterate over the sorted array, we get refs to the values in
40cd75
-     * some order.
40cd75
-     *  The issue is now we must *remove* from both the values *and* the sorted.
96373c
+     * va2: [a c x x]
96373c
+     * sorted: [_0 _1 3 0 1]
40cd75
      *
40cd75
-     * Previously, we just discarded this, because too hard. Now we try to keep
40cd75
-     * it. The issue is that this is surprisingly hard to actually keep in
40cd75
-     * sync.
96373c
+     * va2: [a c d x]
96373c
+     * sorted: [_0 _1 _2 0 1]
40cd75
      *
40cd75
-     * We can't just blindly move the values down: That breaks the sorted array
40cd75
-     * and we would need to iterate over the sorted array multiple times to
40cd75
-     * achieve this.
40cd75
+     * va2: [a c d e]
40cd75
+     * sorted: [_0 _1 _2 _3 1]
40cd75
+     *
40cd75
+     * Not only does this sort va, but with sorted, we have a faster lookup, and it will benefit cache
40cd75
+     * lookup.
40cd75
      *
40cd75
-     * It's actually going to be easier to just ditch the sorted, compact vs
40cd75
-     * and then qsort the array.
40cd75
      */
40cd75
+    if (numValues > 0) {
40cd75
+        if(vs->sorted) {
40cd75
+            /* Let's allocate va2 and sorted2 */
40cd75
+            va2 = (Slapi_Value **) slapi_ch_malloc( (numValues + 1) * sizeof(Slapi_Value *));
40cd75
+            sorted2 = (int *) slapi_ch_malloc( (numValues + 1)* sizeof(int));
40cd75
+        }
40cd75
 
40cd75
-    j = 0;
96373c
-    while (nextValue < numValues && j < vs->num) {
40cd75
-        /* nextValue is what we are looking at now
40cd75
-         * j tracks along the array getting next elements.
40cd75
-         *
40cd75
-         *  [ b, a, c, NULL, e, NULL, NULL, d]
40cd75
-         *             ^nv   ^j
40cd75
-         *  [ b, a, c, e, NULL, NULL, NULL, d]
40cd75
-         *             ^nv ^j
40cd75
-         *  [ b, a, c, e, NULL, NULL, NULL, d]
40cd75
-         *                ^nv    ^j
40cd75
-         *  [ b, a, c, e, NULL, NULL, NULL, d]
40cd75
-         *                ^nv         ^j
40cd75
-         *  [ b, a, c, e, NULL, NULL, NULL, d]
40cd75
-         *                ^nv               ^j
40cd75
-         *  [ b, a, c, e, d, NULL, NULL, NULL]
40cd75
-         *                ^nv              ^j
40cd75
-         */
40cd75
-        if (vs->va[nextValue] == NULL) {
40cd75
-            /* Advance j till we find something */
40cd75
-            while (vs->va[j] == NULL) {
40cd75
-                j++;
40cd75
+        /* I is the index for the *new* va2 array */
40cd75
+        for(i=0; i<vs->num; i++) {
40cd75
+            if (vs->sorted) {
40cd75
+                /* Skip any removed values from the index */
40cd75
+                while((nv < vs->num) && (-1 == vs->sorted[nv])) {
40cd75
+                    nv++;
40cd75
+                }
40cd75
+                /* We have a remaining value, add it to the va */
40cd75
+                if(nv < vs->num) {
40cd75
+                    va2[i] = vs->va[vs->sorted[nv]];
40cd75
+                    sorted2[i] = i;
40cd75
+                    nv++;
40cd75
+                }
40cd75
+            } else {
40cd75
+                while((nextValue < vs->num) && (NULL == vs->va[nextValue])) {
40cd75
+                    nextValue++;
40cd75
+                }
40cd75
+
40cd75
+                if(nextValue < vs->num) {
40cd75
+                    vs->va[i] = vs->va[nextValue];
40cd75
+                    nextValue++;
40cd75
+                } else {
40cd75
+                    break;
40cd75
+                }
40cd75
             }
40cd75
-            /* We have something! */
40cd75
-            vs->va[nextValue] = vs->va[j];
96373c
-            vs->va[j] = NULL;
96373c
         }
96373c
-        nextValue++;
96373c
-    }
96373c
-    /* Fix up the number of values */
96373c
-    vs->num = numValues;
96373c
-    /* Should we re-alloc values to be smaller? */
96373c
-    /* Other parts of DS are lazy. Lets clean our list */
96373c
-    for (j = vs->num; j < vs->max; j++) {
96373c
-        vs->va[j] = NULL;
96373c
-    }
96373c
 
96373c
-    /* All the values were deleted, we can discard the whole array. */
96373c
-    if (vs->num == 0) {
96373c
         if (vs->sorted) {
40cd75
+            /* Finally replace the valuearray and adjust num, max */
40cd75
+            slapi_ch_free((void **)&vs->va);
96373c
             slapi_ch_free((void **)&vs->sorted);
40cd75
+            vs->va = va2;
40cd75
+            vs->sorted = sorted2;
40cd75
+            vs->num = numValues;
40cd75
+            vs->max = vs->num + 1;
40cd75
+        } else {
40cd75
+            vs->num = numValues;
96373c
         }
96373c
-        slapi_ch_free((void **)&vs->va);
96373c
-        vs->va = NULL;
96373c
-        vs->max = 0;
96373c
-    } else if (vs->sorted != NULL) {
96373c
-        /* We still have values! rebuild the sorted array */
96373c
-        valueset_array_to_sorted(a, vs);
40cd75
+
40cd75
+        for (j = vs->num; j < vs->max; j++) {
96373c
+            vs->va[j] = NULL;
40cd75
+            if (vs->sorted) {
40cd75
+                vs->sorted[j] = -1;
40cd75
+            }
96373c
+        }
40cd75
+    } else {
40cd75
+        slapi_valueset_done(vs);
40cd75
     }
40cd75
 
40cd75
+    /* We still have values but not sorted array! rebuild it */
40cd75
+    if(vs->num > VALUESET_ARRAY_SORT_THRESHOLD && vs->sorted == NULL) {
40cd75
+        vs->sorted = (int *) slapi_ch_malloc( vs->max* sizeof(int));
96373c
+        valueset_array_to_sorted(a, vs);
96373c
+    }
40cd75
 #ifdef DEBUG
40cd75
     PR_ASSERT(vs->num == 0 || (vs->num > 0 && vs->va[0] != NULL));
40cd75
     size_t index = 0;
96373c
@@ -781,7 +817,6 @@ valueset_array_purge(const Slapi_Attr *a, Slapi_ValueSet *vs, const CSN *csn)
40cd75
         PR_ASSERT(vs->va[index] == NULL);
40cd75
     }
40cd75
 #endif
40cd75
-
40cd75
     /* return the number of remaining values */
40cd75
     return numValues;
40cd75
 }
40cd75
-- 
40cd75
2.13.6
40cd75