Blame SOURCES/0020-Ticket-49401-improve-valueset-sorted-performance-on-.patch

058656
From a59b2f4129565dbfa1b63899dd550e9c22b02923 Mon Sep 17 00:00:00 2001
b69e47
From: Mohammad Nweider <nweiderm@amazon.com>
b69e47
Date: Wed, 18 Oct 2017 13:02:15 +0000
b69e47
Subject: [PATCH] Ticket 49401 - improve valueset sorted performance on delete
b69e47
b69e47
Bug Description:  valueset sorted maintains a list of syntax sorted
b69e47
references to the attributes of the entry. During addition these are
b69e47
modified and added properly, so they stay sorted.
b69e47
b69e47
However, in the past to maintain the sorted property, during a delete
b69e47
we would need to remove the vs->sorted array, and recreate it via qsort,
b69e47
b69e47
While this was an improvement from past (where we would removed
b69e47
vs->sorted during an attr delete), it still has performance implications
b69e47
on very very large datasets, IE 50,000 member groups with
b69e47
addition/deletion, large entry caches and replication.
b69e47
b69e47
Fix Description:  Implement a new algorithm that is able to maintain
b69e47
existing sort data in a near linear time.
b69e47
b69e47
https://pagure.io/389-ds-base/issue/49401
b69e47
b69e47
Author: nweiderm, wibrown
b69e47
b69e47
Review by: wibrown, lkrispen, tbordaz (Thanks nweiderm!)
b69e47
b69e47
(cherry picked from commit a43a8efc7907116146b505ac40f18fac71f474b0)
b69e47
---
b69e47
 ldap/servers/slapd/valueset.c | 171 +++++++++++++++++++++++++-----------------
058656
 1 file changed, 103 insertions(+), 68 deletions(-)
b69e47
b69e47
diff --git a/ldap/servers/slapd/valueset.c b/ldap/servers/slapd/valueset.c
058656
index d2c67d2fb..1c1bc150a 100644
b69e47
--- a/ldap/servers/slapd/valueset.c
b69e47
+++ b/ldap/servers/slapd/valueset.c
058656
@@ -677,100 +677,136 @@ valueset_array_purge(const Slapi_Attr *a, Slapi_ValueSet *vs, const CSN *csn)
b69e47
     size_t i = 0;
b69e47
     size_t j = 0;
b69e47
     int nextValue = 0;
b69e47
+    int nv = 0;
b69e47
     int numValues = 0;
b69e47
+    Slapi_Value **va2 = NULL;
b69e47
+    int *sorted2 = NULL;
b69e47
 
b69e47
     /* Loop over all the values freeing the old ones. */
058656
-    for (i = 0; i < vs->num; i++) {
058656
+    for(i = 0; i < vs->num; i++)
058656
+    {
058656
         /* If we have the sorted array, find the va array ref by it. */
058656
         if (vs->sorted) {
058656
             j = vs->sorted[i];
b69e47
         } else {
b69e47
             j = i;
b69e47
         }
058656
-        csnset_purge(&(vs->va[j]->v_csnset), csn);
b69e47
-        if (vs->va[j]->v_csnset == NULL) {
b69e47
-            slapi_value_free(&vs->va[j]);
b69e47
-            vs->va[j] = NULL;
b69e47
-        } else if (vs->va[j] != NULL) {
b69e47
-            /* This value survived, we should count it. */
b69e47
-            numValues++;
b69e47
+        if (vs->va[j]) {
b69e47
+            csnset_purge(&(vs->va[j]->v_csnset),csn);
b69e47
+            if (vs->va[j]->v_csnset == NULL) {
b69e47
+                slapi_value_free(&vs->va[j]);
b69e47
+                /* Set the removed value to NULL so we know later to skip it */
b69e47
+                vs->va[j] = NULL;
b69e47
+                if (vs->sorted) {
b69e47
+                    /* Mark the value in sorted for removal */
b69e47
+                    vs->sorted[i] = -1;
b69e47
+                }
b69e47
+            } else {
b69e47
+                /* This value survived, we should count it. */
b69e47
+                numValues++;
b69e47
+            }
b69e47
         }
b69e47
     }
b69e47
 
b69e47
-    /* Now compact the value/sorted list. */
b69e47
+    /* Compact vs->va and vs->sorted only when there're
b69e47
+     * remaining values ie: numValues is greater than 0 */
b69e47
     /*
b69e47
-     * Because we want to preserve the sorted array, this is complicated.
b69e47
+     * Algorithm explination: We start with a pair of arrays - the attrs, and the sorted array that provides
b69e47
+     * a lookup into it:
b69e47
+     *
b69e47
+     * va: [d e a c b] sorted: [2 4 3 0 1]
b69e47
+     *
b69e47
+     * When we remove the element b, we NULL it, and we have to mark the place where it "was" as a -1 to
b69e47
+     * flag it's removal.
b69e47
+     *
b69e47
+     * va: [d e a c NULL] sorted: [2 -1 3 0 1]
b69e47
+     *
b69e47
+     * Now a second va is created with the reduced allocation,
b69e47
+     *
b69e47
+     * va2: [ X X X X ] ....
058656
+     *
058656
+     * Now we loop over sorted, skipping -1 that we find. In a new counter we create new sorted
058656
+     * references, and move the values compacting them in the process.
058656
+     * va: [d e a c NULL]
058656
+     * va2: [a x x x]
058656
+     * sorted: [_0 -1 3 0 1]
058656
+     *
058656
+     * Looping a few more times would yield:
b69e47
      *
b69e47
-     *  We have an array of values:
b69e47
-     *  [ b, a, c, NULL, e, NULL, NULL, d]
b69e47
-     *  And an array of indicies that are sorted.
b69e47
-     *  [ 1, 0, 2, 7, 4, 3, 5, 6 ]
b69e47
-     *  Were we to iterate over the sorted array, we get refs to the values in
b69e47
-     * some order.
b69e47
-     *  The issue is now we must *remove* from both the values *and* the sorted.
058656
+     * va2: [a c x x]
058656
+     * sorted: [_0 _1 3 0 1]
b69e47
      *
b69e47
-     * Previously, we just discarded this, because too hard. Now we try to keep
b69e47
-     * it. The issue is that this is surprisingly hard to actually keep in
b69e47
-     * sync.
058656
+     * va2: [a c d x]
058656
+     * sorted: [_0 _1 _2 0 1]
b69e47
      *
b69e47
-     * We can't just blindly move the values down: That breaks the sorted array
b69e47
-     * and we would need to iterate over the sorted array multiple times to
b69e47
-     * achieve this.
b69e47
+     * va2: [a c d e]
b69e47
+     * sorted: [_0 _1 _2 _3 1]
b69e47
+     *
b69e47
+     * Not only does this sort va, but with sorted, we have a faster lookup, and it will benefit cache
b69e47
+     * lookup.
b69e47
      *
b69e47
-     * It's actually going to be easier to just ditch the sorted, compact vs
b69e47
-     * and then qsort the array.
b69e47
      */
b69e47
+    if (numValues > 0) {
b69e47
+        if(vs->sorted) {
b69e47
+            /* Let's allocate va2 and sorted2 */
b69e47
+            va2 = (Slapi_Value **) slapi_ch_malloc( (numValues + 1) * sizeof(Slapi_Value *));
b69e47
+            sorted2 = (int *) slapi_ch_malloc( (numValues + 1)* sizeof(int));
b69e47
+        }
b69e47
 
b69e47
-    j = 0;
058656
-    while (nextValue < numValues && j < vs->num) {
b69e47
-        /* nextValue is what we are looking at now
b69e47
-         * j tracks along the array getting next elements.
b69e47
-         *
b69e47
-         *  [ b, a, c, NULL, e, NULL, NULL, d]
b69e47
-         *             ^nv   ^j
b69e47
-         *  [ b, a, c, e, NULL, NULL, NULL, d]
b69e47
-         *             ^nv ^j
b69e47
-         *  [ b, a, c, e, NULL, NULL, NULL, d]
b69e47
-         *                ^nv    ^j
b69e47
-         *  [ b, a, c, e, NULL, NULL, NULL, d]
b69e47
-         *                ^nv         ^j
b69e47
-         *  [ b, a, c, e, NULL, NULL, NULL, d]
b69e47
-         *                ^nv               ^j
b69e47
-         *  [ b, a, c, e, d, NULL, NULL, NULL]
b69e47
-         *                ^nv              ^j
b69e47
-         */
b69e47
-        if (vs->va[nextValue] == NULL) {
b69e47
-            /* Advance j till we find something */
b69e47
-            while (vs->va[j] == NULL) {
b69e47
-                j++;
b69e47
+        /* I is the index for the *new* va2 array */
b69e47
+        for(i=0; i<vs->num; i++) {
b69e47
+            if (vs->sorted) {
b69e47
+                /* Skip any removed values from the index */
b69e47
+                while((nv < vs->num) && (-1 == vs->sorted[nv])) {
b69e47
+                    nv++;
b69e47
+                }
b69e47
+                /* We have a remaining value, add it to the va */
b69e47
+                if(nv < vs->num) {
b69e47
+                    va2[i] = vs->va[vs->sorted[nv]];
b69e47
+                    sorted2[i] = i;
b69e47
+                    nv++;
b69e47
+                }
b69e47
+            } else {
b69e47
+                while((nextValue < vs->num) && (NULL == vs->va[nextValue])) {
b69e47
+                    nextValue++;
b69e47
+                }
b69e47
+
b69e47
+                if(nextValue < vs->num) {
b69e47
+                    vs->va[i] = vs->va[nextValue];
b69e47
+                    nextValue++;
b69e47
+                } else {
b69e47
+                    break;
b69e47
+                }
b69e47
             }
b69e47
-            /* We have something! */
b69e47
-            vs->va[nextValue] = vs->va[j];
058656
-            vs->va[j] = NULL;
058656
         }
058656
-        nextValue++;
058656
-    }
058656
-    /* Fix up the number of values */
058656
-    vs->num = numValues;
058656
-    /* Should we re-alloc values to be smaller? */
058656
-    /* Other parts of DS are lazy. Lets clean our list */
058656
-    for (j = vs->num; j < vs->max; j++) {
058656
-        vs->va[j] = NULL;
058656
-    }
058656
 
058656
-    /* All the values were deleted, we can discard the whole array. */
058656
-    if (vs->num == 0) {
058656
         if (vs->sorted) {
b69e47
+            /* Finally replace the valuearray and adjust num, max */
b69e47
+            slapi_ch_free((void **)&vs->va);
058656
             slapi_ch_free((void **)&vs->sorted);
b69e47
+            vs->va = va2;
b69e47
+            vs->sorted = sorted2;
b69e47
+            vs->num = numValues;
b69e47
+            vs->max = vs->num + 1;
b69e47
+        } else {
b69e47
+            vs->num = numValues;
058656
         }
058656
-        slapi_ch_free((void **)&vs->va);
058656
-        vs->va = NULL;
058656
-        vs->max = 0;
058656
-    } else if (vs->sorted != NULL) {
058656
-        /* We still have values! rebuild the sorted array */
058656
-        valueset_array_to_sorted(a, vs);
b69e47
+
b69e47
+        for (j = vs->num; j < vs->max; j++) {
058656
+            vs->va[j] = NULL;
b69e47
+            if (vs->sorted) {
b69e47
+                vs->sorted[j] = -1;
b69e47
+            }
058656
+        }
b69e47
+    } else {
b69e47
+        slapi_valueset_done(vs);
b69e47
     }
b69e47
 
b69e47
+    /* We still have values but not sorted array! rebuild it */
b69e47
+    if(vs->num > VALUESET_ARRAY_SORT_THRESHOLD && vs->sorted == NULL) {
b69e47
+        vs->sorted = (int *) slapi_ch_malloc( vs->max* sizeof(int));
058656
+        valueset_array_to_sorted(a, vs);
058656
+    }
b69e47
 #ifdef DEBUG
b69e47
     PR_ASSERT(vs->num == 0 || (vs->num > 0 && vs->va[0] != NULL));
b69e47
     size_t index = 0;
058656
@@ -781,7 +817,6 @@ valueset_array_purge(const Slapi_Attr *a, Slapi_ValueSet *vs, const CSN *csn)
b69e47
         PR_ASSERT(vs->va[index] == NULL);
b69e47
     }
b69e47
 #endif
b69e47
-
b69e47
     /* return the number of remaining values */
b69e47
     return numValues;
b69e47
 }
b69e47
-- 
b69e47
2.13.6
b69e47