Blame SOURCES/0040-Issue-50538-cleanAllRUV-task-limit-is-not-enforced-f.patch

e79480
From 94703d5171853b69bb8ef9574f32bc9f0c051632 Mon Sep 17 00:00:00 2001
e79480
From: Mark Reynolds <mreynolds@redhat.com>
e79480
Date: Wed, 7 Aug 2019 20:36:53 -0400
e79480
Subject: [PATCH] Issue 50538 - cleanAllRUV task limit is not enforced for
e79480
 replicated tasks
e79480
e79480
Bug Description:
e79480
e79480
There is a hard limit of 64 concurrent cleanAllRUV tasks, but this limit is
e79480
only enforced when creating "new" tasks. It was not enforced when a task was
e79480
received via an extended operation. There were also race conditions in the
e79480
existing logic that allowed the array of cleaned rids to get corrupted. This
e79480
allowed for a very large number of task threads to be created.
e79480
e79480
Fix Description:
e79480
e79480
Maintain a new counter to keep track of the number of clean and abort threads
e79480
to make sure it never overruns the rid array buffers.
e79480
e79480
relates: https://pagure.io/389-ds-base/issue/50538
e79480
e79480
Reviewed by: lkrispenz (Thanks!)
e79480
---
e79480
 .../suites/replication/cleanallruv_test.py    |  47 +++-
e79480
 ldap/servers/plugins/replication/repl5.h      |   7 +-
e79480
 .../replication/repl5_replica_config.c        | 247 ++++++++++--------
e79480
 ldap/servers/plugins/replication/repl_extop.c |  19 +-
e79480
 4 files changed, 202 insertions(+), 118 deletions(-)
e79480
e79480
diff --git a/dirsrvtests/tests/suites/replication/cleanallruv_test.py b/dirsrvtests/tests/suites/replication/cleanallruv_test.py
e79480
index 620a53e1a..43801dd52 100644
e79480
--- a/dirsrvtests/tests/suites/replication/cleanallruv_test.py
e79480
+++ b/dirsrvtests/tests/suites/replication/cleanallruv_test.py
e79480
@@ -1,5 +1,5 @@
e79480
 # --- BEGIN COPYRIGHT BLOCK ---
e79480
-# Copyright (C) 2016 Red Hat, Inc.
e79480
+# Copyright (C) 2019 Red Hat, Inc.
e79480
 # All rights reserved.
e79480
 #
e79480
 # License: GPL (version 3 or any later version).
e79480
@@ -7,7 +7,6 @@
e79480
 # --- END COPYRIGHT BLOCK ---
e79480
 #
e79480
 import threading
e79480
-
e79480
 import pytest
e79480
 from lib389.tasks import *
e79480
 from lib389.utils import *
e79480
@@ -859,6 +858,50 @@ def test_multiple_tasks_with_force(topology_m4):
e79480
     restore_master4(topology_m4)
e79480
 
e79480
 
e79480
+def test_max_tasks(topology_m4):
e79480
+    """Test we can not create more than 64 cleaning tasks
e79480
+
e79480
+    :id: c34d0b40-3c3e-4f53-8656-5e4c2a310a1f
e79480
+    :setup: Replication setup with four masters
e79480
+    :steps:
e79480
+        1. Stop masters 3 & 4
e79480
+        2. Create over 64 tasks between m1 and m2
e79480
+        3. Check logs to see if (>65) tasks were rejected
e79480
+
e79480
+    :expectedresults:
e79480
+        1. Success
e79480
+        2. Success
e79480
+        3. Success
e79480
+    """
e79480
+
e79480
+    # Stop masters 3 & 4
e79480
+    m1 = topology_m4.ms["master1"]
e79480
+    m2 = topology_m4.ms["master2"]
e79480
+    m3 = topology_m4.ms["master3"]
e79480
+    m4 = topology_m4.ms["master4"]
e79480
+    m3.stop()
e79480
+    m4.stop()
e79480
+
e79480
+    # Add over 64 tasks between master1 & 2 to try to exceed the 64 task limit
e79480
+    for i in range(1, 64):
e79480
+        cruv_task = CleanAllRUVTask(m1)
e79480
+        cruv_task.create(properties={
e79480
+            'replica-id': str(i),
e79480
+            'replica-base-dn': DEFAULT_SUFFIX,
e79480
+            'replica-force-cleaning': 'no',  # This forces these tasks to stick around
e79480
+        })
e79480
+        cruv_task = CleanAllRUVTask(m2)
e79480
+        cruv_task.create(properties={
e79480
+            'replica-id': "10" + str(i),
e79480
+            'replica-base-dn': DEFAULT_SUFFIX,
e79480
+            'replica-force-cleaning': 'yes',  # This allows the tasks to propagate
e79480
+        })
e79480
+
e79480
+    # Check the errors log for our error message in master 1
e79480
+    assert m1.searchErrorsLog('Exceeded maximum number of active CLEANALLRUV tasks')
e79480
+
e79480
+
e79480
+
e79480
 if __name__ == '__main__':
e79480
     # Run isolated
e79480
     # -s for DEBUG mode
e79480
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
e79480
index e08fec752..d414926c2 100644
e79480
--- a/ldap/servers/plugins/replication/repl5.h
e79480
+++ b/ldap/servers/plugins/replication/repl5.h
e79480
@@ -80,6 +80,8 @@
e79480
 #define CLEANRUV_FINISHED  "finished"
e79480
 #define CLEANRUV_CLEANING  "cleaning"
e79480
 #define CLEANRUV_NO_MAXCSN "no maxcsn"
e79480
+#define CLEANALLRUV_ID "CleanAllRUV Task"
e79480
+#define ABORT_CLEANALLRUV_ID "Abort CleanAllRUV Task"
e79480
 
e79480
 /* DS 5.0 replication protocol error codes */
e79480
 #define NSDS50_REPL_REPLICA_READY             0x00  /* Replica ready, go ahead */
e79480
@@ -784,6 +786,7 @@ void multimaster_mtnode_construct_replicas(void);
e79480
 void multimaster_be_state_change(void *handle, char *be_name, int old_be_state, int new_be_state);
e79480
 
e79480
 #define CLEANRIDSIZ 64 /* maximum number for concurrent CLEANALLRUV tasks */
e79480
+#define CLEANRID_BUFSIZ 128
e79480
 
e79480
 typedef struct _cleanruv_data
e79480
 {
e79480
@@ -815,6 +818,8 @@ int get_replica_type(Replica *r);
e79480
 int replica_execute_cleanruv_task_ext(Object *r, ReplicaId rid);
e79480
 void add_cleaned_rid(cleanruv_data *data, char *maxcsn);
e79480
 int is_cleaned_rid(ReplicaId rid);
e79480
+int32_t check_and_set_cleanruv_task_count(ReplicaId rid);
e79480
+int32_t check_and_set_abort_cleanruv_task_count(void);
e79480
 int replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter, int *returncode, char *returntext, void *arg);
e79480
 void replica_cleanallruv_thread_ext(void *arg);
e79480
 void stop_ruv_cleaning(void);
e79480
@@ -833,8 +838,6 @@ void set_cleaned_rid(ReplicaId rid);
e79480
 void cleanruv_log(Slapi_Task *task, int rid, char *task_type, int sev_level, char *fmt, ...);
e79480
 char *replica_cleanallruv_get_local_maxcsn(ReplicaId rid, char *base_dn);
e79480
 
e79480
-
e79480
-
e79480
 /* replutil.c */
e79480
 LDAPControl *create_managedsait_control(void);
e79480
 LDAPControl *create_backend_control(Slapi_DN *sdn);
e79480
diff --git a/ldap/servers/plugins/replication/repl5_replica_config.c b/ldap/servers/plugins/replication/repl5_replica_config.c
e79480
index b4aff9eb4..0ba2cd976 100644
e79480
--- a/ldap/servers/plugins/replication/repl5_replica_config.c
e79480
+++ b/ldap/servers/plugins/replication/repl5_replica_config.c
e79480
@@ -30,17 +30,18 @@
e79480
 #define CLEANALLRUV "CLEANALLRUV"
e79480
 #define CLEANALLRUVLEN 11
e79480
 #define REPLICA_RDN "cn=replica"
e79480
-#define CLEANALLRUV_ID "CleanAllRUV Task"
e79480
-#define ABORT_CLEANALLRUV_ID "Abort CleanAllRUV Task"
e79480
 
e79480
 int slapi_log_urp = SLAPI_LOG_REPL;
e79480
-static ReplicaId cleaned_rids[CLEANRIDSIZ + 1] = {0};
e79480
-static ReplicaId pre_cleaned_rids[CLEANRIDSIZ + 1] = {0};
e79480
-static ReplicaId aborted_rids[CLEANRIDSIZ + 1] = {0};
e79480
-static Slapi_RWLock *rid_lock = NULL;
e79480
-static Slapi_RWLock *abort_rid_lock = NULL;
e79480
+static ReplicaId cleaned_rids[CLEANRID_BUFSIZ] = {0};
e79480
+static ReplicaId pre_cleaned_rids[CLEANRID_BUFSIZ] = {0};
e79480
+static ReplicaId aborted_rids[CLEANRID_BUFSIZ] = {0};
e79480
+static PRLock *rid_lock = NULL;
e79480
+static PRLock *abort_rid_lock = NULL;
e79480
 static PRLock *notify_lock = NULL;
e79480
 static PRCondVar *notify_cvar = NULL;
e79480
+static PRLock *task_count_lock = NULL;
e79480
+static int32_t clean_task_count = 0;
e79480
+static int32_t abort_task_count = 0;
e79480
 
e79480
 /* Forward Declartions */
e79480
 static int replica_config_add(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *entryAfter, int *returncode, char *returntext, void *arg);
e79480
@@ -67,8 +68,6 @@ static int replica_cleanallruv_send_abort_extop(Repl_Agmt *ra, Slapi_Task *task,
e79480
 static int replica_cleanallruv_check_maxcsn(Repl_Agmt *agmt, char *basedn, char *rid_text, char *maxcsn, Slapi_Task *task);
e79480
 static int replica_cleanallruv_replica_alive(Repl_Agmt *agmt);
e79480
 static int replica_cleanallruv_check_ruv(char *repl_root, Repl_Agmt *ra, char *rid_text, Slapi_Task *task, char *force);
e79480
-static int get_cleanruv_task_count(void);
e79480
-static int get_abort_cleanruv_task_count(void);
e79480
 static int replica_cleanup_task(Object *r, const char *task_name, char *returntext, int apply_mods);
e79480
 static int replica_task_done(Replica *replica);
e79480
 static void delete_cleaned_rid_config(cleanruv_data *data);
e79480
@@ -114,20 +113,27 @@ replica_config_init()
e79480
                       PR_GetError());
e79480
         return -1;
e79480
     }
e79480
-    rid_lock = slapi_new_rwlock();
e79480
+    rid_lock = PR_NewLock();
e79480
     if (rid_lock == NULL) {
e79480
         slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "replica_config_init - "
e79480
                                                        "Failed to create rid_lock; NSPR error - %d\n",
e79480
                       PR_GetError());
e79480
         return -1;
e79480
     }
e79480
-    abort_rid_lock = slapi_new_rwlock();
e79480
+    abort_rid_lock = PR_NewLock();
e79480
     if (abort_rid_lock == NULL) {
e79480
         slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "replica_config_init - "
e79480
                                                        "Failed to create abort_rid_lock; NSPR error - %d\n",
e79480
                       PR_GetError());
e79480
         return -1;
e79480
     }
e79480
+    task_count_lock = PR_NewLock();
e79480
+    if (task_count_lock == NULL) {
e79480
+        slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "replica_config_init - "
e79480
+                                                       "Failed to create task_count_lock; NSPR error - %d\n",
e79480
+                      PR_GetError());
e79480
+        return -1;
e79480
+    }
e79480
     if ((notify_lock = PR_NewLock()) == NULL) {
e79480
         slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "replica_config_init - "
e79480
                                                        "Failed to create notify lock; NSPR error - %d\n",
e79480
@@ -1484,12 +1490,6 @@ replica_execute_cleanall_ruv_task(Object *r, ReplicaId rid, Slapi_Task *task, co
e79480
 
e79480
     cleanruv_log(pre_task, rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Initiating CleanAllRUV Task...");
e79480
 
e79480
-    if (get_cleanruv_task_count() >= CLEANRIDSIZ) {
e79480
-        /* we are already running the maximum number of tasks */
e79480
-        cleanruv_log(pre_task, rid, CLEANALLRUV_ID, SLAPI_LOG_ERR,
e79480
-                     "Exceeded maximum number of active CLEANALLRUV tasks(%d)", CLEANRIDSIZ);
e79480
-        return LDAP_UNWILLING_TO_PERFORM;
e79480
-    }
e79480
     /*
e79480
      *  Grab the replica
e79480
      */
e79480
@@ -1541,6 +1541,13 @@ replica_execute_cleanall_ruv_task(Object *r, ReplicaId rid, Slapi_Task *task, co
e79480
         goto fail;
e79480
     }
e79480
 
e79480
+    if (check_and_set_cleanruv_task_count(rid) != LDAP_SUCCESS) {
e79480
+        cleanruv_log(NULL, rid, CLEANALLRUV_ID, SLAPI_LOG_ERR,
e79480
+                     "Exceeded maximum number of active CLEANALLRUV tasks(%d)", CLEANRIDSIZ);
e79480
+        rc = LDAP_UNWILLING_TO_PERFORM;
e79480
+        goto fail;
e79480
+    }
e79480
+
e79480
     /*
e79480
      *  Launch the cleanallruv thread.  Once all the replicas are cleaned it will release the rid
e79480
      */
e79480
@@ -1548,6 +1555,9 @@ replica_execute_cleanall_ruv_task(Object *r, ReplicaId rid, Slapi_Task *task, co
e79480
     if (data == NULL) {
e79480
         cleanruv_log(pre_task, rid, CLEANALLRUV_ID, SLAPI_LOG_ERR, "Failed to allocate cleanruv_data.  Aborting task.");
e79480
         rc = -1;
e79480
+        PR_Lock(task_count_lock);
e79480
+        clean_task_count--;
e79480
+        PR_Unlock(task_count_lock);
e79480
         goto fail;
e79480
     }
e79480
     data->repl_obj = r;
e79480
@@ -1630,13 +1640,13 @@ replica_cleanallruv_thread(void *arg)
e79480
     int aborted = 0;
e79480
     int rc = 0;
e79480
 
e79480
-    if (!data || slapi_is_shutting_down()) {
e79480
-        return; /* no data */
e79480
-    }
e79480
-
e79480
     /* Increase active thread count to prevent a race condition at server shutdown */
e79480
     g_incr_active_threadcnt();
e79480
 
e79480
+    if (!data || slapi_is_shutting_down()) {
e79480
+        goto done;
e79480
+    }
e79480
+
e79480
     if (data->task) {
e79480
         slapi_task_inc_refcount(data->task);
e79480
         slapi_log_err(SLAPI_LOG_PLUGIN, repl_plugin_name,
e79480
@@ -1683,16 +1693,13 @@ replica_cleanallruv_thread(void *arg)
e79480
         slapi_task_begin(data->task, 1);
e79480
     }
e79480
     /*
e79480
-     *  Presetting the rid prevents duplicate thread creation, but allows the db and changelog to still
e79480
-     *  process updates from the rid.
e79480
-     *  set_cleaned_rid() blocks updates, so we don't want to do that... yet unless we are in force mode.
e79480
-     *  If we are forcing a clean independent of state of other servers for this RID we can set_cleaned_rid()
e79480
+     *  We have already preset this rid, but if we are forcing a clean independent of state
e79480
+     *  of other servers for this RID we can set_cleaned_rid()
e79480
      */
e79480
     if (data->force) {
e79480
         set_cleaned_rid(data->rid);
e79480
-    } else {
e79480
-        preset_cleaned_rid(data->rid);
e79480
     }
e79480
+
e79480
     rid_text = slapi_ch_smprintf("%d", data->rid);
e79480
     csn_as_string(data->maxcsn, PR_FALSE, csnstr);
e79480
     /*
e79480
@@ -1862,6 +1869,9 @@ done:
e79480
     /*
e79480
      *  If the replicas are cleaned, release the rid
e79480
      */
e79480
+    if (slapi_is_shutting_down()) {
e79480
+        stop_ruv_cleaning();
e79480
+    }
e79480
     if (!aborted && !slapi_is_shutting_down()) {
e79480
         /*
e79480
          * Success - the rid has been cleaned!
e79480
@@ -1880,10 +1890,9 @@ done:
e79480
         } else {
e79480
             cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Propagated task does not delete Keep alive entry (%d).", data->rid);
e79480
         }
e79480
-
e79480
         clean_agmts(data);
e79480
         remove_cleaned_rid(data->rid);
e79480
-        cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Successfully cleaned rid(%d).", data->rid);
e79480
+        cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Successfully cleaned rid(%d)", data->rid);
e79480
     } else {
e79480
         /*
e79480
          *  Shutdown or abort
e79480
@@ -1916,6 +1925,10 @@ done:
e79480
     slapi_ch_free_string(&data->force);
e79480
     slapi_ch_free_string(&rid_text);
e79480
     slapi_ch_free((void **)&data);
e79480
+    /* decrement task count */
e79480
+    PR_Lock(task_count_lock);
e79480
+    clean_task_count--;
e79480
+    PR_Unlock(task_count_lock);
e79480
     g_decr_active_threadcnt();
e79480
 }
e79480
 
e79480
@@ -2415,16 +2428,14 @@ replica_send_cleanruv_task(Repl_Agmt *agmt, cleanruv_data *clean_data)
e79480
 int
e79480
 is_cleaned_rid(ReplicaId rid)
e79480
 {
e79480
-    int i;
e79480
-
e79480
-    slapi_rwlock_rdlock(rid_lock);
e79480
-    for (i = 0; i < CLEANRIDSIZ && cleaned_rids[i] != 0; i++) {
e79480
+    PR_Lock(rid_lock);
e79480
+    for (size_t i = 0; i < CLEANRID_BUFSIZ; i++) {
e79480
         if (rid == cleaned_rids[i]) {
e79480
-            slapi_rwlock_unlock(rid_lock);
e79480
+            PR_Unlock(rid_lock);
e79480
             return 1;
e79480
         }
e79480
     }
e79480
-    slapi_rwlock_unlock(rid_lock);
e79480
+    PR_Unlock(rid_lock);
e79480
 
e79480
     return 0;
e79480
 }
e79480
@@ -2432,16 +2443,14 @@ is_cleaned_rid(ReplicaId rid)
e79480
 int
e79480
 is_pre_cleaned_rid(ReplicaId rid)
e79480
 {
e79480
-    int i;
e79480
-
e79480
-    slapi_rwlock_rdlock(rid_lock);
e79480
-    for (i = 0; i < CLEANRIDSIZ && pre_cleaned_rids[i] != 0; i++) {
e79480
+    PR_Lock(rid_lock);
e79480
+    for (size_t i = 0; i < CLEANRID_BUFSIZ; i++) {
e79480
         if (rid == pre_cleaned_rids[i]) {
e79480
-            slapi_rwlock_unlock(rid_lock);
e79480
+            PR_Unlock(rid_lock);
e79480
             return 1;
e79480
         }
e79480
     }
e79480
-    slapi_rwlock_unlock(rid_lock);
e79480
+    PR_Unlock(rid_lock);
e79480
 
e79480
     return 0;
e79480
 }
e79480
@@ -2454,14 +2463,14 @@ is_task_aborted(ReplicaId rid)
e79480
     if (rid == 0) {
e79480
         return 0;
e79480
     }
e79480
-    slapi_rwlock_rdlock(abort_rid_lock);
e79480
-    for (i = 0; i < CLEANRIDSIZ && aborted_rids[i] != 0; i++) {
e79480
+    PR_Lock(abort_rid_lock);
e79480
+    for (i = 0; i < CLEANRID_BUFSIZ && aborted_rids[i] != 0; i++) {
e79480
         if (rid == aborted_rids[i]) {
e79480
-            slapi_rwlock_unlock(abort_rid_lock);
e79480
+            PR_Unlock(abort_rid_lock);
e79480
             return 1;
e79480
         }
e79480
     }
e79480
-    slapi_rwlock_unlock(abort_rid_lock);
e79480
+    PR_Unlock(abort_rid_lock);
e79480
     return 0;
e79480
 }
e79480
 
e79480
@@ -2470,15 +2479,14 @@ preset_cleaned_rid(ReplicaId rid)
e79480
 {
e79480
     int i;
e79480
 
e79480
-    slapi_rwlock_wrlock(rid_lock);
e79480
-    for (i = 0; i < CLEANRIDSIZ; i++) {
e79480
+    PR_Lock(rid_lock);
e79480
+    for (i = 0; i < CLEANRID_BUFSIZ && pre_cleaned_rids[i] != rid; i++) {
e79480
         if (pre_cleaned_rids[i] == 0) {
e79480
             pre_cleaned_rids[i] = rid;
e79480
-            pre_cleaned_rids[i + 1] = 0;
e79480
             break;
e79480
         }
e79480
     }
e79480
-    slapi_rwlock_unlock(rid_lock);
e79480
+    PR_Unlock(rid_lock);
e79480
 }
e79480
 
e79480
 /*
e79480
@@ -2491,14 +2499,13 @@ set_cleaned_rid(ReplicaId rid)
e79480
 {
e79480
     int i;
e79480
 
e79480
-    slapi_rwlock_wrlock(rid_lock);
e79480
-    for (i = 0; i < CLEANRIDSIZ; i++) {
e79480
+    PR_Lock(rid_lock);
e79480
+    for (i = 0; i < CLEANRID_BUFSIZ && cleaned_rids[i] != rid; i++) {
e79480
         if (cleaned_rids[i] == 0) {
e79480
             cleaned_rids[i] = rid;
e79480
-            cleaned_rids[i + 1] = 0;
e79480
         }
e79480
     }
e79480
-    slapi_rwlock_unlock(rid_lock);
e79480
+    PR_Unlock(rid_lock);
e79480
 }
e79480
 
e79480
 /*
e79480
@@ -2570,15 +2577,14 @@ add_aborted_rid(ReplicaId rid, Replica *r, char *repl_root)
e79480
     int rc;
e79480
     int i;
e79480
 
e79480
-    slapi_rwlock_wrlock(abort_rid_lock);
e79480
-    for (i = 0; i < CLEANRIDSIZ; i++) {
e79480
+    PR_Lock(abort_rid_lock);
e79480
+    for (i = 0; i < CLEANRID_BUFSIZ; i++) {
e79480
         if (aborted_rids[i] == 0) {
e79480
             aborted_rids[i] = rid;
e79480
-            aborted_rids[i + 1] = 0;
e79480
             break;
e79480
         }
e79480
     }
e79480
-    slapi_rwlock_unlock(abort_rid_lock);
e79480
+    PR_Unlock(abort_rid_lock);
e79480
     /*
e79480
      *  Write the rid to the config entry
e79480
      */
e79480
@@ -2621,21 +2627,24 @@ delete_aborted_rid(Replica *r, ReplicaId rid, char *repl_root, int skip)
e79480
     char *data;
e79480
     char *dn;
e79480
     int rc;
e79480
-    int i;
e79480
 
e79480
     if (r == NULL)
e79480
         return;
e79480
 
e79480
     if (skip) {
e79480
         /* skip the deleting of the config, and just remove the in memory rid */
e79480
-        slapi_rwlock_wrlock(abort_rid_lock);
e79480
-        for (i = 0; i < CLEANRIDSIZ && aborted_rids[i] != rid; i++)
e79480
-            ; /* found rid, stop */
e79480
-        for (; i < CLEANRIDSIZ; i++) {
e79480
-            /* rewrite entire array */
e79480
-            aborted_rids[i] = aborted_rids[i + 1];
e79480
-        }
e79480
-        slapi_rwlock_unlock(abort_rid_lock);
e79480
+        ReplicaId new_abort_rids[CLEANRID_BUFSIZ] = {0};
e79480
+        int32_t idx = 0;
e79480
+
e79480
+        PR_Lock(abort_rid_lock);
e79480
+        for (size_t i = 0; i < CLEANRID_BUFSIZ; i++) {
e79480
+            if (aborted_rids[i] != rid) {
e79480
+                new_abort_rids[idx] = aborted_rids[i];
e79480
+                idx++;
e79480
+            }
e79480
+        }
e79480
+        memcpy(aborted_rids, new_abort_rids, sizeof(new_abort_rids));
e79480
+        PR_Unlock(abort_rid_lock);
e79480
     } else {
e79480
         /* only remove the config, leave the in-memory rid */
e79480
         dn = replica_get_dn(r);
e79480
@@ -2793,27 +2802,31 @@ bail:
e79480
 void
e79480
 remove_cleaned_rid(ReplicaId rid)
e79480
 {
e79480
-    int i;
e79480
-    /*
e79480
-     *  Remove this rid, and optimize the array
e79480
-     */
e79480
-    slapi_rwlock_wrlock(rid_lock);
e79480
+    ReplicaId new_cleaned_rids[CLEANRID_BUFSIZ] = {0};
e79480
+    ReplicaId new_pre_cleaned_rids[CLEANRID_BUFSIZ] = {0};
e79480
+    size_t idx = 0;
e79480
+
e79480
+    PR_Lock(rid_lock);
e79480
 
e79480
-    for (i = 0; i < CLEANRIDSIZ && cleaned_rids[i] != rid; i++)
e79480
-        ; /* found rid, stop */
e79480
-    for (; i < CLEANRIDSIZ; i++) {
e79480
-        /* rewrite entire array */
e79480
-        cleaned_rids[i] = cleaned_rids[i + 1];
e79480
+    for (size_t i = 0; i < CLEANRID_BUFSIZ; i++) {
e79480
+        if (cleaned_rids[i] != rid) {
e79480
+            new_cleaned_rids[idx] = cleaned_rids[i];
e79480
+            idx++;
e79480
+        }
e79480
     }
e79480
+    memcpy(cleaned_rids, new_cleaned_rids, sizeof(new_cleaned_rids));
e79480
+
e79480
     /* now do the preset cleaned rids */
e79480
-    for (i = 0; i < CLEANRIDSIZ && pre_cleaned_rids[i] != rid; i++)
e79480
-        ; /* found rid, stop */
e79480
-    for (; i < CLEANRIDSIZ; i++) {
e79480
-        /* rewrite entire array */
e79480
-        pre_cleaned_rids[i] = pre_cleaned_rids[i + 1];
e79480
+    idx = 0;
e79480
+    for (size_t i = 0; i < CLEANRID_BUFSIZ; i++) {
e79480
+        if (pre_cleaned_rids[i] != rid) {
e79480
+            new_pre_cleaned_rids[idx] = pre_cleaned_rids[i];
e79480
+            idx++;
e79480
+        }
e79480
     }
e79480
+    memcpy(pre_cleaned_rids, new_pre_cleaned_rids, sizeof(new_pre_cleaned_rids));
e79480
 
e79480
-    slapi_rwlock_unlock(rid_lock);
e79480
+    PR_Unlock(rid_lock);
e79480
 }
e79480
 
e79480
 /*
e79480
@@ -2841,16 +2854,6 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb __attribute__((unused)),
e79480
     char *ridstr = NULL;
e79480
     int rc = SLAPI_DSE_CALLBACK_OK;
e79480
 
e79480
-    if (get_abort_cleanruv_task_count() >= CLEANRIDSIZ) {
e79480
-        /* we are already running the maximum number of tasks */
e79480
-        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE,
e79480
-                    "Exceeded maximum number of active ABORT CLEANALLRUV tasks(%d)",
e79480
-                    CLEANRIDSIZ);
e79480
-        cleanruv_log(task, -1, ABORT_CLEANALLRUV_ID, SLAPI_LOG_ERR, "%s", returntext);
e79480
-        *returncode = LDAP_OPERATIONS_ERROR;
e79480
-        return SLAPI_DSE_CALLBACK_ERROR;
e79480
-    }
e79480
-
e79480
     /* allocate new task now */
e79480
     task = slapi_new_task(slapi_entry_get_ndn(e));
e79480
 
e79480
@@ -2935,6 +2938,16 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb __attribute__((unused)),
e79480
          */
e79480
         certify_all = "no";
e79480
     }
e79480
+
e79480
+    if (check_and_set_abort_cleanruv_task_count() != LDAP_SUCCESS) {
e79480
+        /* we are already running the maximum number of tasks */
e79480
+        PR_snprintf(returntext, SLAPI_DSE_RETURNTEXT_SIZE,
e79480
+                    "Exceeded maximum number of active ABORT CLEANALLRUV tasks(%d)",
e79480
+                    CLEANRIDSIZ);
e79480
+        cleanruv_log(task, -1, ABORT_CLEANALLRUV_ID, SLAPI_LOG_ERR, "%s", returntext);
e79480
+        *returncode = LDAP_UNWILLING_TO_PERFORM;
e79480
+        goto out;
e79480
+    }
e79480
     /*
e79480
      *  Create payload
e79480
      */
e79480
@@ -3143,6 +3156,9 @@ done:
e79480
     slapi_ch_free_string(&data->certify);
e79480
     slapi_sdn_free(&data->sdn);
e79480
     slapi_ch_free((void **)&data);
e79480
+    PR_Lock(task_count_lock);
e79480
+    abort_task_count--;
e79480
+    PR_Unlock(task_count_lock);
e79480
     g_decr_active_threadcnt();
e79480
 }
e79480
 
e79480
@@ -3494,36 +3510,43 @@ replica_cleanallruv_check_ruv(char *repl_root, Repl_Agmt *agmt, char *rid_text,
e79480
     return rc;
e79480
 }
e79480
 
e79480
-static int
e79480
-get_cleanruv_task_count(void)
e79480
+/*
e79480
+ * Before starting a cleanAllRUV task make sure there are not
e79480
+ * too many task threads already running.  If everything is okay
e79480
+ * also pre-set the RID now so rebounding extended ops do not
e79480
+ * try to clean it over and over.
e79480
+ */
e79480
+int32_t
e79480
+check_and_set_cleanruv_task_count(ReplicaId rid)
e79480
 {
e79480
-    int i, count = 0;
e79480
+    int32_t rc = 0;
e79480
 
e79480
-    slapi_rwlock_wrlock(rid_lock);
e79480
-    for (i = 0; i < CLEANRIDSIZ; i++) {
e79480
-        if (pre_cleaned_rids[i] != 0) {
e79480
-            count++;
e79480
-        }
e79480
+    PR_Lock(task_count_lock);
e79480
+    if (clean_task_count >= CLEANRIDSIZ) {
e79480
+        rc = -1;
e79480
+    } else {
e79480
+        clean_task_count++;
e79480
+        preset_cleaned_rid(rid);
e79480
     }
e79480
-    slapi_rwlock_unlock(rid_lock);
e79480
+    PR_Unlock(task_count_lock);
e79480
 
e79480
-    return count;
e79480
+    return rc;
e79480
 }
e79480
 
e79480
-static int
e79480
-get_abort_cleanruv_task_count(void)
e79480
+int32_t
e79480
+check_and_set_abort_cleanruv_task_count(void)
e79480
 {
e79480
-    int i, count = 0;
e79480
+    int32_t rc = 0;
e79480
 
e79480
-    slapi_rwlock_wrlock(rid_lock);
e79480
-    for (i = 0; i < CLEANRIDSIZ; i++) {
e79480
-        if (aborted_rids[i] != 0) {
e79480
-            count++;
e79480
+    PR_Lock(task_count_lock);
e79480
+    if (abort_task_count > CLEANRIDSIZ) {
e79480
+            rc = -1;
e79480
+        } else {
e79480
+            abort_task_count++;
e79480
         }
e79480
-    }
e79480
-    slapi_rwlock_unlock(rid_lock);
e79480
+    PR_Unlock(task_count_lock);
e79480
 
e79480
-    return count;
e79480
+    return rc;
e79480
 }
e79480
 
e79480
 /*
e79480
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
e79480
index 68e2544b4..0c2abb6d5 100644
e79480
--- a/ldap/servers/plugins/replication/repl_extop.c
e79480
+++ b/ldap/servers/plugins/replication/repl_extop.c
e79480
@@ -1393,6 +1393,12 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
e79480
         rc = LDAP_OPERATIONS_ERROR;
e79480
         goto out;
e79480
     }
e79480
+    if (check_and_set_abort_cleanruv_task_count() != LDAP_SUCCESS) {
e79480
+        cleanruv_log(NULL, rid, CLEANALLRUV_ID, SLAPI_LOG_ERR,
e79480
+                     "Exceeded maximum number of active abort CLEANALLRUV tasks(%d)", CLEANRIDSIZ);
e79480
+        rc = LDAP_UNWILLING_TO_PERFORM;
e79480
+        goto out;
e79480
+    }
e79480
     /*
e79480
      *  Prepare the abort data
e79480
      */
e79480
@@ -1499,6 +1505,7 @@ multimaster_extop_cleanruv(Slapi_PBlock *pb)
e79480
     if (force == NULL) {
e79480
         force = "no";
e79480
     }
e79480
+
e79480
     maxcsn = csn_new();
e79480
     csn_init_by_string(maxcsn, csnstr);
e79480
     /*
e79480
@@ -1535,13 +1542,21 @@ multimaster_extop_cleanruv(Slapi_PBlock *pb)
e79480
         goto free_and_return;
e79480
     }
e79480
 
e79480
+    if (check_and_set_cleanruv_task_count((ReplicaId)rid) != LDAP_SUCCESS) {
e79480
+        cleanruv_log(NULL, rid, CLEANALLRUV_ID, SLAPI_LOG_ERR,
e79480
+                     "Exceeded maximum number of active CLEANALLRUV tasks(%d)", CLEANRIDSIZ);
e79480
+        rc = LDAP_UNWILLING_TO_PERFORM;
e79480
+        goto free_and_return;
e79480
+    }
e79480
+
e79480
     if (replica_get_type(r) != REPLICA_TYPE_READONLY) {
e79480
         /*
e79480
          *  Launch the cleanruv monitoring thread.  Once all the replicas are cleaned it will release the rid
e79480
          *
e79480
          *  This will also release mtnode_ext->replica
e79480
          */
e79480
-        slapi_log_err(SLAPI_LOG_INFO, repl_plugin_name, "multimaster_extop_cleanruv - CleanAllRUV Task - Launching cleanAllRUV thread...\n");
e79480
+
e79480
+        cleanruv_log(NULL, rid, CLEANALLRUV_ID, SLAPI_LOG_ERR, "Launching cleanAllRUV thread...\n");
e79480
         data = (cleanruv_data *)slapi_ch_calloc(1, sizeof(cleanruv_data));
e79480
         if (data == NULL) {
e79480
             slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "multimaster_extop_cleanruv - CleanAllRUV Task - Failed to allocate "
e79480
@@ -1635,7 +1650,7 @@ free_and_return:
e79480
         ber_printf(resp_bere, "{s}", CLEANRUV_ACCEPTED);
e79480
         ber_flatten(resp_bere, &resp_bval);
e79480
         slapi_pblock_set(pb, SLAPI_EXT_OP_RET_VALUE, resp_bval);
e79480
-        slapi_send_ldap_result(pb, LDAP_SUCCESS, NULL, NULL, 0, NULL);
e79480
+        slapi_send_ldap_result(pb, rc, NULL, NULL, 0, NULL);
e79480
         /* resp_bere */
e79480
         if (NULL != resp_bere) {
e79480
             ber_free(resp_bere, 1);
e79480
-- 
e79480
2.21.0
e79480