Blame SOURCES/0053-Ticket-49463-After-cleanALLruv-there-is-a-flow-of-ke.patch

96373c
From 0ac68e15a9a4048d3c1ad4519000996cd65fdefb Mon Sep 17 00:00:00 2001
96373c
From: Thierry Bordaz <tbordaz@redhat.com>
96373c
Date: Fri, 1 Dec 2017 16:23:11 +0100
96373c
Subject: [PATCH] Ticket 49463 - After cleanALLruv, there is a flow of keep
96373c
 alive DEL
96373c
96373c
Bug Description:
96373c
	When cleanAllRuv is launched, it spawns cleanAllRuv on all replicas.
96373c
	Each replica will clean its changelog and database RUV AND in addition
96373c
	will DEL the keep alive entry of the target ReplicaID.
96373c
	So for the same entry (keep alive) there will be as many DELs as there are replicas.
96373c
96373c
	This flow of DEL is useless as only one DEL is enough.
96373c
	In addition because of https://pagure.io/389-ds-base/issue/49466, replication may
96373c
	loop on each of those DELs.
96373c
96373c
Fix Description:
96373c
	The fix is only to prevent the flow of DEL.
96373c
	It adds a flag ('original_task') in the task payload.
96373c
	The server receiving the task (replica_execute_cleanall_ruv_task) flags the
96373c
	task as 'original_task'.
96373c
	Conversely, the propagated cleanAllRuv (multimaster_extop_cleanruv) does
96373c
	not flag the task as 'original_task'
96373c
	Only original task does the DEL of the keep alive entry.
96373c
	Note the propagated payload (extop) is not changed. In a mixed version
96373c
	environment, "old" servers will DEL the keep alive and the flow can still happen
96373c
96373c
https://pagure.io/389-ds-base/issue/49466
96373c
96373c
Reviewed by: Ludwig Krispenz
96373c
96373c
Platforms tested: F23
96373c
96373c
Flag Day: no
96373c
96373c
Doc impact: no
96373c
---
96373c
 ldap/servers/plugins/replication/repl5.h           | 49 ++++++++++++----------
96373c
 ldap/servers/plugins/replication/repl5_replica.c   | 21 ++++++++++
96373c
 .../plugins/replication/repl5_replica_config.c     | 32 +++++++++++---
96373c
 ldap/servers/plugins/replication/repl_extop.c      |  2 +
96373c
 4 files changed, 76 insertions(+), 28 deletions(-)
96373c
96373c
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
96373c
index 4e206a0fc..e08fec752 100644
96373c
--- a/ldap/servers/plugins/replication/repl5.h
96373c
+++ b/ldap/servers/plugins/replication/repl5.h
96373c
@@ -783,12 +783,37 @@ void multimaster_mtnode_construct_replicas(void);
96373c
 
96373c
 void multimaster_be_state_change(void *handle, char *be_name, int old_be_state, int new_be_state);
96373c
 
96373c
+#define CLEANRIDSIZ 64 /* maximum number for concurrent CLEANALLRUV tasks */
96373c
+
96373c
+typedef struct _cleanruv_data
96373c
+{
96373c
+    Object *repl_obj;
96373c
+    Replica *replica;
96373c
+    ReplicaId rid;
96373c
+    Slapi_Task *task;
96373c
+    struct berval *payload;
96373c
+    CSN *maxcsn;
96373c
+    char *repl_root;
96373c
+    Slapi_DN *sdn;
96373c
+    char *certify;
96373c
+    char *force;
96373c
+    PRBool original_task;
96373c
+} cleanruv_data;
96373c
+
96373c
+typedef struct _cleanruv_purge_data
96373c
+{
96373c
+    int cleaned_rid;
96373c
+    const Slapi_DN *suffix_sdn;
96373c
+    char *replName;
96373c
+    char *replGen;
96373c
+} cleanruv_purge_data;
96373c
+
96373c
 /* In repl5_replica_config.c */
96373c
 int replica_config_init(void);
96373c
 void replica_config_destroy(void);
96373c
 int get_replica_type(Replica *r);
96373c
 int replica_execute_cleanruv_task_ext(Object *r, ReplicaId rid);
96373c
-void add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing);
96373c
+void add_cleaned_rid(cleanruv_data *data, char *maxcsn);
96373c
 int is_cleaned_rid(ReplicaId rid);
96373c
 int replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter, int *returncode, char *returntext, void *arg);
96373c
 void replica_cleanallruv_thread_ext(void *arg);
96373c
@@ -808,29 +833,7 @@ void set_cleaned_rid(ReplicaId rid);
96373c
 void cleanruv_log(Slapi_Task *task, int rid, char *task_type, int sev_level, char *fmt, ...);
96373c
 char *replica_cleanallruv_get_local_maxcsn(ReplicaId rid, char *base_dn);
96373c
 
96373c
-#define CLEANRIDSIZ 64 /* maximum number for concurrent CLEANALLRUV tasks */
96373c
 
96373c
-typedef struct _cleanruv_data
96373c
-{
96373c
-    Object *repl_obj;
96373c
-    Replica *replica;
96373c
-    ReplicaId rid;
96373c
-    Slapi_Task *task;
96373c
-    struct berval *payload;
96373c
-    CSN *maxcsn;
96373c
-    char *repl_root;
96373c
-    Slapi_DN *sdn;
96373c
-    char *certify;
96373c
-    char *force;
96373c
-} cleanruv_data;
96373c
-
96373c
-typedef struct _cleanruv_purge_data
96373c
-{
96373c
-    int cleaned_rid;
96373c
-    const Slapi_DN *suffix_sdn;
96373c
-    char *replName;
96373c
-    char *replGen;
96373c
-} cleanruv_purge_data;
96373c
 
96373c
 /* replutil.c */
96373c
 LDAPControl *create_managedsait_control(void);
96373c
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
96373c
index 77f4f18e4..e75807a62 100644
96373c
--- a/ldap/servers/plugins/replication/repl5_replica.c
96373c
+++ b/ldap/servers/plugins/replication/repl5_replica.c
96373c
@@ -2120,6 +2120,7 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
96373c
         char csnstr[CSN_STRSIZE];
96373c
         char *token = NULL;
96373c
         char *forcing;
96373c
+        PRBool original_task;
96373c
         char *csnpart;
96373c
         char *ridstr;
96373c
         char *iter = NULL;
96373c
@@ -2151,8 +2152,15 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
96373c
             csn_init_by_string(maxcsn, csnpart);
96373c
             csn_as_string(maxcsn, PR_FALSE, csnstr);
96373c
             forcing = ldap_utf8strtok_r(iter, ":", &iter);
96373c
+            original_task = PR_TRUE;
96373c
             if (forcing == NULL) {
96373c
                 forcing = "no";
96373c
+            } else if (!strcasecmp(forcing, "yes") || !strcasecmp(forcing, "no")) {
96373c
+                /* forcing was correctly set, lets try to read the original task flag */
96373c
+                token = ldap_utf8strtok_r(iter, ":", &iter);
96373c
+                if (token && !atoi(token)) {
96373c
+                    original_task = PR_FALSE;
96373c
+                }
96373c
             }
96373c
 
96373c
             slapi_log_err(SLAPI_LOG_NOTICE, repl_plugin_name, "CleanAllRUV Task - cleanAllRUV task found, "
96373c
@@ -2190,6 +2198,13 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
96373c
                 data->force = slapi_ch_strdup(forcing);
96373c
                 data->repl_root = NULL;
96373c
 
96373c
+                /* This is a corner case, a cleanAllRuv task was interrupted by a shutdown or a crash
96373c
+                 * We retrieved from type_replicaCleanRUV if the cleanAllRuv request
96373c
+                 * was received from a direct task ADD or if it was received via
96373c
+                 * the cleanAllRuv extop.
96373c
+                 */
96373c
+                data->original_task = original_task;
96373c
+
96373c
                 thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread_ext,
96373c
                                          (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
96373c
                                          PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
96373c
@@ -2284,6 +2299,12 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
96373c
                     data->sdn = slapi_sdn_dup(r->repl_root);
96373c
                     data->certify = slapi_ch_strdup(certify);
96373c
 
96373c
+                    /* This is a corner case, a cleanAllRuv task was interrupted by a shutdown or a crash
96373c
+                     * Let's assume this replica was the original receiver of the task.
96373c
+                     * This flag has no impact on Abort cleanAllRuv
96373c
+                     */
96373c
+                    data->original_task = PR_TRUE;
96373c
+
96373c
                     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
96373c
                                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
96373c
                                              PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
96373c
diff --git a/ldap/servers/plugins/replication/repl5_replica_config.c b/ldap/servers/plugins/replication/repl5_replica_config.c
96373c
index 005528a41..95b933bb8 100644
96373c
--- a/ldap/servers/plugins/replication/repl5_replica_config.c
96373c
+++ b/ldap/servers/plugins/replication/repl5_replica_config.c
96373c
@@ -1573,6 +1573,11 @@ replica_execute_cleanall_ruv_task(Object *r, ReplicaId rid, Slapi_Task *task, co
96373c
     data->repl_root = slapi_ch_strdup(basedn);
96373c
     data->force = slapi_ch_strdup(force_cleaning);
96373c
 
96373c
+    /* It is either a consequence of a direct ADD cleanAllRuv task
96373c
+     * or modify of the replica to add nsds5task: cleanAllRuv
96373c
+     */
96373c
+    data->original_task = PR_TRUE;
96373c
+
96373c
     thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread,
96373c
                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
96373c
                              PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
96373c
@@ -1702,7 +1707,7 @@ replica_cleanallruv_thread(void *arg)
96373c
     /*
96373c
      *  Add the cleanallruv task to the repl config - so we can handle restarts
96373c
      */
96373c
-    add_cleaned_rid(data->rid, data->replica, csnstr, data->force); /* marks config that we started cleaning a rid */
96373c
+    add_cleaned_rid(data, csnstr); /* marks config that we started cleaning a rid */
96373c
     cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Cleaning rid (%d)...", data->rid);
96373c
     /*
96373c
      *  First, wait for the maxcsn to be covered
96373c
@@ -1878,7 +1883,13 @@ done:
96373c
          */
96373c
         delete_cleaned_rid_config(data);
96373c
         check_replicas_are_done_cleaning(data);
96373c
-        remove_keep_alive_entry(data->task, data->rid, data->repl_root);
96373c
+        if (data->original_task) {
96373c
+            cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Original task deletes Keep alive entry (%d).", data->rid);
96373c
+            remove_keep_alive_entry(data->task, data->rid, data->repl_root);
96373c
+        } else {
96373c
+            cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Propagated task does not delete Keep alive entry (%d).", data->rid);
96373c
+        }
96373c
+
96373c
         clean_agmts(data);
96373c
         remove_cleaned_rid(data->rid);
96373c
         cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Successfully cleaned rid(%d).", data->rid);
96373c
@@ -2029,7 +2040,7 @@ check_replicas_are_done_cleaning(cleanruv_data *data)
96373c
                  "Waiting for all the replicas to finish cleaning...");
96373c
 
96373c
     csn_as_string(data->maxcsn, PR_FALSE, csnstr);
96373c
-    filter = PR_smprintf("(%s=%d:%s:%s)", type_replicaCleanRUV, (int)data->rid, csnstr, data->force);
96373c
+    filter = PR_smprintf("(%s=%d:%s:%s:%d)", type_replicaCleanRUV, (int)data->rid, csnstr, data->force, data->original_task ? 1 : 0);
96373c
     while (not_all_cleaned && !is_task_aborted(data->rid) && !slapi_is_shutting_down()) {
96373c
         agmt_obj = agmtlist_get_first_agreement_for_replica(data->replica);
96373c
         if (agmt_obj == NULL) {
96373c
@@ -2502,7 +2513,7 @@ set_cleaned_rid(ReplicaId rid)
96373c
  *  Add the rid and maxcsn to the repl config (so we can resume after a server restart)
96373c
  */
96373c
 void
96373c
-add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
96373c
+add_cleaned_rid(cleanruv_data *cleanruv_data, char *maxcsn)
96373c
 {
96373c
     Slapi_PBlock *pb;
96373c
     struct berval *vals[2];
96373c
@@ -2512,6 +2523,16 @@ add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
96373c
     char data[CSN_STRSIZE + 10];
96373c
     char *dn;
96373c
     int rc;
96373c
+    ReplicaId rid;
96373c
+    Replica *r;
96373c
+    char *forcing;
96373c
+
96373c
+    if (cleanruv_data == NULL) { /* check the argument, not the local 'data' buffer (an array is never NULL) */
96373c
+        return;
96373c
+    }
96373c
+    rid = cleanruv_data->rid;
96373c
+    r = cleanruv_data->replica;
96373c
+    forcing = cleanruv_data->force;
96373c
 
96373c
     if (r == NULL || maxcsn == NULL) {
96373c
         return;
96373c
@@ -2519,7 +2540,7 @@ add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
96373c
     /*
96373c
      *  Write the rid & maxcsn to the config entry
96373c
      */
96373c
-    val.bv_len = PR_snprintf(data, sizeof(data), "%d:%s:%s", rid, maxcsn, forcing);
96373c
+    val.bv_len = PR_snprintf(data, sizeof(data), "%d:%s:%s:%d", rid, maxcsn, forcing, cleanruv_data->original_task ? 1 : 0);
96373c
     dn = replica_get_dn(r);
96373c
     pb = slapi_pblock_new();
96373c
     mod.mod_op = LDAP_MOD_ADD | LDAP_MOD_BVALUES;
96373c
@@ -2961,6 +2982,7 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb __attribute__((unused)),
96373c
     data->repl_root = slapi_ch_strdup(base_dn);
96373c
     data->sdn = NULL;
96373c
     data->certify = slapi_ch_strdup(certify_all);
96373c
+    data->original_task = PR_TRUE;
96373c
 
96373c
     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
96373c
                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
96373c
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
96373c
index c49c6bd8d..68e2544b4 100644
96373c
--- a/ldap/servers/plugins/replication/repl_extop.c
96373c
+++ b/ldap/servers/plugins/replication/repl_extop.c
96373c
@@ -1412,6 +1412,7 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
96373c
     data->rid = rid;
96373c
     data->repl_root = slapi_ch_strdup(repl_root);
96373c
     data->certify = slapi_ch_strdup(certify_all);
96373c
+    data->original_task = PR_FALSE;
96373c
     /*
96373c
      *  Set the aborted rid and stop the cleaning
96373c
      */
96373c
@@ -1555,6 +1556,7 @@ multimaster_extop_cleanruv(Slapi_PBlock *pb)
96373c
         data->payload = slapi_ch_bvdup(extop_payload);
96373c
         data->force = slapi_ch_strdup(force);
96373c
         data->repl_root = slapi_ch_strdup(repl_root);
96373c
+        data->original_task = PR_FALSE;
96373c
 
96373c
         thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread_ext,
96373c
                                  (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
96373c
-- 
96373c
2.13.6
96373c