Blame SOURCES/0053-Ticket-49463-After-cleanALLruv-there-is-a-flow-of-ke.patch

081b2d
From 0ac68e15a9a4048d3c1ad4519000996cd65fdefb Mon Sep 17 00:00:00 2001
081b2d
From: Thierry Bordaz <tbordaz@redhat.com>
081b2d
Date: Fri, 1 Dec 2017 16:23:11 +0100
081b2d
Subject: [PATCH] Ticket 49463 - After cleanALLruv, there is a flow of keep
081b2d
 alive DEL
081b2d
081b2d
Bug Description:
081b2d
	When cleanAllRuv is launched, it spawns cleanAllRuv on all replicas.
081b2d
	Each replica will clean its changelog and database RUV AND in addition
081b2d
	will DEL the keep alive entry of the target ReplicaID.
081b2d
	So for the same entry (keep alive) there will be as many DELs as there are replicas.
081b2d
081b2d
	This flow of DEL is useless as only one DEL is enough.
081b2d
	In addition because of https://pagure.io/389-ds-base/issue/49466, replication may
081b2d
	loop on each of those DELs.
081b2d
081b2d
Fix Description:
081b2d
	The fix is only to prevent the flow of DEL.
081b2d
	It adds a flag ('original_task') in the task payload.
081b2d
	The server receiving the task (replica_execute_cleanall_ruv_task) flags the
081b2d
	task as 'original_task'.
081b2d
	Conversely, the propagated cleanAllRuv (multimaster_extop_cleanruv) does
081b2d
	not flag the task as 'original_task'
081b2d
	Only original task does the DEL of the keep alive entry.
081b2d
	Note the propagated payload (extop) is not changed. In a mixed version
081b2d
	environment "old" servers will still DEL the keep alive entry, so the flow can still happen.
081b2d
081b2d
https://pagure.io/389-ds-base/issue/49466
081b2d
081b2d
Reviewed by: Ludwig Krispenz
081b2d
081b2d
Platforms tested: F23
081b2d
081b2d
Flag Day: no
081b2d
081b2d
Doc impact: no
081b2d
---
081b2d
 ldap/servers/plugins/replication/repl5.h           | 49 ++++++++++++----------
081b2d
 ldap/servers/plugins/replication/repl5_replica.c   | 21 ++++++++++
081b2d
 .../plugins/replication/repl5_replica_config.c     | 32 +++++++++++---
081b2d
 ldap/servers/plugins/replication/repl_extop.c      |  2 +
081b2d
 4 files changed, 76 insertions(+), 28 deletions(-)
081b2d
081b2d
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
081b2d
index 4e206a0fc..e08fec752 100644
081b2d
--- a/ldap/servers/plugins/replication/repl5.h
081b2d
+++ b/ldap/servers/plugins/replication/repl5.h
081b2d
@@ -783,12 +783,37 @@ void multimaster_mtnode_construct_replicas(void);
081b2d
 
081b2d
 void multimaster_be_state_change(void *handle, char *be_name, int old_be_state, int new_be_state);
081b2d
 
081b2d
+#define CLEANRIDSIZ 64 /* maximum number for concurrent CLEANALLRUV tasks */
081b2d
+
081b2d
+typedef struct _cleanruv_data
081b2d
+{
081b2d
+    Object *repl_obj;
081b2d
+    Replica *replica;
081b2d
+    ReplicaId rid;
081b2d
+    Slapi_Task *task;
081b2d
+    struct berval *payload;
081b2d
+    CSN *maxcsn;
081b2d
+    char *repl_root;
081b2d
+    Slapi_DN *sdn;
081b2d
+    char *certify;
081b2d
+    char *force;
081b2d
+    PRBool original_task;
081b2d
+} cleanruv_data;
081b2d
+
081b2d
+typedef struct _cleanruv_purge_data
081b2d
+{
081b2d
+    int cleaned_rid;
081b2d
+    const Slapi_DN *suffix_sdn;
081b2d
+    char *replName;
081b2d
+    char *replGen;
081b2d
+} cleanruv_purge_data;
081b2d
+
081b2d
 /* In repl5_replica_config.c */
081b2d
 int replica_config_init(void);
081b2d
 void replica_config_destroy(void);
081b2d
 int get_replica_type(Replica *r);
081b2d
 int replica_execute_cleanruv_task_ext(Object *r, ReplicaId rid);
081b2d
-void add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing);
081b2d
+void add_cleaned_rid(cleanruv_data *data, char *maxcsn);
081b2d
 int is_cleaned_rid(ReplicaId rid);
081b2d
 int replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter, int *returncode, char *returntext, void *arg);
081b2d
 void replica_cleanallruv_thread_ext(void *arg);
081b2d
@@ -808,29 +833,7 @@ void set_cleaned_rid(ReplicaId rid);
081b2d
 void cleanruv_log(Slapi_Task *task, int rid, char *task_type, int sev_level, char *fmt, ...);
081b2d
 char *replica_cleanallruv_get_local_maxcsn(ReplicaId rid, char *base_dn);
081b2d
 
081b2d
-#define CLEANRIDSIZ 64 /* maximum number for concurrent CLEANALLRUV tasks */
081b2d
 
081b2d
-typedef struct _cleanruv_data
081b2d
-{
081b2d
-    Object *repl_obj;
081b2d
-    Replica *replica;
081b2d
-    ReplicaId rid;
081b2d
-    Slapi_Task *task;
081b2d
-    struct berval *payload;
081b2d
-    CSN *maxcsn;
081b2d
-    char *repl_root;
081b2d
-    Slapi_DN *sdn;
081b2d
-    char *certify;
081b2d
-    char *force;
081b2d
-} cleanruv_data;
081b2d
-
081b2d
-typedef struct _cleanruv_purge_data
081b2d
-{
081b2d
-    int cleaned_rid;
081b2d
-    const Slapi_DN *suffix_sdn;
081b2d
-    char *replName;
081b2d
-    char *replGen;
081b2d
-} cleanruv_purge_data;
081b2d
 
081b2d
 /* replutil.c */
081b2d
 LDAPControl *create_managedsait_control(void);
081b2d
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
081b2d
index 77f4f18e4..e75807a62 100644
081b2d
--- a/ldap/servers/plugins/replication/repl5_replica.c
081b2d
+++ b/ldap/servers/plugins/replication/repl5_replica.c
081b2d
@@ -2120,6 +2120,7 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
081b2d
         char csnstr[CSN_STRSIZE];
081b2d
         char *token = NULL;
081b2d
         char *forcing;
081b2d
+        PRBool original_task;
081b2d
         char *csnpart;
081b2d
         char *ridstr;
081b2d
         char *iter = NULL;
081b2d
@@ -2151,8 +2152,15 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
081b2d
             csn_init_by_string(maxcsn, csnpart);
081b2d
             csn_as_string(maxcsn, PR_FALSE, csnstr);
081b2d
             forcing = ldap_utf8strtok_r(iter, ":", &iter);
081b2d
+            original_task = PR_TRUE;
081b2d
             if (forcing == NULL) {
081b2d
                 forcing = "no";
081b2d
+            } else if (!strcasecmp(forcing, "yes") || !strcasecmp(forcing, "no")) {
081b2d
+                /* forcing was correctly set, lets try to read the original task flag */
081b2d
+                token = ldap_utf8strtok_r(iter, ":", &iter);
081b2d
+                if (token && !atoi(token)) {
081b2d
+                    original_task = PR_FALSE;
081b2d
+                }
081b2d
             }
081b2d
 
081b2d
             slapi_log_err(SLAPI_LOG_NOTICE, repl_plugin_name, "CleanAllRUV Task - cleanAllRUV task found, "
081b2d
@@ -2190,6 +2198,13 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
081b2d
                 data->force = slapi_ch_strdup(forcing);
081b2d
                 data->repl_root = NULL;
081b2d
 
081b2d
+                /* This is a corner case, a cleanAllRuv task was interrupted by a shutdown or a crash
081b2d
+                 * We retrieved from type_replicaCleanRUV if the cleanAllRuv request
081b2d
+                 * was received from a direct task ADD or if it was received via
081b2d
+                 * the cleanAllRuv extop.
081b2d
+                 */
081b2d
+                data->original_task = original_task;
081b2d
+
081b2d
                 thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread_ext,
081b2d
                                          (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
081b2d
                                          PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
081b2d
@@ -2284,6 +2299,12 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
081b2d
                     data->sdn = slapi_sdn_dup(r->repl_root);
081b2d
                     data->certify = slapi_ch_strdup(certify);
081b2d
 
081b2d
+                    /* This is a corner case, a cleanAllRuv task was interrupted by a shutdown or a crash
081b2d
+                     * Let's assume this replica was the original receiver of the task.
081b2d
+                     * This flag has no impact on Abort cleanAllRuv
081b2d
+                     */
081b2d
+                    data->original_task = PR_TRUE;
081b2d
+
081b2d
                     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
081b2d
                                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
081b2d
                                              PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
081b2d
diff --git a/ldap/servers/plugins/replication/repl5_replica_config.c b/ldap/servers/plugins/replication/repl5_replica_config.c
081b2d
index 005528a41..95b933bb8 100644
081b2d
--- a/ldap/servers/plugins/replication/repl5_replica_config.c
081b2d
+++ b/ldap/servers/plugins/replication/repl5_replica_config.c
081b2d
@@ -1573,6 +1573,11 @@ replica_execute_cleanall_ruv_task(Object *r, ReplicaId rid, Slapi_Task *task, co
081b2d
     data->repl_root = slapi_ch_strdup(basedn);
081b2d
     data->force = slapi_ch_strdup(force_cleaning);
081b2d
 
081b2d
+    /* It is either a consequence of a direct ADD cleanAllRuv task
081b2d
+     * or modify of the replica to add nsds5task: cleanAllRuv
081b2d
+     */
081b2d
+    data->original_task = PR_TRUE;
081b2d
+
081b2d
     thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread,
081b2d
                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
081b2d
                              PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
081b2d
@@ -1702,7 +1707,7 @@ replica_cleanallruv_thread(void *arg)
081b2d
     /*
081b2d
      *  Add the cleanallruv task to the repl config - so we can handle restarts
081b2d
      */
081b2d
-    add_cleaned_rid(data->rid, data->replica, csnstr, data->force); /* marks config that we started cleaning a rid */
081b2d
+    add_cleaned_rid(data, csnstr); /* marks config that we started cleaning a rid */
081b2d
     cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Cleaning rid (%d)...", data->rid);
081b2d
     /*
081b2d
      *  First, wait for the maxcsn to be covered
081b2d
@@ -1878,7 +1883,13 @@ done:
081b2d
          */
081b2d
         delete_cleaned_rid_config(data);
081b2d
         check_replicas_are_done_cleaning(data);
081b2d
-        remove_keep_alive_entry(data->task, data->rid, data->repl_root);
081b2d
+        if (data->original_task) {
081b2d
+            cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Original task deletes Keep alive entry (%d).", data->rid);
081b2d
+            remove_keep_alive_entry(data->task, data->rid, data->repl_root);
081b2d
+        } else {
081b2d
+            cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Propagated task does not delete Keep alive entry (%d).", data->rid);
081b2d
+        }
081b2d
+
081b2d
         clean_agmts(data);
081b2d
         remove_cleaned_rid(data->rid);
081b2d
         cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Successfully cleaned rid(%d).", data->rid);
081b2d
@@ -2029,7 +2040,7 @@ check_replicas_are_done_cleaning(cleanruv_data *data)
081b2d
                  "Waiting for all the replicas to finish cleaning...");
081b2d
 
081b2d
     csn_as_string(data->maxcsn, PR_FALSE, csnstr);
081b2d
-    filter = PR_smprintf("(%s=%d:%s:%s)", type_replicaCleanRUV, (int)data->rid, csnstr, data->force);
081b2d
+    filter = PR_smprintf("(%s=%d:%s:%s:%d)", type_replicaCleanRUV, (int)data->rid, csnstr, data->force, data->original_task ? 1 : 0);
081b2d
     while (not_all_cleaned && !is_task_aborted(data->rid) && !slapi_is_shutting_down()) {
081b2d
         agmt_obj = agmtlist_get_first_agreement_for_replica(data->replica);
081b2d
         if (agmt_obj == NULL) {
081b2d
@@ -2502,7 +2513,7 @@ set_cleaned_rid(ReplicaId rid)
081b2d
  *  Add the rid and maxcsn to the repl config (so we can resume after a server restart)
081b2d
  */
081b2d
 void
081b2d
-add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
081b2d
+add_cleaned_rid(cleanruv_data *cleanruv_data, char *maxcsn)
081b2d
 {
081b2d
     Slapi_PBlock *pb;
081b2d
     struct berval *vals[2];
081b2d
@@ -2512,6 +2523,16 @@ add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
081b2d
     char data[CSN_STRSIZE + 10];
081b2d
     char *dn;
081b2d
     int rc;
081b2d
+    ReplicaId rid;
081b2d
+    Replica *r;
081b2d
+    char *forcing;
081b2d
+
081b2d
+    if (data == NULL) {
081b2d
+        return;
081b2d
+    }
081b2d
+    rid = cleanruv_data->rid;
081b2d
+    r = cleanruv_data->replica;
081b2d
+    forcing = cleanruv_data->force;
081b2d
 
081b2d
     if (r == NULL || maxcsn == NULL) {
081b2d
         return;
081b2d
@@ -2519,7 +2540,7 @@ add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
081b2d
     /*
081b2d
      *  Write the rid & maxcsn to the config entry
081b2d
      */
081b2d
-    val.bv_len = PR_snprintf(data, sizeof(data), "%d:%s:%s", rid, maxcsn, forcing);
081b2d
+    val.bv_len = PR_snprintf(data, sizeof(data), "%d:%s:%s:%d", rid, maxcsn, forcing, cleanruv_data->original_task ? 1 : 0);
081b2d
     dn = replica_get_dn(r);
081b2d
     pb = slapi_pblock_new();
081b2d
     mod.mod_op = LDAP_MOD_ADD | LDAP_MOD_BVALUES;
081b2d
@@ -2961,6 +2982,7 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb __attribute__((unused)),
081b2d
     data->repl_root = slapi_ch_strdup(base_dn);
081b2d
     data->sdn = NULL;
081b2d
     data->certify = slapi_ch_strdup(certify_all);
081b2d
+    data->original_task = PR_TRUE;
081b2d
 
081b2d
     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
081b2d
                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
081b2d
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
081b2d
index c49c6bd8d..68e2544b4 100644
081b2d
--- a/ldap/servers/plugins/replication/repl_extop.c
081b2d
+++ b/ldap/servers/plugins/replication/repl_extop.c
081b2d
@@ -1412,6 +1412,7 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
081b2d
     data->rid = rid;
081b2d
     data->repl_root = slapi_ch_strdup(repl_root);
081b2d
     data->certify = slapi_ch_strdup(certify_all);
081b2d
+    data->original_task = PR_FALSE;
081b2d
     /*
081b2d
      *  Set the aborted rid and stop the cleaning
081b2d
      */
081b2d
@@ -1555,6 +1556,7 @@ multimaster_extop_cleanruv(Slapi_PBlock *pb)
081b2d
         data->payload = slapi_ch_bvdup(extop_payload);
081b2d
         data->force = slapi_ch_strdup(force);
081b2d
         data->repl_root = slapi_ch_strdup(repl_root);
081b2d
+        data->original_task = PR_FALSE;
081b2d
 
081b2d
         thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread_ext,
081b2d
                                  (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
081b2d
-- 
081b2d
2.13.6
081b2d