Blame SOURCES/0053-Ticket-49463-After-cleanALLruv-there-is-a-flow-of-ke.patch

b045b9
From 0ac68e15a9a4048d3c1ad4519000996cd65fdefb Mon Sep 17 00:00:00 2001
b045b9
From: Thierry Bordaz <tbordaz@redhat.com>
b045b9
Date: Fri, 1 Dec 2017 16:23:11 +0100
b045b9
Subject: [PATCH] Ticket 49463 - After cleanALLruv, there is a flow of keep
b045b9
 alive DEL
b045b9
b045b9
Bug Description:
b045b9
	When cleanAllRuv is launched, it spawns cleanAllRuv on all replicas.
b045b9
	Each replica will clean its changelog and database RUV AND in addition
b045b9
	will DEL the keep alive entry of the target ReplicaID.
b045b9
	So for the same entry (keep alive) there will be as many DEL as there are replicas
b045b9
b045b9
	This flow of DEL is useless as only one DEL is enough.
b045b9
	In addition because of https://pagure.io/389-ds-base/issue/49466, replication may
b045b9
	loop on each of those DELs.
b045b9
b045b9
Fix Description:
b045b9
	The fix is only to prevent the flow of DEL.
b045b9
	It adds a flag ('original_task') in the task payload.
b045b9
	The server receiving the task (replica_execute_cleanall_ruv_task) flags the
b045b9
	task as 'original_task'.
b045b9
	Conversely, the propagated cleanAllRuv (multimaster_extop_cleanruv) does
b045b9
	not flag the task as 'original_task'
b045b9
	Only original task does the DEL of the keep alive entry.
b045b9
	Note the propagated payload (extop) is not changed. In a mixed version
b045b9
	environment "old" servers will DEL the keep alive and flow can still happen
b045b9
b045b9
https://pagure.io/389-ds-base/issue/49466
b045b9
b045b9
Reviewed by: Ludwig Krispenz
b045b9
b045b9
Platforms tested: F23
b045b9
b045b9
Flag Day: no
b045b9
b045b9
Doc impact: no
b045b9
---
b045b9
 ldap/servers/plugins/replication/repl5.h           | 49 ++++++++++++----------
b045b9
 ldap/servers/plugins/replication/repl5_replica.c   | 21 ++++++++++
b045b9
 .../plugins/replication/repl5_replica_config.c     | 32 +++++++++++---
b045b9
 ldap/servers/plugins/replication/repl_extop.c      |  2 +
b045b9
 4 files changed, 76 insertions(+), 28 deletions(-)
b045b9
b045b9
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
b045b9
index 4e206a0fc..e08fec752 100644
b045b9
--- a/ldap/servers/plugins/replication/repl5.h
b045b9
+++ b/ldap/servers/plugins/replication/repl5.h
b045b9
@@ -783,12 +783,37 @@ void multimaster_mtnode_construct_replicas(void);
b045b9
 
b045b9
 void multimaster_be_state_change(void *handle, char *be_name, int old_be_state, int new_be_state);
b045b9
 
b045b9
+#define CLEANRIDSIZ 64 /* maximum number for concurrent CLEANALLRUV tasks */
b045b9
+
b045b9
+typedef struct _cleanruv_data
b045b9
+{
b045b9
+    Object *repl_obj;
b045b9
+    Replica *replica;
b045b9
+    ReplicaId rid;
b045b9
+    Slapi_Task *task;
b045b9
+    struct berval *payload;
b045b9
+    CSN *maxcsn;
b045b9
+    char *repl_root;
b045b9
+    Slapi_DN *sdn;
b045b9
+    char *certify;
b045b9
+    char *force;
b045b9
+    PRBool original_task;
b045b9
+} cleanruv_data;
b045b9
+
b045b9
+typedef struct _cleanruv_purge_data
b045b9
+{
b045b9
+    int cleaned_rid;
b045b9
+    const Slapi_DN *suffix_sdn;
b045b9
+    char *replName;
b045b9
+    char *replGen;
b045b9
+} cleanruv_purge_data;
b045b9
+
b045b9
 /* In repl5_replica_config.c */
b045b9
 int replica_config_init(void);
b045b9
 void replica_config_destroy(void);
b045b9
 int get_replica_type(Replica *r);
b045b9
 int replica_execute_cleanruv_task_ext(Object *r, ReplicaId rid);
b045b9
-void add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing);
b045b9
+void add_cleaned_rid(cleanruv_data *data, char *maxcsn);
b045b9
 int is_cleaned_rid(ReplicaId rid);
b045b9
 int replica_cleanall_ruv_abort(Slapi_PBlock *pb, Slapi_Entry *e, Slapi_Entry *eAfter, int *returncode, char *returntext, void *arg);
b045b9
 void replica_cleanallruv_thread_ext(void *arg);
b045b9
@@ -808,29 +833,7 @@ void set_cleaned_rid(ReplicaId rid);
b045b9
 void cleanruv_log(Slapi_Task *task, int rid, char *task_type, int sev_level, char *fmt, ...);
b045b9
 char *replica_cleanallruv_get_local_maxcsn(ReplicaId rid, char *base_dn);
b045b9
 
b045b9
-#define CLEANRIDSIZ 64 /* maximum number for concurrent CLEANALLRUV tasks */
b045b9
 
b045b9
-typedef struct _cleanruv_data
b045b9
-{
b045b9
-    Object *repl_obj;
b045b9
-    Replica *replica;
b045b9
-    ReplicaId rid;
b045b9
-    Slapi_Task *task;
b045b9
-    struct berval *payload;
b045b9
-    CSN *maxcsn;
b045b9
-    char *repl_root;
b045b9
-    Slapi_DN *sdn;
b045b9
-    char *certify;
b045b9
-    char *force;
b045b9
-} cleanruv_data;
b045b9
-
b045b9
-typedef struct _cleanruv_purge_data
b045b9
-{
b045b9
-    int cleaned_rid;
b045b9
-    const Slapi_DN *suffix_sdn;
b045b9
-    char *replName;
b045b9
-    char *replGen;
b045b9
-} cleanruv_purge_data;
b045b9
 
b045b9
 /* replutil.c */
b045b9
 LDAPControl *create_managedsait_control(void);
b045b9
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
b045b9
index 77f4f18e4..e75807a62 100644
b045b9
--- a/ldap/servers/plugins/replication/repl5_replica.c
b045b9
+++ b/ldap/servers/plugins/replication/repl5_replica.c
b045b9
@@ -2120,6 +2120,7 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
b045b9
         char csnstr[CSN_STRSIZE];
b045b9
         char *token = NULL;
b045b9
         char *forcing;
b045b9
+        PRBool original_task;
b045b9
         char *csnpart;
b045b9
         char *ridstr;
b045b9
         char *iter = NULL;
b045b9
@@ -2151,8 +2152,15 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
b045b9
             csn_init_by_string(maxcsn, csnpart);
b045b9
             csn_as_string(maxcsn, PR_FALSE, csnstr);
b045b9
             forcing = ldap_utf8strtok_r(iter, ":", &iter);
b045b9
+            original_task = PR_TRUE;
b045b9
             if (forcing == NULL) {
b045b9
                 forcing = "no";
b045b9
+            } else if (!strcasecmp(forcing, "yes") || !strcasecmp(forcing, "no")) {
b045b9
+                /* forcing was correctly set, lets try to read the original task flag */
b045b9
+                token = ldap_utf8strtok_r(iter, ":", &iter);
b045b9
+                if (token && !atoi(token)) {
b045b9
+                    original_task = PR_FALSE;
b045b9
+                }
b045b9
             }
b045b9
 
b045b9
             slapi_log_err(SLAPI_LOG_NOTICE, repl_plugin_name, "CleanAllRUV Task - cleanAllRUV task found, "
b045b9
@@ -2190,6 +2198,13 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
b045b9
                 data->force = slapi_ch_strdup(forcing);
b045b9
                 data->repl_root = NULL;
b045b9
 
b045b9
+                /* This is a corner case, a cleanAllRuv task was interrupted by a shutdown or a crash
b045b9
+                 * We retrieved from type_replicaCleanRUV if the cleanAllRuv request
b045b9
+                 * was received from a direct task ADD or if was received via
b045b9
+                 * the cleanAllRuv extop.
b045b9
+                 */
b045b9
+                data->original_task = original_task;
b045b9
+
b045b9
                 thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread_ext,
b045b9
                                          (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
b045b9
                                          PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
b045b9
@@ -2284,6 +2299,12 @@ replica_check_for_tasks(Replica *r, Slapi_Entry *e)
b045b9
                     data->sdn = slapi_sdn_dup(r->repl_root);
b045b9
                     data->certify = slapi_ch_strdup(certify);
b045b9
 
b045b9
+                    /* This is a corner case, a cleanAllRuv task was interrupted by a shutdown or a crash
b045b9
+                     * Let's assume this replica was the original receiver of the task.
b045b9
+                     * This flag has no impact on Abort cleanAllRuv
b045b9
+                     */
b045b9
+                    data->original_task = PR_TRUE;
b045b9
+
b045b9
                     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
b045b9
                                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
b045b9
                                              PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
b045b9
diff --git a/ldap/servers/plugins/replication/repl5_replica_config.c b/ldap/servers/plugins/replication/repl5_replica_config.c
b045b9
index 005528a41..95b933bb8 100644
b045b9
--- a/ldap/servers/plugins/replication/repl5_replica_config.c
b045b9
+++ b/ldap/servers/plugins/replication/repl5_replica_config.c
b045b9
@@ -1573,6 +1573,11 @@ replica_execute_cleanall_ruv_task(Object *r, ReplicaId rid, Slapi_Task *task, co
b045b9
     data->repl_root = slapi_ch_strdup(basedn);
b045b9
     data->force = slapi_ch_strdup(force_cleaning);
b045b9
 
b045b9
+    /* It is either a consequence of a direct ADD cleanAllRuv task
b045b9
+     * or modify of the replica to add nsds5task: cleanAllRuv
b045b9
+     */
b045b9
+    data->original_task = PR_TRUE;
b045b9
+
b045b9
     thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread,
b045b9
                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
b045b9
                              PR_UNJOINABLE_THREAD, SLAPD_DEFAULT_THREAD_STACKSIZE);
b045b9
@@ -1702,7 +1707,7 @@ replica_cleanallruv_thread(void *arg)
b045b9
     /*
b045b9
      *  Add the cleanallruv task to the repl config - so we can handle restarts
b045b9
      */
b045b9
-    add_cleaned_rid(data->rid, data->replica, csnstr, data->force); /* marks config that we started cleaning a rid */
b045b9
+    add_cleaned_rid(data, csnstr); /* marks config that we started cleaning a rid */
b045b9
     cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Cleaning rid (%d)...", data->rid);
b045b9
     /*
b045b9
      *  First, wait for the maxcsn to be covered
b045b9
@@ -1878,7 +1883,13 @@ done:
b045b9
          */
b045b9
         delete_cleaned_rid_config(data);
b045b9
         check_replicas_are_done_cleaning(data);
b045b9
-        remove_keep_alive_entry(data->task, data->rid, data->repl_root);
b045b9
+        if (data->original_task) {
b045b9
+            cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Original task deletes Keep alive entry (%d).", data->rid);
b045b9
+            remove_keep_alive_entry(data->task, data->rid, data->repl_root);
b045b9
+        } else {
b045b9
+            cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Propagated task does not delete Keep alive entry (%d).", data->rid);
b045b9
+        }
b045b9
+
b045b9
         clean_agmts(data);
b045b9
         remove_cleaned_rid(data->rid);
b045b9
         cleanruv_log(data->task, data->rid, CLEANALLRUV_ID, SLAPI_LOG_INFO, "Successfully cleaned rid(%d).", data->rid);
b045b9
@@ -2029,7 +2040,7 @@ check_replicas_are_done_cleaning(cleanruv_data *data)
b045b9
                  "Waiting for all the replicas to finish cleaning...");
b045b9
 
b045b9
     csn_as_string(data->maxcsn, PR_FALSE, csnstr);
b045b9
-    filter = PR_smprintf("(%s=%d:%s:%s)", type_replicaCleanRUV, (int)data->rid, csnstr, data->force);
b045b9
+    filter = PR_smprintf("(%s=%d:%s:%s:%d)", type_replicaCleanRUV, (int)data->rid, csnstr, data->force, data->original_task ? 1 : 0);
b045b9
     while (not_all_cleaned && !is_task_aborted(data->rid) && !slapi_is_shutting_down()) {
b045b9
         agmt_obj = agmtlist_get_first_agreement_for_replica(data->replica);
b045b9
         if (agmt_obj == NULL) {
b045b9
@@ -2502,7 +2513,7 @@ set_cleaned_rid(ReplicaId rid)
b045b9
  *  Add the rid and maxcsn to the repl config (so we can resume after a server restart)
b045b9
  */
b045b9
 void
b045b9
-add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
b045b9
+add_cleaned_rid(cleanruv_data *cleanruv_data, char *maxcsn)
b045b9
 {
b045b9
     Slapi_PBlock *pb;
b045b9
     struct berval *vals[2];
b045b9
@@ -2512,6 +2523,16 @@ add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
b045b9
     char data[CSN_STRSIZE + 10];
b045b9
     char *dn;
b045b9
     int rc;
b045b9
+    ReplicaId rid;
b045b9
+    Replica *r;
b045b9
+    char *forcing;
b045b9
+
b045b9
+    if (data == NULL) {
b045b9
+        return;
b045b9
+    }
b045b9
+    rid = cleanruv_data->rid;
b045b9
+    r = cleanruv_data->replica;
b045b9
+    forcing = cleanruv_data->force;
b045b9
 
b045b9
     if (r == NULL || maxcsn == NULL) {
b045b9
         return;
b045b9
@@ -2519,7 +2540,7 @@ add_cleaned_rid(ReplicaId rid, Replica *r, char *maxcsn, char *forcing)
b045b9
     /*
b045b9
      *  Write the rid & maxcsn to the config entry
b045b9
      */
b045b9
-    val.bv_len = PR_snprintf(data, sizeof(data), "%d:%s:%s", rid, maxcsn, forcing);
b045b9
+    val.bv_len = PR_snprintf(data, sizeof(data), "%d:%s:%s:%d", rid, maxcsn, forcing, cleanruv_data->original_task ? 1 : 0);
b045b9
     dn = replica_get_dn(r);
b045b9
     pb = slapi_pblock_new();
b045b9
     mod.mod_op = LDAP_MOD_ADD | LDAP_MOD_BVALUES;
b045b9
@@ -2961,6 +2982,7 @@ replica_cleanall_ruv_abort(Slapi_PBlock *pb __attribute__((unused)),
b045b9
     data->repl_root = slapi_ch_strdup(base_dn);
b045b9
     data->sdn = NULL;
b045b9
     data->certify = slapi_ch_strdup(certify_all);
b045b9
+    data->original_task = PR_TRUE;
b045b9
 
b045b9
     thread = PR_CreateThread(PR_USER_THREAD, replica_abort_task_thread,
b045b9
                              (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
b045b9
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
b045b9
index c49c6bd8d..68e2544b4 100644
b045b9
--- a/ldap/servers/plugins/replication/repl_extop.c
b045b9
+++ b/ldap/servers/plugins/replication/repl_extop.c
b045b9
@@ -1412,6 +1412,7 @@ multimaster_extop_abort_cleanruv(Slapi_PBlock *pb)
b045b9
     data->rid = rid;
b045b9
     data->repl_root = slapi_ch_strdup(repl_root);
b045b9
     data->certify = slapi_ch_strdup(certify_all);
b045b9
+    data->original_task = PR_FALSE;
b045b9
     /*
b045b9
      *  Set the aborted rid and stop the cleaning
b045b9
      */
b045b9
@@ -1555,6 +1556,7 @@ multimaster_extop_cleanruv(Slapi_PBlock *pb)
b045b9
         data->payload = slapi_ch_bvdup(extop_payload);
b045b9
         data->force = slapi_ch_strdup(force);
b045b9
         data->repl_root = slapi_ch_strdup(repl_root);
b045b9
+        data->original_task = PR_FALSE;
b045b9
 
b045b9
         thread = PR_CreateThread(PR_USER_THREAD, replica_cleanallruv_thread_ext,
b045b9
                                  (void *)data, PR_PRIORITY_NORMAL, PR_GLOBAL_THREAD,
b045b9
-- 
b045b9
2.13.6
b045b9