Blame SOURCES/0026-Ticket-48179-Starting-a-replica-agreement-can-lead-t.patch

a2f18f
From 1acfdbb4428c70f7f6058da4374ecb29f9bb3149 Mon Sep 17 00:00:00 2001
a2f18f
From: Mark Reynolds <mreynolds@redhat.com>
a2f18f
Date: Fri, 17 Jul 2015 15:08:00 -0400
a2f18f
Subject: [PATCH 26/30] Ticket 48179 - Starting a replica agreement can lead to
a2f18f
  deadlock
a2f18f
a2f18f
Bug Description:  When starting a replica agreement and setting the agmt maxcsn
a2f18f
                  a deadlock can occur with another op updating nsuniqueid index.
a2f18f
                  When setting the agmt maxcsn the server searches for the tombstone
a2f18f
                  ruv which uses the nsuniqueid index, and it does this while holding
a2f18f
                  the repl agmt lock.  If another thread is doing a delete and
a2f18f
                  writing to the change log, it can also grab a write lock on the
a2f18f
                  nsuniqueid index, before it attempts to grab the agmt lock.  This
a2f18f
                  can lead to a deadlock if the timing is right.
a2f18f
a2f18f
Fix Description:  When starting the agmt and setting the agmt maxcsn, search/get
a2f18f
                  the tombstone ruv before we take the repl agmt lock.
a2f18f
a2f18f
https://fedorahosted.org/389/ticket/48179
a2f18f
a2f18f
Reviewed by: nhosoi(Thanks!)
a2f18f
a2f18f
(cherry picked from commit eb3086dcb0c56a23d6cee00a12f38b2584fe59a2)
a2f18f
(cherry picked from commit 23a3ff6082cba3eb749401eff44942b16dc30538)
a2f18f
---
a2f18f
 ldap/servers/plugins/replication/repl5.h      |   1 -
a2f18f
 ldap/servers/plugins/replication/repl5_agmt.c | 211 ++++++++++++--------------
a2f18f
 2 files changed, 101 insertions(+), 111 deletions(-)
a2f18f
a2f18f
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
a2f18f
index 4a5d859..0b0f26b 100644
a2f18f
--- a/ldap/servers/plugins/replication/repl5.h
a2f18f
+++ b/ldap/servers/plugins/replication/repl5.h
a2f18f
@@ -380,7 +380,6 @@ PRUint64 agmt_get_protocol_timeout(Repl_Agmt *agmt);
a2f18f
 void agmt_set_protocol_timeout(Repl_Agmt *agmt, PRUint64 timeout);
a2f18f
 void agmt_update_maxcsn(Replica *r, Slapi_DN *sdn, int op, LDAPMod **mods, CSN *csn);
a2f18f
 void add_agmt_maxcsns(Slapi_Entry *e, Replica *r);
a2f18f
-void agmt_set_maxcsn(Repl_Agmt *ra);
a2f18f
 void agmt_remove_maxcsn(Repl_Agmt *ra);
a2f18f
 int agmt_maxcsn_to_smod (Replica *r, Slapi_Mod *smod);
a2f18f
 int agmt_set_WaitForAsyncResults(Repl_Agmt *ra, const Slapi_Entry *e);
a2f18f
diff --git a/ldap/servers/plugins/replication/repl5_agmt.c b/ldap/servers/plugins/replication/repl5_agmt.c
a2f18f
index 9d1a8f2..f84eacb 100644
a2f18f
--- a/ldap/servers/plugins/replication/repl5_agmt.c
a2f18f
+++ b/ldap/servers/plugins/replication/repl5_agmt.c
a2f18f
@@ -668,43 +668,127 @@ int
a2f18f
 agmt_start(Repl_Agmt *ra)
a2f18f
 {
a2f18f
     Repl_Protocol *prot = NULL;
a2f18f
+    Slapi_PBlock *pb = NULL;
a2f18f
+    Slapi_Entry **entries = NULL;
a2f18f
+    Slapi_DN *repl_sdn = NULL;
a2f18f
+    char *attrs[2];
a2f18f
+    int protocol_state;
a2f18f
+    int found_ruv = 0;
a2f18f
+    int rc = 0;
a2f18f
 
a2f18f
-	int protocol_state;
a2f18f
-
a2f18f
-	/*	To Allow Consumer Initialisation when adding an agreement: */	
a2f18f
-	if (ra->auto_initialize == STATE_PERFORMING_TOTAL_UPDATE)
a2f18f
-	{
a2f18f
-		protocol_state = STATE_PERFORMING_TOTAL_UPDATE;
a2f18f
-	}
a2f18f
-	else
a2f18f
-	{
a2f18f
-		protocol_state = STATE_PERFORMING_INCREMENTAL_UPDATE;
a2f18f
-	}
a2f18f
+    /*	To Allow Consumer Initialisation when adding an agreement: */
a2f18f
+    if (ra->auto_initialize == STATE_PERFORMING_TOTAL_UPDATE){
a2f18f
+        protocol_state = STATE_PERFORMING_TOTAL_UPDATE;
a2f18f
+    } else {
a2f18f
+        protocol_state = STATE_PERFORMING_INCREMENTAL_UPDATE;
a2f18f
+    }
a2f18f
 
a2f18f
     /* First, create a new protocol object */
a2f18f
     if ((prot = prot_new(ra, protocol_state)) == NULL) {
a2f18f
         return -1;
a2f18f
     }
a2f18f
 
a2f18f
-    /* Now it is safe to own the agreement lock */
a2f18f
+    /*
a2f18f
+     * Set the agmt maxcsn
a2f18f
+     *
a2f18f
+     * We need to get the replica ruv before we take the
a2f18f
+     * agmt lock to avoid potential deadlocks on the nsuniqueid
a2f18f
+     * index.
a2f18f
+     */
a2f18f
+    repl_sdn = agmt_get_replarea(ra);
a2f18f
+
a2f18f
+    pb = slapi_pblock_new();
a2f18f
+    attrs[0] = (char*)type_agmtMaxCSN;
a2f18f
+    attrs[1] = NULL;
a2f18f
+    slapi_search_internal_set_pb_ext(
a2f18f
+        pb,
a2f18f
+        repl_sdn,
a2f18f
+        LDAP_SCOPE_BASE,
a2f18f
+        "objectclass=*",
a2f18f
+        attrs,
a2f18f
+        0,
a2f18f
+        NULL,
a2f18f
+        RUV_STORAGE_ENTRY_UNIQUEID,
a2f18f
+        repl_get_plugin_identity (PLUGIN_MULTIMASTER_REPLICATION),
a2f18f
+        OP_FLAG_REPLICATED);
a2f18f
+    slapi_search_internal_pb (pb);
a2f18f
+
a2f18f
+    slapi_pblock_get(pb, SLAPI_PLUGIN_INTOP_RESULT, &rc);
a2f18f
+    if (rc == LDAP_SUCCESS){
a2f18f
+        slapi_pblock_get(pb, SLAPI_PLUGIN_INTOP_SEARCH_ENTRIES, &entries);
a2f18f
+        if (NULL == entries || NULL == entries[0]){
a2f18f
+            slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
a2f18f
+                "agmt_start: replica ruv tombstone entry for "
a2f18f
+                "replica %s not found\n",
a2f18f
+            slapi_sdn_get_dn(ra->replarea));
a2f18f
+        } else {
a2f18f
+            found_ruv = 1;
a2f18f
+        }
a2f18f
+    }
a2f18f
+
a2f18f
+    /*
a2f18f
+     * Now it is safe to own the agreement lock
a2f18f
+     */
a2f18f
     PR_Lock(ra->lock);
a2f18f
 
a2f18f
     /* Check that replication is not already started */
a2f18f
     if (ra->protocol != NULL) {
a2f18f
         slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, "replication already started for agreement \"%s\"\n", agmt_get_long_name(ra));
a2f18f
-        PR_Unlock(ra->lock);
a2f18f
         prot_free(&prot;;
a2f18f
-        return 0;
a2f18f
+        goto done;
a2f18f
     }
a2f18f
 
a2f18f
+    /* Set and start the protocol */
a2f18f
     ra->protocol = prot;
a2f18f
-
a2f18f
-    /* Start the protocol thread */
a2f18f
     prot_start(ra->protocol);
a2f18f
 
a2f18f
-    agmt_set_maxcsn(ra);
a2f18f
+    /*
a2f18f
+     * If we found the repl ruv, set the agmt maxcsn...
a2f18f
+     */
a2f18f
+    if (found_ruv){
a2f18f
+        Replica *r;
a2f18f
+        Object *repl_obj;
a2f18f
+        char **maxcsns = NULL;
a2f18f
+        int i;
a2f18f
 
a2f18f
+        maxcsns = slapi_entry_attr_get_charray(entries[0], type_agmtMaxCSN);
a2f18f
+        repl_obj = prot_get_replica_object(ra->protocol);
a2f18f
+        if(repl_obj && maxcsns){
a2f18f
+            r = (Replica *)object_get_data(repl_obj);
a2f18f
+            if(r){
a2f18f
+                /*
a2f18f
+                 * Loop over all the agmt maxcsns and find ours...
a2f18f
+                 */
a2f18f
+                for(i = 0; maxcsns[i]; i++){
a2f18f
+                    char buf[BUFSIZ];
a2f18f
+                    char unavail_buf[BUFSIZ];
a2f18f
+
a2f18f
+                    PR_snprintf(buf,BUFSIZ,"%s;%s;%s;%d;",slapi_sdn_get_dn(repl_sdn),
a2f18f
+                                slapi_rdn_get_value_by_ref(slapi_rdn_get_rdn(ra->rdn)),
a2f18f
+                                ra->hostname, ra->port);
a2f18f
+                    PR_snprintf(unavail_buf, BUFSIZ,"%s;%s;%s;%d;unavailable", slapi_sdn_get_dn(repl_sdn),
a2f18f
+                                slapi_rdn_get_value_by_ref(slapi_rdn_get_rdn(ra->rdn)),
a2f18f
+                                ra->hostname, ra->port);
a2f18f
+                    if(strstr(maxcsns[i], buf) || strstr(maxcsns[i], unavail_buf)){
a2f18f
+                    	/* Set the maxcsn */
a2f18f
+                        slapi_ch_free_string(&ra->maxcsn);
a2f18f
+                        ra->maxcsn = slapi_ch_strdup(maxcsns[i]);
a2f18f
+                        ra->consumerRID = agmt_maxcsn_get_rid(maxcsns[i]);
a2f18f
+                        ra->tmpConsumerRID = 1;
a2f18f
+                        break;
a2f18f
+                    }
a2f18f
+                }
a2f18f
+            }
a2f18f
+        }
a2f18f
+        slapi_ch_array_free(maxcsns);
a2f18f
+    }
a2f18f
+
a2f18f
+done:
a2f18f
     PR_Unlock(ra->lock);
a2f18f
+    slapi_free_search_results_internal(pb);
a2f18f
+    slapi_pblock_destroy (pb);
a2f18f
+    slapi_sdn_free(&repl_sdn);
a2f18f
+
a2f18f
     return 0;
a2f18f
 }
a2f18f
 
a2f18f
@@ -3052,99 +3136,6 @@ agmt_maxcsn_to_smod (Replica *r, Slapi_Mod *smod)
a2f18f
 }
a2f18f
 
a2f18f
 /*
a2f18f
- *  Called when we start a repl agmt
a2f18f
- */
a2f18f
-void
a2f18f
-agmt_set_maxcsn(Repl_Agmt *ra)
a2f18f
-{
a2f18f
-    Slapi_PBlock *pb = NULL;
a2f18f
-    Slapi_Entry **entries = NULL;
a2f18f
-    Replica *r = NULL;
a2f18f
-    Object *repl_obj;
a2f18f
-    const Slapi_DN *tombstone_sdn = NULL;
a2f18f
-    char *attrs[2];
a2f18f
-    int rc;
a2f18f
-
a2f18f
-    /* read ruv state from the ruv tombstone entry */
a2f18f
-    pb = slapi_pblock_new();
a2f18f
-    if (!pb) {
a2f18f
-        slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name, "agmt_set_maxcsn: Out of memory\n");
a2f18f
-        goto done;
a2f18f
-    }
a2f18f
-    repl_obj = prot_get_replica_object(ra->protocol);
a2f18f
-    if(repl_obj){
a2f18f
-        r = (Replica *)object_get_data(repl_obj);
a2f18f
-        tombstone_sdn = replica_get_root(r);
a2f18f
-    }
a2f18f
-    ra->maxcsn = NULL;
a2f18f
-    attrs[0] = (char*)type_agmtMaxCSN;
a2f18f
-    attrs[1] = NULL;
a2f18f
-    slapi_search_internal_set_pb_ext(
a2f18f
-        pb,
a2f18f
-        (Slapi_DN *)tombstone_sdn,
a2f18f
-        LDAP_SCOPE_BASE,
a2f18f
-        "objectclass=*",
a2f18f
-        attrs,
a2f18f
-        0, /* attrsonly */
a2f18f
-        NULL, /* controls */
a2f18f
-        RUV_STORAGE_ENTRY_UNIQUEID,
a2f18f
-        repl_get_plugin_identity (PLUGIN_MULTIMASTER_REPLICATION),
a2f18f
-        OP_FLAG_REPLICATED);  /* flags */
a2f18f
-    slapi_search_internal_pb (pb);
a2f18f
-
a2f18f
-    slapi_pblock_get(pb, SLAPI_PLUGIN_INTOP_RESULT, &rc);
a2f18f
-    if (rc == LDAP_SUCCESS){
a2f18f
-        Replica *r;
a2f18f
-        Object *repl_obj;
a2f18f
-        char **maxcsns;
a2f18f
-        int i;
a2f18f
-
a2f18f
-        slapi_pblock_get(pb, SLAPI_PLUGIN_INTOP_SEARCH_ENTRIES, &entries);
a2f18f
-        if (NULL == entries || NULL == entries[0]){
a2f18f
-            slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
a2f18f
-                "agmt_set_maxcsn: replica ruv tombstone entry for "
a2f18f
-                "replica %s not found\n",
a2f18f
-            slapi_sdn_get_dn(ra->replarea));
a2f18f
-            goto done;
a2f18f
-        }
a2f18f
-        maxcsns = slapi_entry_attr_get_charray(entries[0], type_agmtMaxCSN);
a2f18f
-        repl_obj = prot_get_replica_object(ra->protocol);
a2f18f
-        if(repl_obj && maxcsns){
a2f18f
-            r = (Replica *)object_get_data(repl_obj);
a2f18f
-            if(r){
a2f18f
-                /*
a2f18f
-                 * Loop over all the agmt maxcsns and find ours
a2f18f
-                 */
a2f18f
-                for(i = 0; maxcsns[i]; i++){
a2f18f
-                    char buf[BUFSIZ];
a2f18f
-                    char unavail_buf[BUFSIZ];
a2f18f
-
a2f18f
-                    PR_snprintf(buf,BUFSIZ,"%s;%s;%s;%d;",slapi_sdn_get_dn(ra->replarea),
a2f18f
-                    		slapi_rdn_get_value_by_ref(slapi_rdn_get_rdn(ra->rdn)),
a2f18f
-                    		ra->hostname, ra->port);
a2f18f
-                    PR_snprintf(unavail_buf, BUFSIZ,"%s;%s;%s;%d;unavailable", slapi_sdn_get_dn(ra->replarea),
a2f18f
-                    		slapi_rdn_get_value_by_ref(slapi_rdn_get_rdn(ra->rdn)),
a2f18f
-                    		ra->hostname, ra->port);
a2f18f
-                    if(strstr(maxcsns[i], buf) || strstr(maxcsns[i], unavail_buf)){
a2f18f
-                        slapi_ch_free_string(&ra->maxcsn);
a2f18f
-                        ra->maxcsn = slapi_ch_strdup(maxcsns[i]);
a2f18f
-                        ra->consumerRID = agmt_maxcsn_get_rid(maxcsns[i]);
a2f18f
-                        ra->tmpConsumerRID = 1;
a2f18f
-                        break;
a2f18f
-                    }
a2f18f
-                }
a2f18f
-            }
a2f18f
-        }
a2f18f
-        slapi_ch_array_free(maxcsns);
a2f18f
-    }
a2f18f
-done:
a2f18f
-    if (NULL != pb){
a2f18f
-        slapi_free_search_results_internal(pb);
a2f18f
-        slapi_pblock_destroy (pb);
a2f18f
-    }
a2f18f
-}
a2f18f
-
a2f18f
-/*
a2f18f
  * Parse out the consumer replicaID from the agmt maxcsn
a2f18f
  *
a2f18f
  *  "repl area;agmt_rdn;hostname;port;consumer_rid;maxcsn"
a2f18f
-- 
a2f18f
1.9.3
a2f18f