74ca47
From 4182dd8bbff22f9e0e45b763a4619c0bc8dcb153 Mon Sep 17 00:00:00 2001
74ca47
From: Mark Reynolds <mreynolds@redhat.com>
74ca47
Date: Tue, 9 May 2017 12:31:58 -0400
74ca47
Subject: [PATCH] Ticket 49238 - AddressSanitizer: heap-use-after-free in
74ca47
 libreplication
74ca47
74ca47
Bug Description:
74ca47
        The bug is detected in csn pending list component, when
74ca47
        accessing a csn that has already been freed.
74ca47
74ca47
        The bug is mostly detectable under ASAN because under normal run
74ca47
        the read access to the csn would only crash if the csn was in
74ca47
        an unmapped page (that is quite difficult to acheive).
74ca47
74ca47
        The bug was observed under the following conditions:
74ca47
            - very slow machine
74ca47
            - all instances running on the same machine
74ca47
74ca47
        The patch address 2 issues
74ca47
74ca47
        Issue - 1
74ca47
        Under specfic circumstance (failure, like "db_deadlock" during changelog update),
74ca47
        the csn was freed but still present in the pending list (fix-1).
74ca47
74ca47
        Issue - 2
74ca47
        Further investigations, showed an other corner case where a
74ca47
        replica could be updated by several suppliers in parallel.
74ca47
        In such scenario, an update (on one thread-2) with a higher csn (let csn-2)
74ca47
        may be applied before an update (on another thread-1) with a smaller
74ca47
        csn (let csn-1).
74ca47
        csn-2 is freed when thread-2 complete but the csn-2 will remain
74ca47
        in the pending list until csn-1 is commited.
74ca47
        so followup of pending list may access a csn that was freed
74ca47
74ca47
Fix Description:
74ca47
        Issue - 1
74ca47
        The fix in repl5_plugins.c, frees the csn (thread private area)
74ca47
        at the condition pending list was roll up for that csn (ruv update).
74ca47
74ca47
        Issue - 2
74ca47
        The fix is in two parts:
74ca47
            If a supplier tries to acquire a replica while it is
74ca47
        already owner of it, the replica is granted.
74ca47
74ca47
            If a supplier owns a replica and is asking again for it,
74ca47
        but this time the replica is not granted, the replica is release and
74ca47
        the supplier disconnected.
74ca47
74ca47
https://pagure.io/389-ds-base/issue/49238
74ca47
74ca47
Reviewed by: Mark Reynolds, Ludwig Krispenz, William Brown (thanks to you all !!)
74ca47
74ca47
Platforms tested: 7.4
74ca47
74ca47
Flag Day: no
74ca47
74ca47
Doc impact: no
74ca47
---
74ca47
 ldap/servers/plugins/replication/repl5.h         |  1 +
74ca47
 ldap/servers/plugins/replication/repl5_plugins.c |  7 +++-
74ca47
 ldap/servers/plugins/replication/repl5_replica.c | 49 +++++++++++++++++++-----
74ca47
 ldap/servers/plugins/replication/repl_extop.c    | 42 ++++++++++++++++++--
74ca47
 4 files changed, 86 insertions(+), 13 deletions(-)
74ca47
74ca47
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
74ca47
index c3bd10c..1d8989c 100644
74ca47
--- a/ldap/servers/plugins/replication/repl5.h
74ca47
+++ b/ldap/servers/plugins/replication/repl5.h
74ca47
@@ -549,6 +549,7 @@ void replica_relinquish_exclusive_access(Replica *r, PRUint64 connid, int opid);
74ca47
 PRBool replica_get_tombstone_reap_active(const Replica *r);
74ca47
 const Slapi_DN *replica_get_root(const Replica *r);
74ca47
 const char *replica_get_name(const Replica *r);
74ca47
+uint64_t replica_get_locking_conn(const Replica *r);
74ca47
 ReplicaId replica_get_rid (const Replica *r);
74ca47
 void replica_set_rid (Replica *r, ReplicaId rid);
74ca47
 PRBool replica_is_initialized (const Replica *r);
74ca47
diff --git a/ldap/servers/plugins/replication/repl5_plugins.c b/ldap/servers/plugins/replication/repl5_plugins.c
74ca47
index ebcc230..9ef06af 100644
74ca47
--- a/ldap/servers/plugins/replication/repl5_plugins.c
74ca47
+++ b/ldap/servers/plugins/replication/repl5_plugins.c
74ca47
@@ -1224,7 +1224,12 @@ common_return:
74ca47
 	opcsn = operation_get_csn(op);
74ca47
 	prim_csn = get_thread_primary_csn();
74ca47
 	if (csn_is_equal(opcsn, prim_csn)) {
74ca47
-		set_thread_primary_csn(NULL);
74ca47
+		if (return_value == 0) {
74ca47
+			/* the primary csn was succesfully committed
74ca47
+			 * unset it in the thread local data
74ca47
+			 */
74ca47
+			set_thread_primary_csn(NULL);
74ca47
+		}
74ca47
 	}
74ca47
 	if (repl_obj) {
74ca47
 		object_release (repl_obj);
74ca47
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
74ca47
index a106f8b..1bdc138 100644
74ca47
--- a/ldap/servers/plugins/replication/repl5_replica.c
74ca47
+++ b/ldap/servers/plugins/replication/repl5_replica.c
74ca47
@@ -64,6 +64,7 @@ struct replica {
74ca47
 	PRBool state_update_inprogress; /* replica state is being updated */
74ca47
 	PRLock *agmt_lock;          	/* protects agreement creation, start and stop */
74ca47
 	char *locking_purl;				/* supplier who has exclusive access */
74ca47
+	uint64_t locking_conn;         	/* The supplier's connection id */
74ca47
 	Slapi_Counter *protocol_timeout;/* protocol shutdown timeout */
74ca47
 	Slapi_Counter *backoff_min;		/* backoff retry minimum */
74ca47
 	Slapi_Counter *backoff_max;		/* backoff retry maximum */
74ca47
@@ -602,19 +603,32 @@ replica_get_exclusive_access(Replica *r, PRBool *isInc, PRUint64 connid, int opi
74ca47
 				slapi_sdn_get_dn(r->repl_root),
74ca47
 				r->locking_purl ? r->locking_purl : "unknown");
74ca47
 		rval = PR_FALSE;
74ca47
+		if (!(r->repl_state_flags & REPLICA_TOTAL_IN_PROGRESS)) {
74ca47
+			/* inc update */
74ca47
+			if (r->locking_purl && r->locking_conn == connid) {
74ca47
+				/* This is the same supplier connection, reset the replica
74ca47
+				 * purl, and return success */
74ca47
+				slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name,
74ca47
+					"replica_get_exclusive_access - "
74ca47
+					"This is a second acquire attempt from the same replica connection "
74ca47
+					" - return success instead of busy\n");
74ca47
+				slapi_ch_free_string(&r->locking_purl);
74ca47
+				r->locking_purl = slapi_ch_strdup(locking_purl);
74ca47
+				rval = PR_TRUE;
74ca47
+				goto done;
74ca47
+			}
74ca47
+			if (replica_get_release_timeout(r)) {
74ca47
+				/*
74ca47
+				 * Abort the current session so other replicas can acquire
74ca47
+				 * this server.
74ca47
+				 */
74ca47
+				r->abort_session = ABORT_SESSION;
74ca47
+			}
74ca47
+		}
74ca47
 		if (current_purl)
74ca47
 		{
74ca47
 			*current_purl = slapi_ch_strdup(r->locking_purl);
74ca47
 		}
74ca47
-		if (!(r->repl_state_flags & REPLICA_TOTAL_IN_PROGRESS) &&
74ca47
-		    replica_get_release_timeout(r))
74ca47
-		{
74ca47
-			/*
74ca47
-			 * We are not doing a total update, so abort the current session
74ca47
-			 * so other replicas can acquire this server.
74ca47
-			 */
74ca47
-			r->abort_session = ABORT_SESSION;
74ca47
-		}
74ca47
 	}
74ca47
 	else
74ca47
 	{
74ca47
@@ -642,7 +656,9 @@ replica_get_exclusive_access(Replica *r, PRBool *isInc, PRUint64 connid, int opi
74ca47
 		}
74ca47
 		slapi_ch_free_string(&r->locking_purl);
74ca47
 		r->locking_purl = slapi_ch_strdup(locking_purl);
74ca47
+		r->locking_conn = connid;
74ca47
 	}
74ca47
+done:
74ca47
 	replica_unlock(r->repl_lock);
74ca47
 	return rval;
74ca47
 }
74ca47
@@ -720,6 +736,18 @@ replica_get_name(const Replica *r) /* ONREPL - should we return copy instead? */
74ca47
     return(r->repl_name);
74ca47
 }
74ca47
 
74ca47
+/*
74ca47
+ * Returns locking_conn of this replica
74ca47
+ */
74ca47
+uint64_t
74ca47
+replica_get_locking_conn(const Replica *r)
74ca47
+{
74ca47
+	uint64_t connid;
74ca47
+	replica_lock(r->repl_lock);
74ca47
+	connid = r->locking_conn;
74ca47
+	replica_unlock(r->repl_lock);
74ca47
+	return connid;
74ca47
+}
74ca47
 /* 
74ca47
  * Returns replicaid of this replica 
74ca47
  */
74ca47
@@ -2251,6 +2279,9 @@ _replica_init_from_config (Replica *r, Slapi_Entry *e, char *errortext)
74ca47
     }
74ca47
 
74ca47
     r->tombstone_reap_stop = r->tombstone_reap_active = PR_FALSE;
74ca47
+    
74ca47
+    /* No supplier holding the replica */
74ca47
+    r->locking_conn = ULONG_MAX;
74ca47
 
74ca47
     return (_replica_check_validity (r));
74ca47
 }
74ca47
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
74ca47
index 412caec..a39d918 100644
74ca47
--- a/ldap/servers/plugins/replication/repl_extop.c
74ca47
+++ b/ldap/servers/plugins/replication/repl_extop.c
74ca47
@@ -1138,9 +1138,45 @@ send_response:
74ca47
 		 */
74ca47
 		if (NULL != connext && NULL != connext->replica_acquired)
74ca47
 		{
74ca47
-            Object *r_obj = (Object*)connext->replica_acquired;
74ca47
-			replica_relinquish_exclusive_access((Replica*)object_get_data (r_obj),
74ca47
-												connid, opid);
74ca47
+			Replica *r = (Replica*)object_get_data ((Object*)connext->replica_acquired);
74ca47
+			uint64_t r_locking_conn;
74ca47
+			
74ca47
+			/* At this point the supplier runs a Replica Agreement for 
74ca47
+			 * the specific replica connext->replica_acquired.
74ca47
+			 * The RA does not know it holds the replica (because it is
74ca47
+			 * sending this request).
74ca47
+			 * The situation is confused
74ca47
+			 */
74ca47
+			slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
74ca47
+				"already acquired replica: replica not ready (%d) (replica=%s)\n", response, replica_get_name(r) ? replica_get_name(r) : "no name");
74ca47
+			
74ca47
+			/*
74ca47
+			 * On consumer side, we release the exclusive access at the
74ca47
+			 * condition this is this RA that holds the replica
74ca47
+			 */
74ca47
+			if (r) {
74ca47
+				
74ca47
+				r_locking_conn = replica_get_locking_conn(r);
74ca47
+				slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
74ca47
+				"already acquired replica: locking_conn=%d, current connid=%d\n", (int) r_locking_conn, (int) connid);
74ca47
+				
74ca47
+				if ((r_locking_conn != ULONG_MAX) && (r_locking_conn == connid)) {
74ca47
+					replica_relinquish_exclusive_access(r, connid, opid);
74ca47
+					object_release((Object*) connext->replica_acquired);
74ca47
+					connext->replica_acquired = NULL;
74ca47
+				}
74ca47
+			}
74ca47
+			/*
74ca47
+			 * On consumer side we should not keep a incoming connection
74ca47
+			 * with replica_acquired set although the supplier is not aware of
74ca47
+			 * 
74ca47
+			 * On the supplier, we need to close the connection so
74ca47
+			 * that the RA will restart a new session in a clear state 
74ca47
+			 */
74ca47
+			slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
74ca47
+				"already acquired replica: disconnect conn=%d\n", connid);
74ca47
+			slapi_disconnect_server(conn);
74ca47
+            
74ca47
 		}
74ca47
 		/* Remove any flags that would indicate repl session in progress */
74ca47
 		if (NULL != connext)
74ca47
-- 
74ca47
2.9.4
74ca47