Blame SOURCES/0043-Ticket-49238-AddressSanitizer-heap-use-after-free-in.patch

61f723
From 4182dd8bbff22f9e0e45b763a4619c0bc8dcb153 Mon Sep 17 00:00:00 2001
61f723
From: Mark Reynolds <mreynolds@redhat.com>
61f723
Date: Tue, 9 May 2017 12:31:58 -0400
61f723
Subject: [PATCH] Ticket 49238 - AddressSanitizer: heap-use-after-free in
61f723
 libreplication
61f723
61f723
Bug Description:
61f723
        The bug is detected in csn pending list component, when
61f723
        accessing a csn that has already been freed.
61f723
61f723
        The bug is mostly detectable under ASAN because under normal run
61f723
        the read access to the csn would only crash if the csn was in
61f723
        an unmapped page (that is quite difficult to acheive).
61f723
61f723
        The bug was observed under the following conditions:
61f723
            - very slow machine
61f723
            - all instances running on the same machine
61f723
61f723
        The patch address 2 issues
61f723
61f723
        Issue - 1
61f723
        Under specfic circumstance (failure, like "db_deadlock" during changelog update),
61f723
        the csn was freed but still present in the pending list (fix-1).
61f723
61f723
        Issue - 2
61f723
        Further investigations, showed an other corner case where a
61f723
        replica could be updated by several suppliers in parallel.
61f723
        In such scenario, an update (on one thread-2) with a higher csn (let csn-2)
61f723
        may be applied before an update (on another thread-1) with a smaller
61f723
        csn (let csn-1).
61f723
        csn-2 is freed when thread-2 complete but the csn-2 will remain
61f723
        in the pending list until csn-1 is commited.
61f723
        so followup of pending list may access a csn that was freed
61f723
61f723
Fix Description:
61f723
        Issue - 1
61f723
        The fix in repl5_plugins.c, frees the csn (thread private area)
61f723
        at the condition pending list was roll up for that csn (ruv update).
61f723
61f723
        Issue - 2
61f723
        The fix is in two parts:
61f723
            If a supplier tries to acquire a replica while it is
61f723
        already owner of it, the replica is granted.
61f723
61f723
            If a supplier owns a replica and is asking again for it,
61f723
        but this time the replica is not granted, the replica is release and
61f723
        the supplier disconnected.
61f723
61f723
https://pagure.io/389-ds-base/issue/49238
61f723
61f723
Reviewed by: Mark Reynolds, Ludwig Krispenz, William Brown (thanks to you all !!)
61f723
61f723
Platforms tested: 7.4
61f723
61f723
Flag Day: no
61f723
61f723
Doc impact: no
61f723
---
61f723
 ldap/servers/plugins/replication/repl5.h         |  1 +
61f723
 ldap/servers/plugins/replication/repl5_plugins.c |  7 +++-
61f723
 ldap/servers/plugins/replication/repl5_replica.c | 49 +++++++++++++++++++-----
61f723
 ldap/servers/plugins/replication/repl_extop.c    | 42 ++++++++++++++++++--
61f723
 4 files changed, 86 insertions(+), 13 deletions(-)
61f723
61f723
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
61f723
index c3bd10c..1d8989c 100644
61f723
--- a/ldap/servers/plugins/replication/repl5.h
61f723
+++ b/ldap/servers/plugins/replication/repl5.h
61f723
@@ -549,6 +549,7 @@ void replica_relinquish_exclusive_access(Replica *r, PRUint64 connid, int opid);
61f723
 PRBool replica_get_tombstone_reap_active(const Replica *r);
61f723
 const Slapi_DN *replica_get_root(const Replica *r);
61f723
 const char *replica_get_name(const Replica *r);
61f723
+uint64_t replica_get_locking_conn(const Replica *r);
61f723
 ReplicaId replica_get_rid (const Replica *r);
61f723
 void replica_set_rid (Replica *r, ReplicaId rid);
61f723
 PRBool replica_is_initialized (const Replica *r);
61f723
diff --git a/ldap/servers/plugins/replication/repl5_plugins.c b/ldap/servers/plugins/replication/repl5_plugins.c
61f723
index ebcc230..9ef06af 100644
61f723
--- a/ldap/servers/plugins/replication/repl5_plugins.c
61f723
+++ b/ldap/servers/plugins/replication/repl5_plugins.c
61f723
@@ -1224,7 +1224,12 @@ common_return:
61f723
 	opcsn = operation_get_csn(op);
61f723
 	prim_csn = get_thread_primary_csn();
61f723
 	if (csn_is_equal(opcsn, prim_csn)) {
61f723
-		set_thread_primary_csn(NULL);
61f723
+		if (return_value == 0) {
61f723
+			/* the primary csn was succesfully committed
61f723
+			 * unset it in the thread local data
61f723
+			 */
61f723
+			set_thread_primary_csn(NULL);
61f723
+		}
61f723
 	}
61f723
 	if (repl_obj) {
61f723
 		object_release (repl_obj);
61f723
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
61f723
index a106f8b..1bdc138 100644
61f723
--- a/ldap/servers/plugins/replication/repl5_replica.c
61f723
+++ b/ldap/servers/plugins/replication/repl5_replica.c
61f723
@@ -64,6 +64,7 @@ struct replica {
61f723
 	PRBool state_update_inprogress; /* replica state is being updated */
61f723
 	PRLock *agmt_lock;          	/* protects agreement creation, start and stop */
61f723
 	char *locking_purl;				/* supplier who has exclusive access */
61f723
+	uint64_t locking_conn;         	/* The supplier's connection id */
61f723
 	Slapi_Counter *protocol_timeout;/* protocol shutdown timeout */
61f723
 	Slapi_Counter *backoff_min;		/* backoff retry minimum */
61f723
 	Slapi_Counter *backoff_max;		/* backoff retry maximum */
61f723
@@ -602,19 +603,32 @@ replica_get_exclusive_access(Replica *r, PRBool *isInc, PRUint64 connid, int opi
61f723
 				slapi_sdn_get_dn(r->repl_root),
61f723
 				r->locking_purl ? r->locking_purl : "unknown");
61f723
 		rval = PR_FALSE;
61f723
+		if (!(r->repl_state_flags & REPLICA_TOTAL_IN_PROGRESS)) {
61f723
+			/* inc update */
61f723
+			if (r->locking_purl && r->locking_conn == connid) {
61f723
+				/* This is the same supplier connection, reset the replica
61f723
+				 * purl, and return success */
61f723
+				slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name,
61f723
+					"replica_get_exclusive_access - "
61f723
+					"This is a second acquire attempt from the same replica connection "
61f723
+					" - return success instead of busy\n");
61f723
+				slapi_ch_free_string(&r->locking_purl);
61f723
+				r->locking_purl = slapi_ch_strdup(locking_purl);
61f723
+				rval = PR_TRUE;
61f723
+				goto done;
61f723
+			}
61f723
+			if (replica_get_release_timeout(r)) {
61f723
+				/*
61f723
+				 * Abort the current session so other replicas can acquire
61f723
+				 * this server.
61f723
+				 */
61f723
+				r->abort_session = ABORT_SESSION;
61f723
+			}
61f723
+		}
61f723
 		if (current_purl)
61f723
 		{
61f723
 			*current_purl = slapi_ch_strdup(r->locking_purl);
61f723
 		}
61f723
-		if (!(r->repl_state_flags & REPLICA_TOTAL_IN_PROGRESS) &&
61f723
-		    replica_get_release_timeout(r))
61f723
-		{
61f723
-			/*
61f723
-			 * We are not doing a total update, so abort the current session
61f723
-			 * so other replicas can acquire this server.
61f723
-			 */
61f723
-			r->abort_session = ABORT_SESSION;
61f723
-		}
61f723
 	}
61f723
 	else
61f723
 	{
61f723
@@ -642,7 +656,9 @@ replica_get_exclusive_access(Replica *r, PRBool *isInc, PRUint64 connid, int opi
61f723
 		}
61f723
 		slapi_ch_free_string(&r->locking_purl);
61f723
 		r->locking_purl = slapi_ch_strdup(locking_purl);
61f723
+		r->locking_conn = connid;
61f723
 	}
61f723
+done:
61f723
 	replica_unlock(r->repl_lock);
61f723
 	return rval;
61f723
 }
61f723
@@ -720,6 +736,18 @@ replica_get_name(const Replica *r) /* ONREPL - should we return copy instead? */
61f723
     return(r->repl_name);
61f723
 }
61f723
 
61f723
+/*
61f723
+ * Returns locking_conn of this replica
61f723
+ */
61f723
+uint64_t
61f723
+replica_get_locking_conn(const Replica *r)
61f723
+{
61f723
+	uint64_t connid;
61f723
+	replica_lock(r->repl_lock);
61f723
+	connid = r->locking_conn;
61f723
+	replica_unlock(r->repl_lock);
61f723
+	return connid;
61f723
+}
61f723
 /* 
61f723
  * Returns replicaid of this replica 
61f723
  */
61f723
@@ -2251,6 +2279,9 @@ _replica_init_from_config (Replica *r, Slapi_Entry *e, char *errortext)
61f723
     }
61f723
 
61f723
     r->tombstone_reap_stop = r->tombstone_reap_active = PR_FALSE;
61f723
+    
61f723
+    /* No supplier holding the replica */
61f723
+    r->locking_conn = ULONG_MAX;
61f723
 
61f723
     return (_replica_check_validity (r));
61f723
 }
61f723
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
61f723
index 412caec..a39d918 100644
61f723
--- a/ldap/servers/plugins/replication/repl_extop.c
61f723
+++ b/ldap/servers/plugins/replication/repl_extop.c
61f723
@@ -1138,9 +1138,45 @@ send_response:
61f723
 		 */
61f723
 		if (NULL != connext && NULL != connext->replica_acquired)
61f723
 		{
61f723
-            Object *r_obj = (Object*)connext->replica_acquired;
61f723
-			replica_relinquish_exclusive_access((Replica*)object_get_data (r_obj),
61f723
-												connid, opid);
61f723
+			Replica *r = (Replica*)object_get_data ((Object*)connext->replica_acquired);
61f723
+			uint64_t r_locking_conn;
61f723
+			
61f723
+			/* At this point the supplier runs a Replica Agreement for 
61f723
+			 * the specific replica connext->replica_acquired.
61f723
+			 * The RA does not know it holds the replica (because it is
61f723
+			 * sending this request).
61f723
+			 * The situation is confused
61f723
+			 */
61f723
+			slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
61f723
+				"already acquired replica: replica not ready (%d) (replica=%s)\n", response, replica_get_name(r) ? replica_get_name(r) : "no name");
61f723
+			
61f723
+			/*
61f723
+			 * On consumer side, we release the exclusive access at the
61f723
+			 * condition this is this RA that holds the replica
61f723
+			 */
61f723
+			if (r) {
61f723
+				
61f723
+				r_locking_conn = replica_get_locking_conn(r);
61f723
+				slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
61f723
+				"already acquired replica: locking_conn=%d, current connid=%d\n", (int) r_locking_conn, (int) connid);
61f723
+				
61f723
+				if ((r_locking_conn != ULONG_MAX) && (r_locking_conn == connid)) {
61f723
+					replica_relinquish_exclusive_access(r, connid, opid);
61f723
+					object_release((Object*) connext->replica_acquired);
61f723
+					connext->replica_acquired = NULL;
61f723
+				}
61f723
+			}
61f723
+			/*
61f723
+			 * On consumer side we should not keep a incoming connection
61f723
+			 * with replica_acquired set although the supplier is not aware of
61f723
+			 * 
61f723
+			 * On the supplier, we need to close the connection so
61f723
+			 * that the RA will restart a new session in a clear state 
61f723
+			 */
61f723
+			slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
61f723
+				"already acquired replica: disconnect conn=%d\n", connid);
61f723
+			slapi_disconnect_server(conn);
61f723
+            
61f723
 		}
61f723
 		/* Remove any flags that would indicate repl session in progress */
61f723
 		if (NULL != connext)
61f723
-- 
61f723
2.9.4
61f723