Blame SOURCES/0043-Ticket-49238-AddressSanitizer-heap-use-after-free-in.patch

b69e47
From 4182dd8bbff22f9e0e45b763a4619c0bc8dcb153 Mon Sep 17 00:00:00 2001
b69e47
From: Mark Reynolds <mreynolds@redhat.com>
b69e47
Date: Tue, 9 May 2017 12:31:58 -0400
b69e47
Subject: [PATCH] Ticket 49238 - AddressSanitizer: heap-use-after-free in
b69e47
 libreplication
b69e47
b69e47
Bug Description:
b69e47
        The bug is detected in csn pending list component, when
b69e47
        accessing a csn that has already been freed.
b69e47
b69e47
        The bug is mostly detectable under ASAN because under normal run
b69e47
        the read access to the csn would only crash if the csn was in
b69e47
        an unmapped page (that is quite difficult to acheive).
b69e47
b69e47
        The bug was observed under the following conditions:
b69e47
            - very slow machine
b69e47
            - all instances running on the same machine
b69e47
b69e47
        The patch address 2 issues
b69e47
b69e47
        Issue - 1
b69e47
        Under specfic circumstance (failure, like "db_deadlock" during changelog update),
b69e47
        the csn was freed but still present in the pending list (fix-1).
b69e47
b69e47
        Issue - 2
b69e47
        Further investigations, showed an other corner case where a
b69e47
        replica could be updated by several suppliers in parallel.
b69e47
        In such scenario, an update (on one thread-2) with a higher csn (let csn-2)
b69e47
        may be applied before an update (on another thread-1) with a smaller
b69e47
        csn (let csn-1).
b69e47
        csn-2 is freed when thread-2 complete but the csn-2 will remain
b69e47
        in the pending list until csn-1 is commited.
b69e47
        so followup of pending list may access a csn that was freed
b69e47
b69e47
Fix Description:
b69e47
        Issue - 1
b69e47
        The fix in repl5_plugins.c, frees the csn (thread private area)
b69e47
        at the condition pending list was roll up for that csn (ruv update).
b69e47
b69e47
        Issue - 2
b69e47
        The fix is in two parts:
b69e47
            If a supplier tries to acquire a replica while it is
b69e47
        already owner of it, the replica is granted.
b69e47
b69e47
            If a supplier owns a replica and is asking again for it,
b69e47
        but this time the replica is not granted, the replica is release and
b69e47
        the supplier disconnected.
b69e47
b69e47
https://pagure.io/389-ds-base/issue/49238
b69e47
b69e47
Reviewed by: Mark Reynolds, Ludwig Krispenz, William Brown (thanks to you all !!)
b69e47
b69e47
Platforms tested: 7.4
b69e47
b69e47
Flag Day: no
b69e47
b69e47
Doc impact: no
b69e47
---
b69e47
 ldap/servers/plugins/replication/repl5.h         |  1 +
b69e47
 ldap/servers/plugins/replication/repl5_plugins.c |  7 +++-
b69e47
 ldap/servers/plugins/replication/repl5_replica.c | 49 +++++++++++++++++++-----
b69e47
 ldap/servers/plugins/replication/repl_extop.c    | 42 ++++++++++++++++++--
b69e47
 4 files changed, 86 insertions(+), 13 deletions(-)
b69e47
b69e47
diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h
b69e47
index c3bd10c..1d8989c 100644
b69e47
--- a/ldap/servers/plugins/replication/repl5.h
b69e47
+++ b/ldap/servers/plugins/replication/repl5.h
b69e47
@@ -549,6 +549,7 @@ void replica_relinquish_exclusive_access(Replica *r, PRUint64 connid, int opid);
b69e47
 PRBool replica_get_tombstone_reap_active(const Replica *r);
b69e47
 const Slapi_DN *replica_get_root(const Replica *r);
b69e47
 const char *replica_get_name(const Replica *r);
b69e47
+uint64_t replica_get_locking_conn(const Replica *r);
b69e47
 ReplicaId replica_get_rid (const Replica *r);
b69e47
 void replica_set_rid (Replica *r, ReplicaId rid);
b69e47
 PRBool replica_is_initialized (const Replica *r);
b69e47
diff --git a/ldap/servers/plugins/replication/repl5_plugins.c b/ldap/servers/plugins/replication/repl5_plugins.c
b69e47
index ebcc230..9ef06af 100644
b69e47
--- a/ldap/servers/plugins/replication/repl5_plugins.c
b69e47
+++ b/ldap/servers/plugins/replication/repl5_plugins.c
b69e47
@@ -1224,7 +1224,12 @@ common_return:
b69e47
 	opcsn = operation_get_csn(op);
b69e47
 	prim_csn = get_thread_primary_csn();
b69e47
 	if (csn_is_equal(opcsn, prim_csn)) {
b69e47
-		set_thread_primary_csn(NULL);
b69e47
+		if (return_value == 0) {
b69e47
+			/* the primary csn was succesfully committed
b69e47
+			 * unset it in the thread local data
b69e47
+			 */
b69e47
+			set_thread_primary_csn(NULL);
b69e47
+		}
b69e47
 	}
b69e47
 	if (repl_obj) {
b69e47
 		object_release (repl_obj);
b69e47
diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c
b69e47
index a106f8b..1bdc138 100644
b69e47
--- a/ldap/servers/plugins/replication/repl5_replica.c
b69e47
+++ b/ldap/servers/plugins/replication/repl5_replica.c
b69e47
@@ -64,6 +64,7 @@ struct replica {
b69e47
 	PRBool state_update_inprogress; /* replica state is being updated */
b69e47
 	PRLock *agmt_lock;          	/* protects agreement creation, start and stop */
b69e47
 	char *locking_purl;				/* supplier who has exclusive access */
b69e47
+	uint64_t locking_conn;         	/* The supplier's connection id */
b69e47
 	Slapi_Counter *protocol_timeout;/* protocol shutdown timeout */
b69e47
 	Slapi_Counter *backoff_min;		/* backoff retry minimum */
b69e47
 	Slapi_Counter *backoff_max;		/* backoff retry maximum */
b69e47
@@ -602,19 +603,32 @@ replica_get_exclusive_access(Replica *r, PRBool *isInc, PRUint64 connid, int opi
b69e47
 				slapi_sdn_get_dn(r->repl_root),
b69e47
 				r->locking_purl ? r->locking_purl : "unknown");
b69e47
 		rval = PR_FALSE;
b69e47
+		if (!(r->repl_state_flags & REPLICA_TOTAL_IN_PROGRESS)) {
b69e47
+			/* inc update */
b69e47
+			if (r->locking_purl && r->locking_conn == connid) {
b69e47
+				/* This is the same supplier connection, reset the replica
b69e47
+				 * purl, and return success */
b69e47
+				slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name,
b69e47
+					"replica_get_exclusive_access - "
b69e47
+					"This is a second acquire attempt from the same replica connection "
b69e47
+					" - return success instead of busy\n");
b69e47
+				slapi_ch_free_string(&r->locking_purl);
b69e47
+				r->locking_purl = slapi_ch_strdup(locking_purl);
b69e47
+				rval = PR_TRUE;
b69e47
+				goto done;
b69e47
+			}
b69e47
+			if (replica_get_release_timeout(r)) {
b69e47
+				/*
b69e47
+				 * Abort the current session so other replicas can acquire
b69e47
+				 * this server.
b69e47
+				 */
b69e47
+				r->abort_session = ABORT_SESSION;
b69e47
+			}
b69e47
+		}
b69e47
 		if (current_purl)
b69e47
 		{
b69e47
 			*current_purl = slapi_ch_strdup(r->locking_purl);
b69e47
 		}
b69e47
-		if (!(r->repl_state_flags & REPLICA_TOTAL_IN_PROGRESS) &&
b69e47
-		    replica_get_release_timeout(r))
b69e47
-		{
b69e47
-			/*
b69e47
-			 * We are not doing a total update, so abort the current session
b69e47
-			 * so other replicas can acquire this server.
b69e47
-			 */
b69e47
-			r->abort_session = ABORT_SESSION;
b69e47
-		}
b69e47
 	}
b69e47
 	else
b69e47
 	{
b69e47
@@ -642,7 +656,9 @@ replica_get_exclusive_access(Replica *r, PRBool *isInc, PRUint64 connid, int opi
b69e47
 		}
b69e47
 		slapi_ch_free_string(&r->locking_purl);
b69e47
 		r->locking_purl = slapi_ch_strdup(locking_purl);
b69e47
+		r->locking_conn = connid;
b69e47
 	}
b69e47
+done:
b69e47
 	replica_unlock(r->repl_lock);
b69e47
 	return rval;
b69e47
 }
b69e47
@@ -720,6 +736,18 @@ replica_get_name(const Replica *r) /* ONREPL - should we return copy instead? */
b69e47
     return(r->repl_name);
b69e47
 }
b69e47
 
b69e47
+/*
b69e47
+ * Returns locking_conn of this replica
b69e47
+ */
b69e47
+uint64_t
b69e47
+replica_get_locking_conn(const Replica *r)
b69e47
+{
b69e47
+	uint64_t connid;
b69e47
+	replica_lock(r->repl_lock);
b69e47
+	connid = r->locking_conn;
b69e47
+	replica_unlock(r->repl_lock);
b69e47
+	return connid;
b69e47
+}
b69e47
 /* 
b69e47
  * Returns replicaid of this replica 
b69e47
  */
b69e47
@@ -2251,6 +2279,9 @@ _replica_init_from_config (Replica *r, Slapi_Entry *e, char *errortext)
b69e47
     }
b69e47
 
b69e47
     r->tombstone_reap_stop = r->tombstone_reap_active = PR_FALSE;
b69e47
+    
b69e47
+    /* No supplier holding the replica */
b69e47
+    r->locking_conn = ULONG_MAX;
b69e47
 
b69e47
     return (_replica_check_validity (r));
b69e47
 }
b69e47
diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c
b69e47
index 412caec..a39d918 100644
b69e47
--- a/ldap/servers/plugins/replication/repl_extop.c
b69e47
+++ b/ldap/servers/plugins/replication/repl_extop.c
b69e47
@@ -1138,9 +1138,45 @@ send_response:
b69e47
 		 */
b69e47
 		if (NULL != connext && NULL != connext->replica_acquired)
b69e47
 		{
b69e47
-            Object *r_obj = (Object*)connext->replica_acquired;
b69e47
-			replica_relinquish_exclusive_access((Replica*)object_get_data (r_obj),
b69e47
-												connid, opid);
b69e47
+			Replica *r = (Replica*)object_get_data ((Object*)connext->replica_acquired);
b69e47
+			uint64_t r_locking_conn;
b69e47
+			
b69e47
+			/* At this point the supplier runs a Replica Agreement for 
b69e47
+			 * the specific replica connext->replica_acquired.
b69e47
+			 * The RA does not know it holds the replica (because it is
b69e47
+			 * sending this request).
b69e47
+			 * The situation is confused
b69e47
+			 */
b69e47
+			slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
b69e47
+				"already acquired replica: replica not ready (%d) (replica=%s)\n", response, replica_get_name(r) ? replica_get_name(r) : "no name");
b69e47
+			
b69e47
+			/*
b69e47
+			 * On consumer side, we release the exclusive access at the
b69e47
+			 * condition this is this RA that holds the replica
b69e47
+			 */
b69e47
+			if (r) {
b69e47
+				
b69e47
+				r_locking_conn = replica_get_locking_conn(r);
b69e47
+				slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
b69e47
+				"already acquired replica: locking_conn=%d, current connid=%d\n", (int) r_locking_conn, (int) connid);
b69e47
+				
b69e47
+				if ((r_locking_conn != ULONG_MAX) && (r_locking_conn == connid)) {
b69e47
+					replica_relinquish_exclusive_access(r, connid, opid);
b69e47
+					object_release((Object*) connext->replica_acquired);
b69e47
+					connext->replica_acquired = NULL;
b69e47
+				}
b69e47
+			}
b69e47
+			/*
b69e47
+			 * On consumer side we should not keep a incoming connection
b69e47
+			 * with replica_acquired set although the supplier is not aware of
b69e47
+			 * 
b69e47
+			 * On the supplier, we need to close the connection so
b69e47
+			 * that the RA will restart a new session in a clear state 
b69e47
+			 */
b69e47
+			slapi_log_err(SLAPI_LOG_REPL, repl_plugin_name, "multimaster_extop_StartNSDS50ReplicationRequest - "
b69e47
+				"already acquired replica: disconnect conn=%d\n", connid);
b69e47
+			slapi_disconnect_server(conn);
b69e47
+            
b69e47
 		}
b69e47
 		/* Remove any flags that would indicate repl session in progress */
b69e47
 		if (NULL != connext)
b69e47
-- 
b69e47
2.9.4
b69e47