Blob Blame Raw
From 5391c666e58af5841eab88c98505f99c8ed20d6b Mon Sep 17 00:00:00 2001
From: Thierry Bordaz <tbordaz@redhat.com>
Date: Tue, 10 Jan 2017 14:32:53 +0100
Subject: [PATCH 66/67] Ticket 49079: deadlock on cos cache rebuild

Bug Description:
    To rebuild the cache cos_cache_creation the thread gets cos definitions from backend.
    It means change_lock is held then cos_cache_creation will acquire some backend pages.

    A deadlock can happen if cos_post_op is called while backend is locked.
    For example if a bepreop (urp) does an internal update on a cos definition.
    Then the thread holds backend pages, that will be needed by cos_cache_creation,
    and will acquire change_lock for notification of the cos_cache thread

Fix Description:

    Let cos cache rebuild thread run without holding change_lock.
    The lock prevents parallel run but a flag can do the same.

https://fedorahosted.org/389/ticket/49079

Reviewed by: William Brown and Ludwig Krispenz (thanks to you both !!)

Platforms tested: F23

Flag Day: no

Doc impact: no

(cherry picked from commit ac44337bd97fe63071e7d83e9dcd788f2af1feab)
(cherry picked from commit 3ac12cb94a8873b0fa4ddb12f924cc58bd9c9872)
---
 ldap/servers/plugins/cos/cos_cache.c | 73 ++++++++++++++++++++++++++++++------
 1 file changed, 61 insertions(+), 12 deletions(-)

diff --git a/ldap/servers/plugins/cos/cos_cache.c b/ldap/servers/plugins/cos/cos_cache.c
index 8a32630..87b4ba5 100644
--- a/ldap/servers/plugins/cos/cos_cache.c
+++ b/ldap/servers/plugins/cos/cos_cache.c
@@ -111,7 +111,9 @@ void * cos_get_plugin_identity();
 /* the global plugin handle */
 static volatile vattr_sp_handle *vattr_handle = NULL;
 
+/* both variables are protected by change_lock */
 static int cos_cache_notify_flag = 0;
+static PRBool cos_cache_at_work = PR_FALSE;
 
 /* service definition cache structs */
 
@@ -199,7 +201,8 @@ typedef struct _cos_cache cosCache;
 static cosCache *pCache; /* always the current global cache, only use getref to get */
 
 /* the place to start if you want a new cache */
-static int cos_cache_create();
+static int cos_cache_create_unlock(void);
+static int cos_cache_creation_lock(void);
 
 /* cache index related functions */
 static int cos_cache_index_all(cosCache *pCache);
@@ -386,7 +389,7 @@ static void cos_cache_wait_on_change(void *arg)
 	pCache = 0;
 
 	/* create initial cache */
-	cos_cache_create();
+	cos_cache_creation_lock();
 
         slapi_lock_mutex(start_lock);
         started = 1;
@@ -419,7 +422,7 @@ static void cos_cache_wait_on_change(void *arg)
 		 * before we go running off doing lots of stuff lets check if we should stop
 		*/
 		if(keeprunning) {
-			cos_cache_create();				
+			cos_cache_creation_lock();
 		}
 		cos_cache_notify_flag = 0; /* Dealt with it */
 	}/* while */
@@ -431,22 +434,25 @@ static void cos_cache_wait_on_change(void *arg)
 	LDAPDebug( LDAP_DEBUG_TRACE, "<-- cos_cache_wait_on_change thread exit\n",0,0,0);
 }
 
+
 /*
-	cos_cache_create
+	cos_cache_create_unlock
 	---------------------
 	Walks the definitions in the DIT and creates the cache.
 	Once created, it swaps the new cache for the old one,
 	releasing its refcount to the old cache and allowing it
 	to be destroyed.
+
+        called while change_lock is NOT held
 */
-static int cos_cache_create()
+static int cos_cache_create_unlock(void)
 {
 	int ret = -1;
 	cosCache *pNewCache;
 	static int firstTime = 1;
 	int cache_built = 0;
 
-	LDAPDebug( LDAP_DEBUG_TRACE, "--> cos_cache_create\n",0,0,0);
+	LDAPDebug( LDAP_DEBUG_TRACE, "--> cos_cache_create_unlock\n",0,0,0);
 
 	pNewCache = (cosCache*)slapi_ch_malloc(sizeof(cosCache));
 	if(pNewCache)
@@ -509,21 +515,21 @@ static int cos_cache_create()
 				{
 					/* we should not go on without proper schema checking */
 					cos_cache_release(pNewCache);
-					LDAPDebug( LDAP_DEBUG_ANY, "cos_cache_create: failed to cache the schema\n",0,0,0);			
+					LDAPDebug( LDAP_DEBUG_ANY, "cos_cache_create_unlock: failed to cache the schema\n",0,0,0);			
 				}
 			}
 			else
 			{
 				/* currently we cannot go on without the indexes */
 				cos_cache_release(pNewCache);
-				LDAPDebug( LDAP_DEBUG_ANY, "cos_cache_create: failed to index cache\n",0,0,0);			
+				LDAPDebug( LDAP_DEBUG_ANY, "cos_cache_create_unlock: failed to index cache\n",0,0,0);			
 			}
 		}
 		else
 		{
 			if(firstTime)
 			{
-				LDAPDebug( LDAP_DEBUG_PLUGIN, "cos_cache_create: cos disabled\n",0,0,0);
+				LDAPDebug( LDAP_DEBUG_PLUGIN, "cos_cache_create_unlock: cos disabled\n",0,0,0);
 				firstTime = 0;
 			}
 
@@ -531,7 +537,7 @@ static int cos_cache_create()
 		}
 	}
 	else
-		LDAPDebug( LDAP_DEBUG_ANY, "cos_cache_create: memory allocation failure\n",0,0,0);
+		LDAPDebug( LDAP_DEBUG_ANY, "cos_cache_create_unlock: memory allocation failure\n",0,0,0);
 
 
 	/* make sure we have a new cache */
@@ -563,10 +569,53 @@ static int cos_cache_create()
 		
 	}
 
-	LDAPDebug( LDAP_DEBUG_TRACE, "<-- cos_cache_create\n",0,0,0);
+	LDAPDebug( LDAP_DEBUG_TRACE, "<-- cos_cache_create_unlock\n",0,0,0);
 	return ret;
 }
 
+/* cos_cache_creation_lock is called with change_lock being hold:
+ *    slapi_lock_mutex(change_lock)
+ *
+ * To rebuild the cache cos_cache_creation gets cos definitions from backend, that
+ * means change_lock is held then cos_cache_creation will acquire some backend pages.
+ *
+ * A deadlock can happen if cos_post_op is called while backend is locked. 
+ * For example if a bepreop (urp) does an internal update on a cos definition,
+ * the thread holds backend pages that will be needed by cos_cache_creation.
+ *
+ * A solution is to use a flag 'cos_cache_at_work' protected by change_lock,
+ * release change_lock, recreate the cos_cache, acquire change_lock reset the flag.
+ *
+ * returned value: result of cos_cache_create_unlock
+ *
+ */
+static int cos_cache_creation_lock(void)
+{
+	int ret = -1;
+	int max_tries = 10;
+
+	for (; max_tries != 0; max_tries--) {
+		/* if the cos_cache is already under work (cos_cache_create_unlock)
+		 * wait 1 second
+		 */
+		if (cos_cache_at_work) {
+			slapi_log_error(SLAPI_LOG_FATAL, COS_PLUGIN_SUBSYSTEM, "--> cos_cache_creation_lock already rebuilding cos_cache... retry\n");
+			DS_Sleep (PR_MillisecondsToInterval(1000));
+			continue;
+		}
+		cos_cache_at_work = PR_TRUE;
+		slapi_unlock_mutex(change_lock);
+		ret = cos_cache_create_unlock();
+		slapi_lock_mutex(change_lock);
+		cos_cache_at_work = PR_FALSE;
+		break;
+	}
+	if (!max_tries) {
+		slapi_log_error(SLAPI_LOG_FATAL, COS_PLUGIN_SUBSYSTEM, "--> cos_cache_creation_lock  rebuilt was to long, skip this rebuild\n");
+	}
+
+	return ret;
+}
 
 /*
 	cos_cache_build_definition_list
@@ -1639,7 +1688,7 @@ int cos_cache_getref(cos_cache **pptheCache)
 		slapi_lock_mutex(change_lock);
 		if(pCache == NULL)
 		{
-			if(cos_cache_create())
+			if(cos_cache_creation_lock())
 			{
 				/* there was a problem or no COS definitions were found */
 				LDAPDebug( LDAP_DEBUG_PLUGIN, "cos_cache_getref: no cos cache created\n",0,0,0);
-- 
2.9.3