12a457
From d5858adbaa138383bfa17855fec95c59c989a6c0 Mon Sep 17 00:00:00 2001
12a457
From: Kaleb S KEITHLEY <kkeithle@redhat.com>
12a457
Date: Wed, 1 Jun 2016 16:50:08 -0400
12a457
Subject: [PATCH 183/183] common-ha: race/timing issue setting up cluster
12a457
12a457
The ganesha_grace resource agent can start before the ganesha_mon
12a457
resource agent, with the result that the crm_attribute that
12a457
ganesha_grace expects to find has not been created yet.
12a457
12a457
This has never been seen with four nodes (or it happens so rarely
12a457
that it was never actually observed during development), but with
12a457
just two nodes it's very repeatable.
12a457
12a457
Note that when long (FQDN) names are used, it is not unusual to
12a457
see Failed Actions in the output of `pcs status`, e.g.:
12a457
12a457
* nfs-grace_monitor_5000 on node1.fully.qualified.domain.name.com
12a457
'unknown error' (1): call=20, status=complete, exitreason='none',
12a457
last-rc-change='Wed Jun  1 12:32:32 2016', queued=0ms, exec=0ms
12a457
* nfs-grace_monitor_5000 on node2.fully.qualified.domain.name.com
12a457
'unknown error' (1): call=18, status=complete, exitreason='none',
12a457
last-rc-change='Wed Jun  1 12:32:42 2016', queued=0ms, exec=0ms
12a457
12a457
As long as all the ganesha_grace_clone and cluster_ip-1
12a457
resource agents are in the Started state, this is okay.
12a457
12a457
backport master:
12a457
> http://review.gluster.org/14607
12a457
> BUG: 1341768
12a457
release-3.8
12a457
> http://review.gluster.org/14609
12a457
> BUG: 1341770
12a457
release-3.7
12a457
> http://review.gluster.org/14610
12a457
> BUG: 1341772
12a457
> Change-Id: I726c9946ceb1ca92872b321612eb0f4c3cc039d8
12a457
12a457
Change-Id: I62db41e46af6cfeba546f96a84d0e19d98e06ff5
12a457
BUG: 1341567
12a457
Signed-off-by: Kaleb S KEITHLEY <kkeithle@redhat.com>
12a457
Reviewed-on: https://code.engineering.redhat.com/gerrit/75679
12a457
Reviewed-by: Jiffin Thottan <jthottan@redhat.com>
12a457
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
12a457
---
12a457
 extras/ganesha/ocf/ganesha_grace     |   25 +++++++++++++++----------
12a457
 extras/ganesha/ocf/ganesha_mon       |   33 ++++++++++++++++++---------------
12a457
 extras/ganesha/scripts/ganesha-ha.sh |    5 +++++
12a457
 3 files changed, 38 insertions(+), 25 deletions(-)
12a457
12a457
diff --git a/extras/ganesha/ocf/ganesha_grace b/extras/ganesha/ocf/ganesha_grace
12a457
index 21b95dd..84202df 100644
12a457
--- a/extras/ganesha/ocf/ganesha_grace
12a457
+++ b/extras/ganesha/ocf/ganesha_grace
12a457
@@ -94,15 +94,20 @@ esac
12a457
 ganesha_grace_start()
12a457
 {
12a457
 	local rc=${OCF_ERR_GENERIC}
12a457
-	local short_host=$(hostname -s)
12a457
-	local long_host=$(hostname)
12a457
+	local host=$(hostname -s)
12a457
 
12a457
 	ocf_log debug "ganesha_grace_start()"
12a457
-	attr=$(crm_attribute --query --node=${short_host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
12a457
+	# give ganesha_mon RA a chance to set the crm_attr first
12a457
+	# I mislike the sleep, but it's not clear that looping
12a457
+	# with a small sleep is necessarily better
12a457
+	# start has a 40sec timeout, so a 5sec sleep here is okay
12a457
+        sleep 5
12a457
+	attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
12a457
         if [ $? -ne 0 ]; then
12a457
-	        attr=$(crm_attribute --query --node=${long_host} --name=${OCF_RESKEY_grace_active} 2> /dev/null )
12a457
+		host=$(hostname)
12a457
+		attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null )
12a457
                 if [ $? -ne 0 ]; then
12a457
-	                ocf_log info "crm_attribute --query --node=${short_host} --name=${OCF_RESKEY_grace_active} failed"
12a457
+	                ocf_log info "grace start: crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
12a457
                 fi
12a457
         fi
12a457
 
12a457
@@ -160,16 +165,16 @@ ganesha_grace_notify()
12a457
 ganesha_grace_monitor()
12a457
 {
12a457
 	local rc=${OCF_ERR_GENERIC}
12a457
-	local short_host=$(hostname -s)
12a457
-	local long_host=$(hostname)
12a457
+	local host=$(hostname -s)
12a457
 
12a457
 	ocf_log debug "monitor"
12a457
 
12a457
-	attr=$(crm_attribute --query --node=${short_host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
12a457
+	attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
12a457
         if [ $? -ne 0 ]; then
12a457
-	        attr=$(crm_attribute --query --node=${long_host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
12a457
+		host=$(hostname)
12a457
+	        attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
12a457
                 if [ $? -ne 0 ]; then
12a457
-	                ocf_log info "crm_attribute --query --node=${short_host} --name=${OCF_RESKEY_grace_active} failed"
12a457
+	                ocf_log info "crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
12a457
                 fi
12a457
         fi
12a457
 
12a457
diff --git a/extras/ganesha/ocf/ganesha_mon b/extras/ganesha/ocf/ganesha_mon
12a457
index 974eb86..7d0eb6b 100644
12a457
--- a/extras/ganesha/ocf/ganesha_mon
12a457
+++ b/extras/ganesha/ocf/ganesha_mon
12a457
@@ -124,8 +124,7 @@ ganesha_mon_stop()
12a457
 
12a457
 ganesha_mon_monitor()
12a457
 {
12a457
-	local short_host=$(hostname -s)
12a457
-	local long_host=$(hostname)
12a457
+	local host=$(hostname -s)
12a457
 	local pid_file="/var/run/ganesha.nfsd.pid"
12a457
 
12a457
 	# RHEL6 /etc/init.d/nfs-ganesha adds -p /var/run/ganesha.nfsd.pid
12a457
@@ -154,13 +153,15 @@ ganesha_mon_monitor()
12a457
 		# track grace-active crm_attr (attr != crm_attr)
12a457
 		# we can't just use the attr as there's no way to query
12a457
 		# its value in RHEL6 pacemaker
12a457
-		crm_attribute --node=${short_host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
12a457
-                if [ $? -ne 0 ]; then
12a457
-		        crm_attribute --node=${long_host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
12a457
-		        if [ $? -ne 0 ]; then
12a457
-		        	ocf_log info "warning: crm_attribute --node=${short_host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 failed"
12a457
-		        fi
12a457
-                fi
12a457
+
12a457
+		crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
12a457
+		if [ $? -ne 0 ]; then
12a457
+			host=$(hostname)
12a457
+			crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
12a457
+			if [ $? -ne 0 ]; then
12a457
+				ocf_log info "mon monitor warning: crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 failed"
12a457
+			fi
12a457
+		fi
12a457
 
12a457
 		return ${OCF_SUCCESS}
12a457
 	fi
12a457
@@ -182,13 +183,15 @@ ganesha_mon_monitor()
12a457
 		ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_grace_active} failed"
12a457
 	fi
12a457
 
12a457
-	crm_attribute --node=${short_host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
12a457
+	host=$(hostname -s)
12a457
+	crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
12a457
 	if [ $? -ne 0 ]; then
12a457
-	        crm_attribute --node=${long_host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
12a457
-	        if [ $? -ne 0 ]; then
12a457
-		        ocf_log info "warning: crm_attribute --node=${short_host} --name=${OCF_RESKEY_grace_active} --update=0 failed"
12a457
-	        fi
12a457
-        fi
12a457
+		host=$(hostname)
12a457
+		crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
12a457
+		if [ $? -ne 0 ]; then
12a457
+			ocf_log info "mon monitor warning: crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 failed"
12a457
+		fi
12a457
+	fi
12a457
 
12a457
 	sleep ${OCF_RESKEY_grace_delay}
12a457
 
12a457
diff --git a/extras/ganesha/scripts/ganesha-ha.sh b/extras/ganesha/scripts/ganesha-ha.sh
12a457
index f0796d2..f43dbf4 100644
12a457
--- a/extras/ganesha/scripts/ganesha-ha.sh
12a457
+++ b/extras/ganesha/scripts/ganesha-ha.sh
12a457
@@ -511,6 +511,11 @@ setup_create_resources()
12a457
         logger "warning: pcs resource create nfs-mon ocf:heartbeat:ganesha_mon --clone failed"
12a457
     fi
12a457
 
12a457
+    # see comment in (/usr/lib/ocf/resource.d/heartbeat/ganesha_grace
12a457
+    # start method. Allow time for ganesha_mon to start and set the
12a457
+    # ganesha-active crm_attribute
12a457
+    sleep 5
12a457
+
12a457
     pcs resource create nfs-grace ocf:heartbeat:ganesha_grace --clone meta notify=true
12a457
     if [ $? -ne 0 ]; then
12a457
         logger "warning: pcs resource create nfs-grace ocf:heartbeat:ganesha_grace --clone failed"
12a457
-- 
12a457
1.7.1
12a457