Blob Blame History Raw
From 053bb9c7356eae82b1089582bb2844388ae4df57 Mon Sep 17 00:00:00 2001
From: "Kaleb S. KEITHLEY" <kkeithle@redhat.com>
Date: Wed, 2 Jun 2021 07:49:12 -0400
Subject: [PATCH 550/584] common-ha: stability fixes for ganesha_grace and
 ganesha_mon RAs

Include fixes suggested by ClusterHA devs.

1) It turns out that crm_attribute attrs and attrd_updater attrs really
are one and the same, despite what I was told years ago.

Attrs created with crm_attribute ... --lifetime=reboot ... and attrs
created with attrd_updater are one and the same. As per the ClusterHA
devs, having an attr created with crm_attribute ... --lifetime=forever
while also creating/updating the same attr with attrd_updater is a
recipe for weird things to happen that will be difficult to debug.

2) Using hostname -s or hostname for node names in crm_attribute and
attrd_updater could potentially use the wrong name if the host has
been renamed; use ocf_local_nodename() (in ocf-shellfuncs) instead.

https://github.com/gluster/glusterfs/issues/2276
https://github.com/gluster/glusterfs/pull/2283
commit 9bd2c697686ec40e2c4f711df961860c8a735baa

Change-Id:If572d396fae9206628714fb2ce00f72e94f2258f
BUG: 1945143
Signed-off-by: Kaleb S. KEITHLEY <kkeithle@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244593
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 extras/ganesha/ocf/ganesha_grace | 28 +++++++++---------------
 extras/ganesha/ocf/ganesha_mon   | 47 ++++++++++++++--------------------------
 2 files changed, 26 insertions(+), 49 deletions(-)

diff --git a/extras/ganesha/ocf/ganesha_grace b/extras/ganesha/ocf/ganesha_grace
index 825f716..edc6fa2 100644
--- a/extras/ganesha/ocf/ganesha_grace
+++ b/extras/ganesha/ocf/ganesha_grace
@@ -94,25 +94,21 @@ esac
 ganesha_grace_start()
 {
 	local rc=${OCF_ERR_GENERIC}
-	local host=$(hostname -s)
+	local host=$(ocf_local_nodename)
 
-	ocf_log debug "ganesha_grace_start()"
-	# give ganesha_mon RA a chance to set the crm_attr first
+	ocf_log debug "ganesha_grace_start ${host}"
+	# give ganesha_mon RA a chance to set the attr first
 	# I mislike the sleep, but it's not clear that looping
 	# with a small sleep is necessarily better
 	# start has a 40sec timeout, so a 5sec sleep here is okay
         sleep 5
-	attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
+	attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
         if [ $? -ne 0 ]; then
-		host=$(hostname)
-		attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null )
-                if [ $? -ne 0 ]; then
-	                ocf_log info "grace start: crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
-                fi
+	        ocf_log info "grace start: attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
         fi
 
 	# Three possibilities:
-	# 1. There is no attribute at all and attr_updater returns
+	# 1. There is no attribute at all and attrd_updater returns
 	#    a zero length string. This happens when
 	#    ganesha_mon::monitor hasn't run at least once to set
 	#    the attribute. The assumption here is that the system
@@ -164,17 +160,13 @@ ganesha_grace_notify()
 
 ganesha_grace_monitor()
 {
-	local host=$(hostname -s)
+	local host=$(ocf_local_nodename)
 
-	ocf_log debug "monitor"
+	ocf_log debug "ganesha_grace monitor ${host}"
 
-	attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
+	attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
         if [ $? -ne 0 ]; then
-		host=$(hostname)
-	        attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
-                if [ $? -ne 0 ]; then
-	                ocf_log info "crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
-                fi
+	        ocf_log info "attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
         fi
 
 	# if there is no attribute (yet), maybe it's because
diff --git a/extras/ganesha/ocf/ganesha_mon b/extras/ganesha/ocf/ganesha_mon
index 2b4a9d6..7fbbf70 100644
--- a/extras/ganesha/ocf/ganesha_mon
+++ b/extras/ganesha/ocf/ganesha_mon
@@ -124,7 +124,6 @@ ganesha_mon_stop()
 
 ganesha_mon_monitor()
 {
-	local host=$(hostname -s)
 	local pid_file="/var/run/ganesha.pid"
 	local rhel6_pid_file="/var/run/ganesha.nfsd.pid"
 	local proc_pid="/proc/"
@@ -141,31 +140,27 @@ ganesha_mon_monitor()
 
 	if [ "x${proc_pid}" != "x/proc/" -a -d ${proc_pid} ]; then
 
-		attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1
+		attrd_updater --name ${OCF_RESKEY_ganesha_active} -v 1
 		if [ $? -ne 0 ]; then
-			ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1 failed"
+			ocf_log info "warning: attrd_updater --name ${OCF_RESKEY_ganesha_active} -v 1 failed"
 		fi
 
 		# ganesha_grace (nfs-grace) RA follows grace-active attr
 		# w/ constraint location
-		attrd_updater -n ${OCF_RESKEY_grace_active} -v 1
+		attrd_updater --name ${OCF_RESKEY_grace_active} -v 1
 		if [ $? -ne 0 ]; then
-			ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_grace_active} -v 1 failed"
+			ocf_log info "warning: attrd_updater --name ${OCF_RESKEY_grace_active} -v 1 failed"
 		fi
 
 		# ganesha_mon (nfs-mon) and ganesha_grace (nfs-grace)
-		# track grace-active crm_attr (attr != crm_attr)
-		# we can't just use the attr as there's no way to query
-		# its value in RHEL6 pacemaker
-
-		crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
-		if [ $? -ne 0 ]; then
-			host=$(hostname)
-			crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
-			if [ $? -ne 0 ]; then
-				ocf_log info "mon monitor warning: crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 failed"
-			fi
-		fi
+		# track grace-active attr.
+		#
+		# Originally we were told that attrs set with attrd_updater
+		# are different/distinct than attrs set with crm_attribute.
+		# Now, years later, we are told that they are the same and
+		# that the values of attrs set with attrd_updater can be
+		# retrieved with crm_attribute. Or with attrd_updater -Q
+		# now that we no longer have to deal with rhel6.
 
 		return ${OCF_SUCCESS}
 	fi
@@ -182,26 +177,16 @@ ganesha_mon_monitor()
 	# the remaining ganesha.nfsds into grace before
 	# initiating the VIP fail-over.
 
-	attrd_updater -D -n ${OCF_RESKEY_grace_active}
-	if [ $? -ne 0 ]; then
-		ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_grace_active} failed"
-	fi
-
-	host=$(hostname -s)
-	crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
+	attrd_updater --delete --name ${OCF_RESKEY_grace_active}
 	if [ $? -ne 0 ]; then
-		host=$(hostname)
-		crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
-		if [ $? -ne 0 ]; then
-			ocf_log info "mon monitor warning: crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 failed"
-		fi
+		ocf_log info "warning: attrd_updater --delete --name ${OCF_RESKEY_grace_active} failed"
 	fi
 
 	sleep ${OCF_RESKEY_grace_delay}
 
-	attrd_updater -D -n ${OCF_RESKEY_ganesha_active}
+	attrd_updater --delete --name ${OCF_RESKEY_ganesha_active}
 	if [ $? -ne 0 ]; then
-		ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_ganesha_active} failed"
+		ocf_log info "warning: attrd_updater --delete --name ${OCF_RESKEY_ganesha_active} failed"
 	fi
 
 	return ${OCF_SUCCESS}
-- 
1.8.3.1