From 053bb9c7356eae82b1089582bb2844388ae4df57 Mon Sep 17 00:00:00 2001 From: "Kaleb S. KEITHLEY" Date: Wed, 2 Jun 2021 07:49:12 -0400 Subject: [PATCH 550/584] common-ha: stability fixes for ganesha_grace and ganesha_mon RAs Include fixes suggested by ClusterHA devs. 1) It turns out that crm_attribute attrs and attrd_updater attrs really are one and the same, despite what I was told years ago. Attrs created with crm_attribute ... --lifetime=reboot ... or attrd_updater are one and the same. As per ClusterHA devs, having an attr created with crm_attribute ... --lifetime=forever and also creating/updating the same attr with attrd_updater is a recipe for weird things to happen that will be difficult to debug. 2) Using hostname -s or hostname for node names in crm_attribute and attrd_updater potentially could use the wrong name if the host has been renamed; use ocf_local_nodename() (in ocf-shellfuncs) instead. https://github.com/gluster/glusterfs/issues/2276 https://github.com/gluster/glusterfs/pull/2283 commit 9bd2c697686ec40e2c4f711df961860c8a735baa Change-Id:If572d396fae9206628714fb2ce00f72e94f2258f BUG: 1945143 Signed-off-by: Kaleb S. 
KEITHLEY Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244593 Tested-by: RHGS Build Bot Reviewed-by: Sunil Kumar Heggodu Gopala Acharya --- extras/ganesha/ocf/ganesha_grace | 28 +++++++++--------------- extras/ganesha/ocf/ganesha_mon | 47 ++++++++++++++-------------------------- 2 files changed, 26 insertions(+), 49 deletions(-) diff --git a/extras/ganesha/ocf/ganesha_grace b/extras/ganesha/ocf/ganesha_grace index 825f716..edc6fa2 100644 --- a/extras/ganesha/ocf/ganesha_grace +++ b/extras/ganesha/ocf/ganesha_grace @@ -94,25 +94,21 @@ esac ganesha_grace_start() { local rc=${OCF_ERR_GENERIC} - local host=$(hostname -s) + local host=$(ocf_local_nodename) - ocf_log debug "ganesha_grace_start()" - # give ganesha_mon RA a chance to set the crm_attr first + ocf_log debug "ganesha_grace_start ${host}" + # give ganesha_mon RA a chance to set the attr first # I mislike the sleep, but it's not clear that looping # with a small sleep is necessarily better # start has a 40sec timeout, so a 5sec sleep here is okay sleep 5 - attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) + attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) if [ $? -ne 0 ]; then - host=$(hostname) - attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null ) - if [ $? -ne 0 ]; then - ocf_log info "grace start: crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" - fi + ocf_log info "grace start: attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" fi # Three possibilities: - # 1. There is no attribute at all and attr_updater returns + # 1. There is no attribute at all and attrd_updater returns # a zero length string. This happens when # ganesha_mon::monitor hasn't run at least once to set # the attribute. 
The assumption here is that the system @@ -164,17 +160,13 @@ ganesha_grace_notify() ganesha_grace_monitor() { - local host=$(hostname -s) + local host=$(ocf_local_nodename) - ocf_log debug "monitor" + ocf_log debug "ganesha_grace monitor ${host}" - attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) + attr=$(attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) if [ $? -ne 0 ]; then - host=$(hostname) - attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null) - if [ $? -ne 0 ]; then - ocf_log info "crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" - fi + ocf_log info "attrd_updater --query --node=${host} --name=${OCF_RESKEY_grace_active} failed" fi # if there is no attribute (yet), maybe it's because diff --git a/extras/ganesha/ocf/ganesha_mon b/extras/ganesha/ocf/ganesha_mon index 2b4a9d6..7fbbf70 100644 --- a/extras/ganesha/ocf/ganesha_mon +++ b/extras/ganesha/ocf/ganesha_mon @@ -124,7 +124,6 @@ ganesha_mon_stop() ganesha_mon_monitor() { - local host=$(hostname -s) local pid_file="/var/run/ganesha.pid" local rhel6_pid_file="/var/run/ganesha.nfsd.pid" local proc_pid="/proc/" @@ -141,31 +140,27 @@ ganesha_mon_monitor() if [ "x${proc_pid}" != "x/proc/" -a -d ${proc_pid} ]; then - attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1 + attrd_updater --name ${OCF_RESKEY_ganesha_active} -v 1 if [ $? -ne 0 ]; then - ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_ganesha_active} -v 1 failed" + ocf_log info "warning: attrd_updater --name ${OCF_RESKEY_ganesha_active} -v 1 failed" fi # ganesha_grace (nfs-grace) RA follows grace-active attr # w/ constraint location - attrd_updater -n ${OCF_RESKEY_grace_active} -v 1 + attrd_updater --name ${OCF_RESKEY_grace_active} -v 1 if [ $? 
-ne 0 ]; then - ocf_log info "warning: attrd_updater -n ${OCF_RESKEY_grace_active} -v 1 failed" + ocf_log info "warning: attrd_updater --name ${OCF_RESKEY_grace_active} -v 1 failed" fi # ganesha_mon (nfs-mon) and ganesha_grace (nfs-grace) - # track grace-active crm_attr (attr != crm_attr) - # we can't just use the attr as there's no way to query - # its value in RHEL6 pacemaker - - crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null - if [ $? -ne 0 ]; then - host=$(hostname) - crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null - if [ $? -ne 0 ]; then - ocf_log info "mon monitor warning: crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 failed" - fi - fi + # track grace-active attr. + # + # Originally we were told that attrs set with attrd_updater + # are different/distinct than attrs set with crm_attribute. + # Now, years later, we are told that they are the same and + # that the values of attrs set with attrd_updater can be + # retrieved with crm_attribute. Or with attrd_updater -Q + # now that we no longer have to deal with rhel6. return ${OCF_SUCCESS} fi @@ -182,26 +177,16 @@ ganesha_mon_monitor() # the remaining ganesha.nfsds into grace before # initiating the VIP fail-over. - attrd_updater -D -n ${OCF_RESKEY_grace_active} - if [ $? -ne 0 ]; then - ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_grace_active} failed" - fi - - host=$(hostname -s) - crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null + attrd_updater --delete --name ${OCF_RESKEY_grace_active} if [ $? -ne 0 ]; then - host=$(hostname) - crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null - if [ $? 
-ne 0 ]; then - ocf_log info "mon monitor warning: crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 failed" - fi + ocf_log info "warning: attrd_updater --delete --name ${OCF_RESKEY_grace_active} failed" fi sleep ${OCF_RESKEY_grace_delay} - attrd_updater -D -n ${OCF_RESKEY_ganesha_active} + attrd_updater --delete --name ${OCF_RESKEY_ganesha_active} if [ $? -ne 0 ]; then - ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_ganesha_active} failed" + ocf_log info "warning: attrd_updater --delete --name ${OCF_RESKEY_ganesha_active} failed" fi return ${OCF_SUCCESS} -- 1.8.3.1