|
|
12a457 |
From d5858adbaa138383bfa17855fec95c59c989a6c0 Mon Sep 17 00:00:00 2001
|
|
|
12a457 |
From: Kaleb S KEITHLEY <kkeithle@redhat.com>
|
|
|
12a457 |
Date: Wed, 1 Jun 2016 16:50:08 -0400
|
|
|
12a457 |
Subject: [PATCH 183/183] common-ha: race/timing issue setting up cluster
|
|
|
12a457 |
|
|
|
12a457 |
The ganesha_grace resource agent can start before the ganesha_mon
|
|
|
12a457 |
resource agent, with the result that the crm_attribute that
|
|
|
12a457 |
ganesha_grace expects to find has not been created yet.
|
|
|
12a457 |
|
|
|
12a457 |
This has never been seen with four nodes (or perhaps so rarely that
|
|
|
12a457 |
it was never actually observed during development), but with just
|
|
|
12a457 |
two nodes it's very repeatable.
|
|
|
12a457 |
|
|
|
12a457 |
Note that when long (FQDN) names are used it is not unexpected to
|
|
|
12a457 |
see Failed Actions in the output of `pcs status`, e.g.:
|
|
|
12a457 |
|
|
|
12a457 |
* nfs-grace_monitor_5000 on node1.fully.qualified.domain.name.com
|
|
|
12a457 |
'unknown error' (1): call=20, status=complete, exitreason='none',
|
|
|
12a457 |
last-rc-change='Wed Jun 1 12:32:32 2016', queued=0ms, exec=0ms
|
|
|
12a457 |
* nfs-grace_monitor_5000 on node2.fully.qualified.domain.name.com
|
|
|
12a457 |
'unknown error' (1): call=18, status=complete, exitreason='none',
|
|
|
12a457 |
last-rc-change='Wed Jun 1 12:32:42 2016', queued=0ms, exec=0ms
|
|
|
12a457 |
|
|
|
12a457 |
and as long as all the ganesha_grace_clone and cluster_ip-1
|
|
|
12a457 |
resource agents are in Started state then this is okay.
|
|
|
12a457 |
|
|
|
12a457 |
backport master:
|
|
|
12a457 |
> http://review.gluster.org/14607
|
|
|
12a457 |
> BUG: 1341768
|
|
|
12a457 |
release-3.8
|
|
|
12a457 |
> http://review.gluster.org/14609
|
|
|
12a457 |
> BUG: 1341770
|
|
|
12a457 |
release-3.7
|
|
|
12a457 |
> http://review.gluster.org/14610
|
|
|
12a457 |
> BUG: 1341772
|
|
|
12a457 |
> Change-Id: I726c9946ceb1ca92872b321612eb0f4c3cc039d8
|
|
|
12a457 |
|
|
|
12a457 |
Change-Id: I62db41e46af6cfeba546f96a84d0e19d98e06ff5
|
|
|
12a457 |
BUG: 1341567
|
|
|
12a457 |
Signed-off-by: Kaleb S KEITHLEY <kkeithle@redhat.com>
|
|
|
12a457 |
Reviewed-on: https://code.engineering.redhat.com/gerrit/75679
|
|
|
12a457 |
Reviewed-by: Jiffin Thottan <jthottan@redhat.com>
|
|
|
12a457 |
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|
|
12a457 |
---
|
|
|
12a457 |
extras/ganesha/ocf/ganesha_grace | 25 +++++++++++++++----------
|
|
|
12a457 |
extras/ganesha/ocf/ganesha_mon | 33 ++++++++++++++++++---------------
|
|
|
12a457 |
extras/ganesha/scripts/ganesha-ha.sh | 5 +++++
|
|
|
12a457 |
3 files changed, 38 insertions(+), 25 deletions(-)
|
|
|
12a457 |
|
|
|
12a457 |
diff --git a/extras/ganesha/ocf/ganesha_grace b/extras/ganesha/ocf/ganesha_grace
|
|
|
12a457 |
index 21b95dd..84202df 100644
|
|
|
12a457 |
--- a/extras/ganesha/ocf/ganesha_grace
|
|
|
12a457 |
+++ b/extras/ganesha/ocf/ganesha_grace
|
|
|
12a457 |
@@ -94,15 +94,20 @@ esac
|
|
|
12a457 |
ganesha_grace_start()
|
|
|
12a457 |
{
|
|
|
12a457 |
local rc=${OCF_ERR_GENERIC}
|
|
|
12a457 |
- local short_host=$(hostname -s)
|
|
|
12a457 |
- local long_host=$(hostname)
|
|
|
12a457 |
+ local host=$(hostname -s)
|
|
|
12a457 |
|
|
|
12a457 |
ocf_log debug "ganesha_grace_start()"
|
|
|
12a457 |
- attr=$(crm_attribute --query --node=${short_host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
|
|
|
12a457 |
+ # give ganesha_mon RA a chance to set the crm_attr first
|
|
|
12a457 |
+ # I mislike the sleep, but it's not clear that looping
|
|
|
12a457 |
+ # with a small sleep is necessarily better
|
|
|
12a457 |
+ # start has a 40sec timeout, so a 5sec sleep here is okay
|
|
|
12a457 |
+ sleep 5
|
|
|
12a457 |
+ attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
|
|
|
12a457 |
if [ $? -ne 0 ]; then
|
|
|
12a457 |
- attr=$(crm_attribute --query --node=${long_host} --name=${OCF_RESKEY_grace_active} 2> /dev/null )
|
|
|
12a457 |
+ host=$(hostname)
|
|
|
12a457 |
+ attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null )
|
|
|
12a457 |
if [ $? -ne 0 ]; then
|
|
|
12a457 |
- ocf_log info "crm_attribute --query --node=${short_host} --name=${OCF_RESKEY_grace_active} failed"
|
|
|
12a457 |
+ ocf_log info "grace start: crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
|
|
|
12a457 |
fi
|
|
|
12a457 |
fi
|
|
|
12a457 |
|
|
|
12a457 |
@@ -160,16 +165,16 @@ ganesha_grace_notify()
|
|
|
12a457 |
ganesha_grace_monitor()
|
|
|
12a457 |
{
|
|
|
12a457 |
local rc=${OCF_ERR_GENERIC}
|
|
|
12a457 |
- local short_host=$(hostname -s)
|
|
|
12a457 |
- local long_host=$(hostname)
|
|
|
12a457 |
+ local host=$(hostname -s)
|
|
|
12a457 |
|
|
|
12a457 |
ocf_log debug "monitor"
|
|
|
12a457 |
|
|
|
12a457 |
- attr=$(crm_attribute --query --node=${short_host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
|
|
|
12a457 |
+ attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
|
|
|
12a457 |
if [ $? -ne 0 ]; then
|
|
|
12a457 |
- attr=$(crm_attribute --query --node=${long_host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
|
|
|
12a457 |
+ host=$(hostname)
|
|
|
12a457 |
+ attr=$(crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} 2> /dev/null)
|
|
|
12a457 |
if [ $? -ne 0 ]; then
|
|
|
12a457 |
- ocf_log info "crm_attribute --query --node=${short_host} --name=${OCF_RESKEY_grace_active} failed"
|
|
|
12a457 |
+ ocf_log info "crm_attribute --query --node=${host} --name=${OCF_RESKEY_grace_active} failed"
|
|
|
12a457 |
fi
|
|
|
12a457 |
fi
|
|
|
12a457 |
|
|
|
12a457 |
diff --git a/extras/ganesha/ocf/ganesha_mon b/extras/ganesha/ocf/ganesha_mon
|
|
|
12a457 |
index 974eb86..7d0eb6b 100644
|
|
|
12a457 |
--- a/extras/ganesha/ocf/ganesha_mon
|
|
|
12a457 |
+++ b/extras/ganesha/ocf/ganesha_mon
|
|
|
12a457 |
@@ -124,8 +124,7 @@ ganesha_mon_stop()
|
|
|
12a457 |
|
|
|
12a457 |
ganesha_mon_monitor()
|
|
|
12a457 |
{
|
|
|
12a457 |
- local short_host=$(hostname -s)
|
|
|
12a457 |
- local long_host=$(hostname)
|
|
|
12a457 |
+ local host=$(hostname -s)
|
|
|
12a457 |
local pid_file="/var/run/ganesha.nfsd.pid"
|
|
|
12a457 |
|
|
|
12a457 |
# RHEL6 /etc/init.d/nfs-ganesha adds -p /var/run/ganesha.nfsd.pid
|
|
|
12a457 |
@@ -154,13 +153,15 @@ ganesha_mon_monitor()
|
|
|
12a457 |
# track grace-active crm_attr (attr != crm_attr)
|
|
|
12a457 |
# we can't just use the attr as there's no way to query
|
|
|
12a457 |
# its value in RHEL6 pacemaker
|
|
|
12a457 |
- crm_attribute --node=${short_host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
|
|
|
12a457 |
- if [ $? -ne 0 ]; then
|
|
|
12a457 |
- crm_attribute --node=${long_host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
|
|
|
12a457 |
- if [ $? -ne 0 ]; then
|
|
|
12a457 |
- ocf_log info "warning: crm_attribute --node=${short_host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 failed"
|
|
|
12a457 |
- fi
|
|
|
12a457 |
- fi
|
|
|
12a457 |
+
|
|
|
12a457 |
+ crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
|
|
|
12a457 |
+ if [ $? -ne 0 ]; then
|
|
|
12a457 |
+ host=$(hostname)
|
|
|
12a457 |
+ crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 2> /dev/null
|
|
|
12a457 |
+ if [ $? -ne 0 ]; then
|
|
|
12a457 |
+ ocf_log info "mon monitor warning: crm_attribute --node=${host} --lifetime=forever --name=${OCF_RESKEY_grace_active} --update=1 failed"
|
|
|
12a457 |
+ fi
|
|
|
12a457 |
+ fi
|
|
|
12a457 |
|
|
|
12a457 |
return ${OCF_SUCCESS}
|
|
|
12a457 |
fi
|
|
|
12a457 |
@@ -182,13 +183,15 @@ ganesha_mon_monitor()
|
|
|
12a457 |
ocf_log info "warning: attrd_updater -D -n ${OCF_RESKEY_grace_active} failed"
|
|
|
12a457 |
fi
|
|
|
12a457 |
|
|
|
12a457 |
- crm_attribute --node=${short_host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
|
|
|
12a457 |
+ host=$(hostname -s)
|
|
|
12a457 |
+ crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
|
|
|
12a457 |
if [ $? -ne 0 ]; then
|
|
|
12a457 |
- crm_attribute --node=${long_host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
|
|
|
12a457 |
- if [ $? -ne 0 ]; then
|
|
|
12a457 |
- ocf_log info "warning: crm_attribute --node=${short_host} --name=${OCF_RESKEY_grace_active} --update=0 failed"
|
|
|
12a457 |
- fi
|
|
|
12a457 |
- fi
|
|
|
12a457 |
+ host=$(hostname)
|
|
|
12a457 |
+ crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 2> /dev/null
|
|
|
12a457 |
+ if [ $? -ne 0 ]; then
|
|
|
12a457 |
+ ocf_log info "mon monitor warning: crm_attribute --node=${host} --name=${OCF_RESKEY_grace_active} --update=0 failed"
|
|
|
12a457 |
+ fi
|
|
|
12a457 |
+ fi
|
|
|
12a457 |
|
|
|
12a457 |
sleep ${OCF_RESKEY_grace_delay}
|
|
|
12a457 |
|
|
|
12a457 |
diff --git a/extras/ganesha/scripts/ganesha-ha.sh b/extras/ganesha/scripts/ganesha-ha.sh
|
|
|
12a457 |
index f0796d2..f43dbf4 100644
|
|
|
12a457 |
--- a/extras/ganesha/scripts/ganesha-ha.sh
|
|
|
12a457 |
+++ b/extras/ganesha/scripts/ganesha-ha.sh
|
|
|
12a457 |
@@ -511,6 +511,11 @@ setup_create_resources()
|
|
|
12a457 |
logger "warning: pcs resource create nfs-mon ocf:heartbeat:ganesha_mon --clone failed"
|
|
|
12a457 |
fi
|
|
|
12a457 |
|
|
|
12a457 |
+ # see comment in /usr/lib/ocf/resource.d/heartbeat/ganesha_grace
|
|
|
12a457 |
+ # start method. Allow time for ganesha_mon to start and set the
|
|
|
12a457 |
+ # grace-active crm_attribute
|
|
|
12a457 |
+ sleep 5
|
|
|
12a457 |
+
|
|
|
12a457 |
pcs resource create nfs-grace ocf:heartbeat:ganesha_grace --clone meta notify=true
|
|
|
12a457 |
if [ $? -ne 0 ]; then
|
|
|
12a457 |
logger "warning: pcs resource create nfs-grace ocf:heartbeat:ganesha_grace --clone failed"
|
|
|
12a457 |
--
|
|
|
12a457 |
1.7.1
|
|
|
12a457 |
|