diff -uNr a/heartbeat/pgsql b/heartbeat/pgsql
--- a/heartbeat/pgsql 2017-03-09 11:50:06.365145803 +0100
+++ b/heartbeat/pgsql 2017-03-09 12:19:41.566177608 +0100
@@ -966,8 +966,13 @@
cmp_location=`printf "$master_baseline\n$my_master_baseline\n" |\
sort | head -1`
if [ "$cmp_location" != "$my_master_baseline" ]; then
+ # We used to set the failcount to INFINITY for the resource here
+ # in order to move the master to the other node. However, only
+ # the CRM should set the failcount, so that use was deprecated
+ # in pacemaker version 1.1.17. Now we ban the resource from
+ # this node instead.
ocf_exit_reason "My data is newer than new master's one. New master's location : $master_baseline"
- $CRM_FAILCOUNT -r $OCF_RESOURCE_INSTANCE -U $NODENAME -v INFINITY
+ exec_with_retry 0 $CRM_RESOURCE -B -r $OCF_RESOURCE_INSTANCE -N $NODENAME -Q
return $OCF_ERR_GENERIC
fi
fi
@@ -1526,6 +1531,36 @@
wait $func_pid
}
+# Run a command, retrying once per second until it exits with status 0.
+# arg1       : max attempts, >= 0 (0 means 86400 attempts, i.e. about 1 day)
+# arg2..argN : command and args
+# Prints the command's stdout and returns 0 on success; else returns its rc.
+exec_with_retry() {
+    local count=86400
+    local output
+    local rc=1
+
+    [ "$1" -ne 0 ] && count=$1
+    shift
+
+    while [ "$count" -gt 0 ]; do
+        # NOTE(review): unquoted $* re-splits arguments; "$@" would be safer.
+        output=$($*)
+        rc=$?
+        if [ $rc -eq 0 ]; then
+            # '%s' keeps '%' and '\' in the output from being interpreted.
+            printf '%s' "$output"
+            return 0
+        fi
+        ocf_log warn "Retrying(remain $count). \"$*\" failed. rc=$rc. stdout=\"$output\"."
+        count=$(expr "$count" - 1)
+        sleep 1
+    done
+
+    ocf_exit_reason "giving up executing \"$*\""
+    return $rc
+}
+
is_node_online() {
crm_mon -1 -n | tr '[A-Z]' '[a-z]' | grep -e "^node $1 " -e "^node $1:" | grep -q -v "offline"
}
@@ -1734,7 +1769,7 @@
CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot"
CRM_ATTR_REBOOT="${HA_SBIN_DIR}/crm_attribute -l reboot"
CRM_ATTR_FOREVER="${HA_SBIN_DIR}/crm_attribute -l forever"
- CRM_FAILCOUNT="${HA_SBIN_DIR}/crm_failcount"
+ CRM_RESOURCE="${HA_SBIN_DIR}/crm_resource"
CAN_NOT_PROMOTE="-INFINITY"
CAN_PROMOTE="100"