From feffc766c48a1010c1bf4f8b1db74795d06dbd50 Mon Sep 17 00:00:00 2001
From: David Vossel <dvossel@redhat.com>
Date: Mon, 25 Aug 2014 14:57:09 -0500
Subject: [PATCH 2/4] ethmonitor updates
---
heartbeat/ethmonitor | 290 +++++++++++++++++++++++++++++++++------------------
1 file changed, 187 insertions(+), 103 deletions(-)
diff --git a/heartbeat/ethmonitor b/heartbeat/ethmonitor
index b85d7fc..a447391 100755
--- a/heartbeat/ethmonitor
+++ b/heartbeat/ethmonitor
@@ -1,14 +1,14 @@
#!/bin/sh
#
-# OCF Resource Agent compliant script.
-# Monitor the vitality of a local network interface.
+# OCF Resource Agent compliant script.
+# Monitor the vitality of a local network interface.
#
# Based on the work by Robert Euhus and Lars Marowsky-Brée.
#
# Transfered from Ipaddr2 into ethmonitor by Alexander Krauth
#
# Copyright (c) 2011 Robert Euhus, Alexander Krauth, Lars Marowsky-Brée
-# All Rights Reserved.
+# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
@@ -29,12 +29,12 @@
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
-# OCF parameters are as below
+# OCF parameters are as below
#
# OCF_RESKEY_interface
# OCF_RESKEY_multiplicator
# OCF_RESKEY_name
-# OCF_RESKEY_repeat_count
+# OCF_RESKEY_repeat_count
# OCF_RESKEY_repeat_interval
# OCF_RESKEY_pktcnt_timeout
# OCF_RESKEY_arping_count
@@ -70,10 +70,13 @@ The resource configuration requires a monitor operation, because the monitor doe
In addition to the resource configuration, you need to configure some location constraints, based on a CIB attribute value.
The name of the attribute value is configured in the 'name' option of this RA.
-Example constraint configuration:
+Example constraint configuration using crmsh
location loc_connected_node my_resource_grp \
rule $id="rule_loc_connected_node" -INF: ethmonitor eq 0
+Example constraint configuration using pcs. Only allow 'my_resource' to run on nodes where eth0 ethernet device is available.
+pcs constraint location my_resource rule score=-INFINITY ethmonitor-eth0 ne 1
+
The ethmonitor works in 3 different modes to test the interface vitality.
1. call ip to see if the link status is up (if link is down -> error)
2. call ip and watch the RX counter (if packages come around in a certain time -> success)
@@ -157,14 +160,30 @@ Maximum number of IPs from ARP cache list to check for ARP REQUEST (arping) answ
<content type="integer" default="5"/>
</parameter>
+<parameter name="infiniband_device">
+<longdesc lang="en">
+For interfaces that are infiniband devices.
+</longdesc>
+<shortdesc lang="en">infiniband device</shortdesc>
+<content type="string" />
+</parameter>
+
+<parameter name="infiniband_port">
+<longdesc lang="en">
+For infiniband devices, this is the port to monitor.
+</longdesc>
+<shortdesc lang="en">infiniband port</shortdesc>
+<content type="integer" />
+</parameter>
+
</parameters>
<actions>
-<action name="start" timeout="20s" />
-<action name="stop" timeout="20s" />
-<action name="status" depth="0" timeout="20s" interval="10s" />
-<action name="monitor" depth="0" timeout="20s" interval="10s" />
-<action name="meta-data" timeout="5s" />
-<action name="validate-all" timeout="20s" />
+<action name="start" timeout="60s" />
+<action name="stop" timeout="20s" />
+<action name="status" depth="0" timeout="60s" interval="10s" />
+<action name="monitor" depth="0" timeout="60s" interval="10s" />
+<action name="meta-data" timeout="5s" />
+<action name="validate-all" timeout="20s" />
</actions>
</resource-agent>
END
@@ -173,7 +192,7 @@ END
}
#
-# Return true, if the interface exists
+# Return true, if the interface exists
#
is_interface() {
#
@@ -181,14 +200,25 @@ is_interface() {
#
local iface=`$IP2UTIL -o -f inet addr show | grep " $1 " \
| cut -d ' ' -f2 | sort -u | grep -v '^ipsec[0-9][0-9]*$'`
- [ "$iface" != "" ]
+ [ "$iface" != "" ]
+}
+
+infiniband_status()
+{
+ local device="$OCF_RESKEY_infiniband_device"
+
+ if [ -n "$OCF_RESKEY_infiniband_port" ]; then
+ device="${OCF_RESKEY_infiniband_device}:${OCF_RESKEY_infiniband_port}"
+ fi
+
+ ibstatus ${device} | grep -q ACTIVE
}
if_init() {
local rc
if [ X"$OCF_RESKEY_interface" = "X" ]; then
- ocf_log err "Interface name (the interface parameter) is mandatory"
+ ocf_exit_reason "Interface name (the interface parameter) is mandatory"
exit $OCF_ERR_CONFIGURED
fi
@@ -196,60 +226,67 @@ if_init() {
if is_interface $NIC
then
- case "$NIC" in
- *:*) ocf_log err "Do not specify a virtual interface : $OCF_RESKEY_interface"
- exit $OCF_ERR_CONFIGURED;;
- *) ;;
- esac
+ case "$NIC" in
+ *:*) ocf_exit_reason "Do not specify a virtual interface : $OCF_RESKEY_interface"
+ exit $OCF_ERR_CONFIGURED;;
+ *) ;;
+ esac
else
- case $__OCF_ACTION in
- validate-all) ocf_log err "Interface $NIC does not exist"
- exit $OCF_ERR_CONFIGURED;;
- *) ocf_log warn "Interface $NIC does not exist"
- ## It might be a bond interface which is temporarily not available, therefore we want to continue here
- ;;
- esac
+ case $__OCF_ACTION in
+ validate-all)
+ ocf_exit_reason "Interface $NIC does not exist"
+ exit $OCF_ERR_CONFIGURED;;
+ *)
+ ## It might be a bond interface which is temporarily not available, therefore we want to continue here
+ ocf_log warn "Interface $NIC does not exist"
+ ;;
+ esac
fi
: ${OCF_RESKEY_multiplier:="1"}
if ! ocf_is_decimal "$OCF_RESKEY_multiplier"; then
- ocf_log err "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]"
+ ocf_exit_reason "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]"
exit $OCF_ERR_CONFIGURED
fi
ATTRNAME=${OCF_RESKEY_name:-"ethmonitor-$NIC"}
- REP_COUNT=${OCF_RESKEY_repeat_count:-5}
+ REP_COUNT=${OCF_RESKEY_repeat_count:-5}
if ! ocf_is_decimal "$REP_COUNT" -o [ $REP_COUNT -lt 1 ]; then
- ocf_log err "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]"
+ ocf_exit_reason "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]"
exit $OCF_ERR_CONFIGURED
- fi
+ fi
REP_INTERVAL_S=${OCF_RESKEY_repeat_interval:-10}
if ! ocf_is_decimal "$REP_INTERVAL_S"; then
- ocf_log err "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]"
+ ocf_exit_reason "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]"
exit $OCF_ERR_CONFIGURED
fi
: ${OCF_RESKEY_pktcnt_timeout:="5"}
if ! ocf_is_decimal "$OCF_RESKEY_pktcnt_timeout"; then
- ocf_log err "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]"
+ ocf_exit_reason "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]"
exit $OCF_ERR_CONFIGURED
fi
: ${OCF_RESKEY_arping_count:="1"}
if ! ocf_is_decimal "$OCF_RESKEY_arping_count"; then
- ocf_log err "Invalid OCF_RESKEY_arping_count [$OCF_RESKEY_arping_count]"
+ ocf_exit_reason "Invalid OCF_RESKEY_arping_count [$OCF_RESKEY_arping_count]"
exit $OCF_ERR_CONFIGURED
fi
: ${OCF_RESKEY_arping_timeout:="1"}
if ! ocf_is_decimal "$OCF_RESKEY_arping_timeout"; then
- ocf_log err "Invalid OCF_RESKEY_arping_timeout [$OCF_RESKEY_arping_count]"
+ ocf_exit_reason "Invalid OCF_RESKEY_arping_timeout [$OCF_RESKEY_arping_count]"
exit $OCF_ERR_CONFIGURED
fi
: ${OCF_RESKEY_arping_cache_entries:="5"}
if ! ocf_is_decimal "$OCF_RESKEY_arping_cache_entries"; then
- ocf_log err "Invalid OCF_RESKEY_arping_cache_entries [$OCF_RESKEY_arping_cache_entries]"
+ ocf_exit_reason "Invalid OCF_RESKEY_arping_cache_entries [$OCF_RESKEY_arping_cache_entries]"
exit $OCF_ERR_CONFIGURED
fi
- return $OCF_SUCCESS
+
+ if [ -n "$OCF_RESKEY_infiniband_device" ]; then
+ #ibstatus is required if an infiniband_device is provided
+ check_binary ibstatus
+ fi
+ return $OCF_SUCCESS
}
# get the link status on $NIC
@@ -277,7 +314,7 @@ watch_pkt_counter () {
for n in `seq $(( $OCF_RESKEY_pktcnt_timeout * 10 ))`; do
sleep 0.1
RX_PACKETS_NEW="`get_rx_packets`"
- ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW"
+ ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW"
if [ "$RX_PACKETS_OLD" -ne "$RX_PACKETS_NEW" ]; then
ocf_log debug "we received some packets."
return 0
@@ -308,7 +345,7 @@ do_arping () {
}
#
-# Check the interface depending on the level given as parameter: $OCF_RESKEY_check_level
+# Check the interface depending on the level given as parameter: $OCF_RESKEY_check_level
#
# 09: check for nonempty ARP cache
# 10: watch for packet counter changes
@@ -322,21 +359,47 @@ do_arping () {
# the tests for higher check levels are run.
#
if_check () {
+ local arp_list
# always check link status first
link_status="`get_link_status`"
ocf_log debug "link_status: $link_status (1=up, 0=down)"
- [ $link_status -eq 0 ] && return $OCF_NOT_RUNNING
+
+ if [ $link_status -eq 0 ]; then
+ ocf_log notice "link_status: DOWN"
+ return $OCF_NOT_RUNNING
+ fi
+
+ # if this is an infiniband device, try ibstatus script
+ if [ -n "$OCF_RESKEY_infiniband_device" ]; then
+ if infiniband_status; then
+ return $OCF_SUCCESS
+ fi
+ ocf_log info "Infiniband device $OCF_RESKEY_infiniband_device is not available, check ibstatus for more information"
+ return $OCF_NOT_RUNNING
+ fi
# watch for packet counter changes
- ocf_log debug "watch for packet counter changes"
- watch_pkt_counter && return $OCF_SUCCESS
+ ocf_log debug "watch for packet counter changes"
+ watch_pkt_counter
+ if [ $? -eq 0 ]; then
+ return $OCF_SUCCESS
+ else
+ ocf_log debug "No packets received during packet watch timeout"
+ fi
# check arping ARP cache entries
- ocf_log debug "check arping ARP cache entries"
- for ip in `get_arp_list`; do
+ ocf_log debug "check arping ARP cache entries"
+ arp_list=`get_arp_list`
+ for ip in `echo $arp_list`; do
do_arping $ip && return $OCF_SUCCESS
done
+ # if we get here, the ethernet device is considered not running.
+ # provide some logging information
+ if [ -z "$arp_list" ]; then
+ ocf_log info "No ARP cache entries found to arping"
+ fi
+
# watch for packet counter changes in promiscios mode
# ocf_log debug "watch for packet counter changes in promiscios mode"
# be sure switch off promiscios mode in any case
@@ -362,67 +425,89 @@ END
}
set_cib_value() {
- local score=`expr $1 \* $OCF_RESKEY_multiplier`
- attrd_updater -n $ATTRNAME -v $score -q
- local rc=$?
- case $rc in
- 0) ocf_log debug "attrd_updater: Updated $ATTRNAME = $score" ;;
- *) ocf_log warn "attrd_updater: Could not update $ATTRNAME = $score: rc=$rc";;
- esac
- return $rc
+ local score=`expr $1 \* $OCF_RESKEY_multiplier`
+ attrd_updater -n $ATTRNAME -v $score -q
+ local rc=$?
+ case $rc in
+ 0) ocf_log debug "attrd_updater: Updated $ATTRNAME = $score" ;;
+ *) ocf_log warn "attrd_updater: Could not update $ATTRNAME = $score: rc=$rc";;
+ esac
+ return $rc
}
if_monitor() {
- ha_pseudo_resource $OCF_RESOURCE_INSTANCE monitor
- local pseudo_status=$?
- if [ $pseudo_status -ne $OCF_SUCCESS ]; then
- exit $pseudo_status
- fi
-
- local mon_rc=$OCF_NOT_RUNNING
- local attr_rc=$OCF_NOT_RUNNING
- local runs=0
- local start_time
- local end_time
- local sleep_time
- while [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]
- do
- start_time=`date +%s%N`
- if_check
- mon_rc=$?
- REP_COUNT=$(( $REP_COUNT - 1 ))
- if [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]; then
- ocf_log warn "Monitoring of $OCF_RESOURCE_INSTANCE failed, $REP_COUNT retries left."
- end_time=`date +%s%N`
- sleep_time=`echo "scale=9; ( $start_time + ( $REP_INTERVAL_S * 1000000000 ) - $end_time ) / 1000000000" | bc -q 2> /dev/null`
- sleep $sleep_time 2> /dev/null
- runs=$(($runs + 1))
- fi
-
- if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then
- ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error"
- fi
- done
-
- ocf_log debug "Monitoring return code: $mon_rc"
- if [ $mon_rc -eq $OCF_SUCCESS ]; then
- set_cib_value 1
- attr_rc=$?
- else
- ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed."
- set_cib_value 0
- attr_rc=$?
- fi
-
- ## The resource should not fail, if the interface is down. It should fail, if the update of the CIB variable has errors.
- ## To react on the interface failure you must use constraints based on the CIB variable value, not on the resource itself.
- exit $attr_rc
+ ha_pseudo_resource $OCF_RESOURCE_INSTANCE monitor
+ local pseudo_status=$?
+ if [ $pseudo_status -ne $OCF_SUCCESS ]; then
+ exit $pseudo_status
+ fi
+
+ local mon_rc=$OCF_NOT_RUNNING
+ local attr_rc=$OCF_NOT_RUNNING
+ local runs=0
+ local start_time
+ local end_time
+ local sleep_time
+ while [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]
+ do
+ start_time=`date +%s%N`
+ if_check
+ mon_rc=$?
+ REP_COUNT=$(( $REP_COUNT - 1 ))
+ if [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]; then
+ ocf_log warn "Monitoring of $OCF_RESOURCE_INSTANCE failed, $REP_COUNT retries left."
+ end_time=`date +%s%N`
+ sleep_time=`echo "scale=9; ( $start_time + ( $REP_INTERVAL_S * 1000000000 ) - $end_time ) / 1000000000" | bc -q 2> /dev/null`
+ sleep $sleep_time 2> /dev/null
+ runs=$(($runs + 1))
+ fi
+
+ if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then
+ ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error"
+ fi
+ done
+
+ ocf_log debug "Monitoring return code: $mon_rc"
+ if [ $mon_rc -eq $OCF_SUCCESS ]; then
+ set_cib_value 1
+ attr_rc=$?
+ else
+ ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed."
+ set_cib_value 0
+ attr_rc=$?
+ fi
+
+ ## The resource should not fail, if the interface is down. It should fail, if the update of the CIB variable has errors.
+ ## To react on the interface failure you must use constraints based on the CIB variable value, not on the resource itself.
+ exit $attr_rc
+}
+
+if_stop()
+{
+ attrd_updater -D -n $ATTRNAME
+ ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop
}
+if_start()
+{
+ local rc
+ ha_pseudo_resource $OCF_RESOURCE_INSTANCE start
+ rc=$?
+ if [ $rc -ne $OCF_SUCCESS ]; then
+ ocf_exit_reason "Failure to create ethmonitor state file"
+ return $rc
+ fi
+
+ # perform the first monitor during the start operation
+ if_monitor
+ return $?
+}
+
+
if_validate() {
- check_binary $IP2UTIL
- check_binary arping
- if_init
+ check_binary $IP2UTIL
+ check_binary arping
+ if_init
}
case $__OCF_ACTION in
@@ -436,18 +521,17 @@ esac
if_validate
case $__OCF_ACTION in
-start) ha_pseudo_resource $OCF_RESOURCE_INSTANCE start
+start) if_start
exit $?
;;
-stop) attrd_updater -D -n $ATTRNAME
- ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop
+stop) if_stop
exit $?
;;
monitor|status) if_monitor
exit $?
;;
validate-all) exit $?
- ;;
+ ;;
*) if_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
--
1.8.4.2