Blob Blame History Raw
From feffc766c48a1010c1bf4f8b1db74795d06dbd50 Mon Sep 17 00:00:00 2001
From: David Vossel <dvossel@redhat.com>
Date: Mon, 25 Aug 2014 14:57:09 -0500
Subject: [PATCH 2/4] ethmonitor updates

---
 heartbeat/ethmonitor | 290 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 187 insertions(+), 103 deletions(-)

diff --git a/heartbeat/ethmonitor b/heartbeat/ethmonitor
index b85d7fc..a447391 100755
--- a/heartbeat/ethmonitor
+++ b/heartbeat/ethmonitor
@@ -1,14 +1,14 @@
 #!/bin/sh
 #
-#       OCF Resource Agent compliant script.
-#       Monitor the vitality of a local network interface.
+#	   OCF Resource Agent compliant script.
+#	   Monitor the vitality of a local network interface.
 #
 # 	Based on the work by Robert Euhus and Lars Marowsky-Brée.
 #
 #	Transfered from Ipaddr2 into ethmonitor by Alexander Krauth
 #
 # Copyright (c) 2011 Robert Euhus, Alexander Krauth, Lars Marowsky-Brée
-#                    All Rights Reserved.
+#					All Rights Reserved.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of version 2 of the GNU General Public License as
@@ -29,12 +29,12 @@
 # along with this program; if not, write the Free Software Foundation,
 # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
 #
-#     OCF parameters are as below
+#	 OCF parameters are as below
 #
 #	OCF_RESKEY_interface
 #	OCF_RESKEY_multiplicator
 #	OCF_RESKEY_name
-#       OCF_RESKEY_repeat_count
+#	   OCF_RESKEY_repeat_count
 #	OCF_RESKEY_repeat_interval
 #	OCF_RESKEY_pktcnt_timeout
 #	OCF_RESKEY_arping_count
@@ -70,10 +70,13 @@ The resource configuration requires a monitor operation, because the monitor doe
 In addition to the resource configuration, you need to configure some location constraints, based on a CIB attribute value.
 The name of the attribute value is configured in the 'name' option of this RA.
 
-Example constraint configuration:
+Example constraint configuration using crmsh
 location loc_connected_node my_resource_grp \
         rule $id="rule_loc_connected_node" -INF: ethmonitor eq 0
 
+Example constraint configuration using pcs. Only allow 'my_resource' to run on nodes where the eth0 ethernet device is available.
+pcs constraint location my_resource rule score=-INFINITY ethmonitor-eth0 ne 1
+
 The ethmonitor works in 3 different modes to test the interface vitality.
 1. call ip to see if the link status is up (if link is down -> error)
 2. call ip and watch the RX counter (if packages come around in a certain time -> success)
@@ -157,14 +160,30 @@ Maximum number of IPs from ARP cache list to check for ARP REQUEST (arping) answ
 <content type="integer" default="5"/>
 </parameter>
 
+<parameter name="infiniband_device">
+<longdesc lang="en">
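+Name of the infiniband device behind this interface, as passed to ibstatus. When set, the device state is checked with ibstatus instead of watching packet counters and arping.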
+</longdesc>
+<shortdesc lang="en">infiniband device</shortdesc>
+<content type="string" />
+</parameter>
+
+<parameter name="infiniband_port">
+<longdesc lang="en">
+For infiniband devices, this is the port to monitor. When unset, ibstatus is queried for the device without a port qualifier.
+</longdesc>
+<shortdesc lang="en">infiniband port</shortdesc>
+<content type="integer" />
+</parameter>
+
 </parameters>
 <actions>
-<action name="start"   timeout="20s" />
-<action name="stop"    timeout="20s" />
-<action name="status" depth="0"  timeout="20s" interval="10s" />
-<action name="monitor" depth="0"  timeout="20s" interval="10s" />
-<action name="meta-data"  timeout="5s" />
-<action name="validate-all"  timeout="20s" />
+<action name="start" timeout="60s" />
+<action name="stop" timeout="20s" />
+<action name="status" depth="0" timeout="60s" interval="10s" />
+<action name="monitor" depth="0" timeout="60s" interval="10s" />
+<action name="meta-data" timeout="5s" />
+<action name="validate-all" timeout="20s" />
 </actions>
 </resource-agent>
 END
@@ -173,7 +192,7 @@ END
 }
 
 #
-#	Return true, if the interface exists
+# Return true, if the interface exists
 #
 is_interface() {
 	#
@@ -181,14 +200,25 @@ is_interface() {
 	#
 	local iface=`$IP2UTIL -o -f inet addr show | grep " $1 " \
 		| cut -d ' ' -f2 | sort -u | grep -v '^ipsec[0-9][0-9]*$'`
-        [ "$iface" != "" ]
+		[ "$iface" != "" ]
+}
+
+infiniband_status()
+{
+	local device="$OCF_RESKEY_infiniband_device"
+
+	if [ -n "$OCF_RESKEY_infiniband_port" ]; then
+		device="${OCF_RESKEY_infiniband_device}:${OCF_RESKEY_infiniband_port}"
+	fi
+	# exit status is 0 only if the ibstatus output for the device mentions ACTIVE
+	ibstatus "${device}" | grep -q ACTIVE
 }
 
 if_init() {
 	local rc
 
 	if [ X"$OCF_RESKEY_interface" = "X" ]; then
-		ocf_log err "Interface name (the interface parameter) is mandatory"
+		ocf_exit_reason "Interface name (the interface parameter) is mandatory"
 		exit $OCF_ERR_CONFIGURED
 	fi
 
@@ -196,60 +226,67 @@ if_init() {
 
 	if is_interface $NIC
 	then
-	  case "$NIC" in
-	    *:*) ocf_log err "Do not specify a virtual interface : $OCF_RESKEY_interface"
-	         exit $OCF_ERR_CONFIGURED;;
-	    *)  ;;
-	  esac
+		case "$NIC" in
+			*:*) ocf_exit_reason "Do not specify a virtual interface : $OCF_RESKEY_interface"
+				 exit $OCF_ERR_CONFIGURED;;
+			*)   ;;
+		esac
 	else
-	  case $__OCF_ACTION in
-	    validate-all) ocf_log err "Interface $NIC does not exist"
-                            exit $OCF_ERR_CONFIGURED;;
-	    *)   	    ocf_log warn "Interface $NIC does not exist"
-                            ## It might be a bond interface which is temporarily not available, therefore we want to continue here
-	                    ;;
-	  esac
+		case $__OCF_ACTION in
+			validate-all)
+				ocf_exit_reason "Interface $NIC does not exist"
+				exit $OCF_ERR_CONFIGURED;;
+			*)	
+				## It might be a bond interface which is temporarily not available, therefore we want to continue here
+				ocf_log warn "Interface $NIC does not exist"
+				;;
+		esac
 	fi
 
 	: ${OCF_RESKEY_multiplier:="1"}
 	if ! ocf_is_decimal "$OCF_RESKEY_multiplier"; then
-		ocf_log err "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]"
+		ocf_exit_reason "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]"
 		exit $OCF_ERR_CONFIGURED
 	fi
 	
 	ATTRNAME=${OCF_RESKEY_name:-"ethmonitor-$NIC"}
 	
-        REP_COUNT=${OCF_RESKEY_repeat_count:-5}
+	REP_COUNT=${OCF_RESKEY_repeat_count:-5}
 	if ! ocf_is_decimal "$REP_COUNT" -o [ $REP_COUNT -lt 1 ]; then
-		ocf_log err "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]"
+		ocf_exit_reason "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]"
 		exit $OCF_ERR_CONFIGURED
-        fi
+	fi
 	REP_INTERVAL_S=${OCF_RESKEY_repeat_interval:-10}
 	if ! ocf_is_decimal "$REP_INTERVAL_S"; then
-		ocf_log err "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]"
+		ocf_exit_reason "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]"
 		exit $OCF_ERR_CONFIGURED
 	fi
 	: ${OCF_RESKEY_pktcnt_timeout:="5"}
 	if ! ocf_is_decimal "$OCF_RESKEY_pktcnt_timeout"; then
-		ocf_log err "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]"
+		ocf_exit_reason "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]"
 		exit $OCF_ERR_CONFIGURED
 	fi
 	: ${OCF_RESKEY_arping_count:="1"}
 	if ! ocf_is_decimal "$OCF_RESKEY_arping_count"; then
-		ocf_log err "Invalid OCF_RESKEY_arping_count [$OCF_RESKEY_arping_count]"
+		ocf_exit_reason "Invalid OCF_RESKEY_arping_count [$OCF_RESKEY_arping_count]"
 		exit $OCF_ERR_CONFIGURED
 	fi
 	: ${OCF_RESKEY_arping_timeout:="1"}
 	if ! ocf_is_decimal "$OCF_RESKEY_arping_timeout"; then
-		ocf_log err "Invalid OCF_RESKEY_arping_timeout [$OCF_RESKEY_arping_count]"
+		ocf_exit_reason "Invalid OCF_RESKEY_arping_timeout [$OCF_RESKEY_arping_timeout]"
 		exit $OCF_ERR_CONFIGURED
 	fi
 	: ${OCF_RESKEY_arping_cache_entries:="5"}
 	if ! ocf_is_decimal "$OCF_RESKEY_arping_cache_entries"; then
-		ocf_log err "Invalid OCF_RESKEY_arping_cache_entries [$OCF_RESKEY_arping_cache_entries]"
+		ocf_exit_reason "Invalid OCF_RESKEY_arping_cache_entries [$OCF_RESKEY_arping_cache_entries]"
 		exit $OCF_ERR_CONFIGURED
 	fi
-  return $OCF_SUCCESS
+
+	if [ -n "$OCF_RESKEY_infiniband_device" ]; then
+		#ibstatus is required if an infiniband_device is provided
+		check_binary ibstatus
+	fi
+	return $OCF_SUCCESS
 }
 
 # get the link status on $NIC
@@ -277,7 +314,7 @@ watch_pkt_counter () {
 	for n in `seq $(( $OCF_RESKEY_pktcnt_timeout * 10 ))`; do
 		sleep 0.1
 		RX_PACKETS_NEW="`get_rx_packets`"
-		ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD    RX_PACKETS_NEW: $RX_PACKETS_NEW"
+		ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD	RX_PACKETS_NEW: $RX_PACKETS_NEW"
 		if [ "$RX_PACKETS_OLD" -ne "$RX_PACKETS_NEW" ]; then
 			ocf_log debug "we received some packets."
 			return 0
@@ -308,7 +345,7 @@ do_arping () {
 }
 
 #
-# 	Check the interface depending on the level given as parameter: $OCF_RESKEY_check_level
+# Check the interface depending on the level given as parameter: $OCF_RESKEY_check_level
 #
 # 09: check for nonempty ARP cache
 # 10: watch for packet counter changes
@@ -322,21 +359,47 @@ do_arping () {
 # the tests for higher check levels are run.
 #
 if_check () {
+	local arp_list
 	# always check link status first
 	link_status="`get_link_status`"
 	ocf_log debug "link_status: $link_status (1=up, 0=down)"
-        [ $link_status -eq 0 ] && return $OCF_NOT_RUNNING
+
+	if [ $link_status -eq 0 ]; then
+		ocf_log notice "link_status: DOWN"
+		return $OCF_NOT_RUNNING
+	fi
+
+	# if this is an infiniband device, try ibstatus script
+	if [ -n "$OCF_RESKEY_infiniband_device" ]; then
+		if infiniband_status; then
+			return $OCF_SUCCESS
+		fi
+		ocf_log info "Infiniband device $OCF_RESKEY_infiniband_device is not available, check ibstatus for more information"
+		return $OCF_NOT_RUNNING	
+	fi
 
 	# watch for packet counter changes
-	ocf_log debug "watch for packet counter changes" 
-	watch_pkt_counter && return $OCF_SUCCESS
+	ocf_log debug "watch for packet counter changes"
+	watch_pkt_counter
+	if [ $? -eq 0 ]; then
+		return $OCF_SUCCESS
+	else 
+		ocf_log debug "No packets received during packet watch timeout"
+	fi
 
 	# check arping ARP cache entries
-	ocf_log debug "check arping ARP cache entries" 
-	for ip in `get_arp_list`; do
+	ocf_log debug "check arping ARP cache entries"
+	arp_list=`get_arp_list`
+	for ip in `echo $arp_list`; do
 		do_arping $ip && return $OCF_SUCCESS
 	done
 
+	# if we get here, the ethernet device is considered not running.
+	# provide some logging information
+	if [ -z "$arp_list" ]; then
+		ocf_log info "No ARP cache entries found to arping" 
+	fi
+
 	# watch for packet counter changes in promiscios mode
 #	ocf_log debug "watch for packet counter changes in promiscios mode" 
 	# be sure switch off promiscios mode in any case
@@ -362,67 +425,89 @@ END
 }
 
 set_cib_value() {
-    local score=`expr $1 \* $OCF_RESKEY_multiplier`
-    attrd_updater -n $ATTRNAME -v $score -q
-    local rc=$?
-    case $rc in
-        0) ocf_log debug "attrd_updater: Updated $ATTRNAME = $score" ;;
-        *) ocf_log warn "attrd_updater: Could not update $ATTRNAME = $score: rc=$rc";;
-    esac
-    return $rc
+	local score=`expr $1 \* $OCF_RESKEY_multiplier`
+	attrd_updater -n $ATTRNAME -v $score -q
+	local rc=$?
+	case $rc in
+		0) ocf_log debug "attrd_updater: Updated $ATTRNAME = $score" ;;
+		*) ocf_log warn "attrd_updater: Could not update $ATTRNAME = $score: rc=$rc";;
+	esac
+	return $rc
 }
 
 if_monitor() {
-    ha_pseudo_resource $OCF_RESOURCE_INSTANCE monitor
-    local pseudo_status=$?
-    if [ $pseudo_status -ne $OCF_SUCCESS ]; then
-      exit $pseudo_status
-    fi
-    
-    local mon_rc=$OCF_NOT_RUNNING
-    local attr_rc=$OCF_NOT_RUNNING
-    local runs=0
-    local start_time
-    local end_time
-    local sleep_time
-    while [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]
-    do
-      start_time=`date +%s%N`
-      if_check
-      mon_rc=$?
-      REP_COUNT=$(( $REP_COUNT - 1 ))
-      if [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]; then
-        ocf_log warn "Monitoring of $OCF_RESOURCE_INSTANCE failed, $REP_COUNT retries left."
-	end_time=`date +%s%N`
-	sleep_time=`echo "scale=9; ( $start_time + ( $REP_INTERVAL_S * 1000000000 ) - $end_time ) / 1000000000" | bc -q 2> /dev/null`
-        sleep $sleep_time 2> /dev/null
-        runs=$(($runs + 1))
-      fi
-
-      if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then
-        ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error"
-      fi
-    done
-    
-    ocf_log debug "Monitoring return code: $mon_rc"
-    if [ $mon_rc -eq $OCF_SUCCESS ]; then
-      set_cib_value 1
-      attr_rc=$?
-    else
-      ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed."
-      set_cib_value 0
-      attr_rc=$?
-    fi
-
-    ## The resource should not fail, if the interface is down. It should fail, if the update of the CIB variable has errors.
-    ## To react on the interface failure you must use constraints based on the CIB variable value, not on the resource itself.
-    exit $attr_rc
+	ha_pseudo_resource $OCF_RESOURCE_INSTANCE monitor
+	local pseudo_status=$?
+	if [ $pseudo_status -ne $OCF_SUCCESS ]; then
+		exit $pseudo_status
+	fi
+	
+	local mon_rc=$OCF_NOT_RUNNING
+	local attr_rc=$OCF_NOT_RUNNING
+	local runs=0
+	local start_time
+	local end_time
+	local sleep_time
+	while [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]
+	do
+		start_time=`date +%s%N`
+		if_check
+		mon_rc=$?
+		REP_COUNT=$(( $REP_COUNT - 1 ))
+		if [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]; then
+			ocf_log warn "Monitoring of $OCF_RESOURCE_INSTANCE failed, $REP_COUNT retries left."
+			end_time=`date +%s%N`
+			sleep_time=`echo "scale=9; ( $start_time + ( $REP_INTERVAL_S * 1000000000 ) - $end_time ) / 1000000000" | bc -q 2> /dev/null`
+			sleep $sleep_time 2> /dev/null
+			runs=$(($runs + 1))
+		fi
+
+		if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then
+			ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error"
+		fi
+	done
+	
+	ocf_log debug "Monitoring return code: $mon_rc"
+	if [ $mon_rc -eq $OCF_SUCCESS ]; then
+		set_cib_value 1
+		attr_rc=$?
+	else
+		ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed."
+		set_cib_value 0
+		attr_rc=$?
+	fi
+
+	## The resource should not fail, if the interface is down. It should fail, if the update of the CIB variable has errors.
+	## To react on the interface failure you must use constraints based on the CIB variable value, not on the resource itself.
+	exit $attr_rc
+}
+
+if_stop()
+{
+	attrd_updater -D -n "$ATTRNAME"	# -D deletes the attribute; a failure here is not fatal
+	ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop
 }
 
+if_start()
+{
+	local rc
+	# the pseudo resource state must exist before the first check can run
+	ha_pseudo_resource $OCF_RESOURCE_INSTANCE start
+	rc=$?
+	if [ $rc -ne $OCF_SUCCESS ]; then
+		ocf_exit_reason "Failure to create ethmonitor state file"
+		return $rc
+	fi
+	# perform the first monitor during the start operation; if_monitor ends
+	# the script via exit, so its status becomes the result of start
+	if_monitor
+}
+
+
 if_validate() {
-    check_binary $IP2UTIL
-    check_binary arping
-    if_init
+	check_binary $IP2UTIL
+	check_binary arping
+	if_init
 }
 
 case $__OCF_ACTION in
@@ -436,18 +521,17 @@ esac
 if_validate
 
 case $__OCF_ACTION in
-start)		ha_pseudo_resource $OCF_RESOURCE_INSTANCE start
+start)		if_start
 		exit $?
 		;;
-stop)		attrd_updater -D -n $ATTRNAME
-                ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop
+stop)		if_stop
 		exit $?
 		;;
 monitor|status)	if_monitor
 		exit $?
 		;;
 validate-all)	exit $?
-                ;;
+		;;
 *)		if_usage
 		exit $OCF_ERR_UNIMPLEMENTED
 		;;
-- 
1.8.4.2