Blob Blame History Raw
From 8c92227bce9cc4fe177eea5b2f7c9016e96434f9 Mon Sep 17 00:00:00 2001
From: David Vossel <dvossel@redhat.com>
Date: Mon, 29 Jun 2015 13:03:17 -0500
Subject: [PATCH 1/3] bz1214360-NovaCompute-update1.patch

---
 doc/man/Makefile.am    |   1 +
 heartbeat/Makefile.am  |   3 +-
 heartbeat/NovaCompute  |  73 ++++++------
 heartbeat/NovaEvacuate | 311 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 352 insertions(+), 36 deletions(-)
 create mode 100755 heartbeat/NovaEvacuate

diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
index 42a57fe..d32426b 100644
--- a/doc/man/Makefile.am
+++ b/doc/man/Makefile.am
@@ -74,6 +74,7 @@ man_MANS	       = ocf_heartbeat_AoEtarget.7 \
                           ocf_heartbeat_ManageRAID.7 \
                           ocf_heartbeat_ManageVE.7 \
                           ocf_heartbeat_NovaCompute.7 \
+                          ocf_heartbeat_NovaEvacuate.7 \
                           ocf_heartbeat_Pure-FTPd.7 \
                           ocf_heartbeat_Raid1.7 \
                           ocf_heartbeat_Route.7 \
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
index 0bebf97..1034632 100644
--- a/heartbeat/Makefile.am
+++ b/heartbeat/Makefile.am
@@ -52,7 +52,8 @@ send_ua_SOURCES         = send_ua.c IPv6addr_utils.c
 IPv6addr_LDADD          = -lplumb $(LIBNETLIBS)
 send_ua_LDADD           = $(LIBNETLIBS)
 
-osp_SCRIPTS	     =  NovaCompute
+osp_SCRIPTS	     =  NovaCompute		\
+			NovaEvacuate
 
 ocf_SCRIPTS	     =  ClusterMon		\
 			CTDB			\
diff --git a/heartbeat/NovaCompute b/heartbeat/NovaCompute
index f71abeb..09eee38 100644
--- a/heartbeat/NovaCompute
+++ b/heartbeat/NovaCompute
@@ -107,15 +107,26 @@ Disable shared storage recovery for instances. Use at your own risk!
 <content type="boolean" default="0" />
 </parameter>
 
+<parameter name="evacuation_delay" unique="0" required="0">
+<longdesc lang="en">
+How long to wait for nova to finish evacuating instances elsewhere
+before starting nova-compute.  Only used when the agent detects
+evacuations might be in progress.
+
+You may need to increase the start timeout when increasing this value.
+</longdesc>
+<shortdesc lang="en">Delay to allow evacuations time to complete</shortdesc>
+<content type="integer" default="120" />
+</parameter>
+
 </parameters>
 
 <actions>
-<action name="start"        timeout="120" />
+<action name="start"        timeout="600" />
 <action name="stop"         timeout="300" />
 <action name="monitor"      timeout="20" interval="10" depth="0"/>
 <action name="validate-all" timeout="20" />
 <action name="meta-data"    timeout="5" />
-<action name="notify"       timeout="600" />
 </actions>
 </resource-agent>
 END
@@ -132,7 +143,7 @@ sigterm_handler() {
 
 nova_usage() {
 	cat <<END
-usage: $0 {start|stop|monitor|notify|validate-all|meta-data}
+usage: $0 {start|stop|monitor|validate-all|meta-data}
 
 Expects to have a fully populated OCF RA-compliant environment set.
 END
@@ -148,6 +159,26 @@ nova_start() {
 	return $OCF_SUCCESS
     fi
 
+    state=$(attrd_updater -p -n evacute -N ${NOVA_HOST} | sed -e 's/.*value=//' | tr -d '"' )
+    if [ "x$state" = x ]; then
+	: never been fenced
+
+    elif [ "x$state" = xno ]; then
+	: has been evacuated, however it could have been 1s ago
+	ocf_log info "Pausing to give evacuations from ${NOVA_HOST} time to complete"
+	sleep ${OCF_RESKEY_evacuation_delay}
+
+    else
+	ocf_log info "Waiting for pending evacuations from ${NOVA_HOST}"
+	while [ "x$state" != "xno" ]; do
+	    state=$(attrd_updater -p -n evacute -N ${NOVA_HOST} | sed -e 's/.*value=//' | tr -d '"' )
+	    sleep 5
+	done
+
+	ocf_log info "Pausing to give evacuations from ${NOVA_HOST} time to complete"
+	sleep ${OCF_RESKEY_evacuation_delay}
+    fi
+
     export LIBGUESTFS_ATTACH_METHOD=appliance
     su nova -s /bin/sh -c /usr/bin/nova-compute &
 
@@ -212,33 +243,7 @@ nova_monitor() {
 }
 
 nova_notify() {
-    if [ "x${OCF_RESKEY_CRM_meta_notify_operation}" != "xstop" ]; then
-	return $OCF_SUCCESS
-    elif [ "x${OCF_RESKEY_CRM_meta_notify_type}" != "xpost" ]; then
-	return $OCF_SUCCESS
-    fi
-
-    # Only the first node not stopping performs evacuates for now
-    # Can we allow all of them to do it?  It would make this block much simpler.
-    for host in ${OCF_RESKEY_CRM_meta_notify_active_uname}; do
-	for stop in ${OCF_RESKEY_CRM_meta_notify_stop_uname}; do
-	    if [ "$stop" = "$host" ]; then
-		: $host is one of the nodes that is stopping
-
-	    elif [ "x$(echo ${host} | awk -F. '{print $1}')" != "x$(uname -n | awk -F. '{print $1}')" ]; then
-		: We are not the first non-stopping node
-		return $OCF_SUCCESS
-
-	    else
-		# Also repeat for any peer NOT in active_uname somehow?
-		for node in $OCF_RESKEY_CRM_meta_notify_stop_uname; do
-		    ocf_log info "Performing evacuations for $node"
-		    fence_compute ${fence_options} -o reboot -n $node
-		done
-		return $OCF_SUCCESS
-	    fi
-	done
-    done
+    return $OCF_SUCCESS
 }
 
 nova_validate() {
@@ -246,7 +251,6 @@ nova_validate() {
     fence_options=""
 
     check_binary openstack-config
-    check_binary fence_compute
     check_binary nova-compute
 
     if [ ! -f /etc/nova/nova.conf ]; then
@@ -337,6 +341,7 @@ nova_validate() {
     return $rc
 }
 
+: ${OCF_RESKEY_evacuation_delay=120}
 case $__OCF_ACTION in
 meta-data)	meta_data
 		exit $OCF_SUCCESS
@@ -346,12 +351,10 @@ usage|help)	nova_usage
 		;;
 esac
 
-nova_validate
-
 case $__OCF_ACTION in
-start)		nova_start;;
+start)		nova_validate; nova_start;;
 stop)		nova_stop;;
-monitor)	nova_monitor;;
+monitor)	nova_validate; nova_monitor;;
 notify)		nova_notify;;
 validate-all)	exit $OCF_SUCCESS;;
 *)		nova_usage
diff --git a/heartbeat/NovaEvacuate b/heartbeat/NovaEvacuate
new file mode 100755
index 0000000..f9a24f1
--- /dev/null
+++ b/heartbeat/NovaEvacuate
@@ -0,0 +1,311 @@
+#!/bin/sh
+#
+#
+# NovaCompute agent manages compute daemons.
+#
+# Copyright (c) 2015
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.  Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+
+#######################################################################
+# Initialization:
+
+###
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+###
+
+: ${__OCF_ACTION=$1}
+
+#######################################################################
+
+meta_data() {
+	cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="NovaEvacuate" version="1.0">
+<version>1.0</version>
+
+<longdesc lang="en">
+Facility for tacking a list of compute nodes and reliably evacuating the ones that fence_evacuate has flagged.
+</longdesc>
+<shortdesc lang="en">Evacuator for OpenStack Nova Compute Server</shortdesc>
+
+<parameters>
+
+<parameter name="auth_url" unique="0" required="1">
+<longdesc lang="en">
+Authorization URL for connecting to keystone in admin context
+</longdesc>
+<shortdesc lang="en">Authorization URL</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="username" unique="0" required="1">
+<longdesc lang="en">
+Username for connecting to keystone in admin context
+</longdesc>
+<shortdesc lang="en">Username</shortdesc>
+</parameter>
+
+<parameter name="password" unique="0" required="1">
+<longdesc lang="en">
+Password for connecting to keystone in admin context
+</longdesc>
+<shortdesc lang="en">Password</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="tenant_name" unique="0" required="1">
+<longdesc lang="en">
+Tenant name for connecting to keystone in admin context.
+Note that with Keystone V3 tenant names are only unique within a domain.
+</longdesc>
+<shortdesc lang="en">Tenant name</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="endpoint_type" unique="0" required="0">
+<longdesc lang="en">
+Nova API location (internal, public or admin URL)
+</longdesc>
+<shortdesc lang="en">Nova API location (internal, public or admin URL)</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="no_shared_storage" unique="0" required="0">
+<longdesc lang="en">
+Disable shared storage recovery for instances. Use at your own risk!
+</longdesc>
+<shortdesc lang="en">Disable shared storage recovery for instances</shortdesc>
+<content type="boolean" default="0" />
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start"        timeout="20" />
+<action name="stop"         timeout="20" />
+<action name="monitor"      timeout="600" interval="10" depth="0"/>
+<action name="validate-all" timeout="20" />
+<action name="meta-data"    timeout="5" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+
+# don't exit on TERM, to test that lrmd makes sure that we do exit
+trap sigterm_handler TERM
+sigterm_handler() {
+	ocf_log info "They use TERM to bring us down. No such luck."
+	return
+}
+
+evacuate_usage() {
+	cat <<END
+usage: $0 {start|stop|monitor|validate-all|meta-data}
+
+Expects to have a fully populated OCF RA-compliant environment set.
+END
+}
+
+evacuate_stop() {
+    rm -f "$statefile"
+    return $OCF_SUCCESS
+}
+
+evacuate_start() {
+    touch "$statefile"
+    # Do not invole monitor here so that the start timeout can be low
+    return $?
+}
+
+update_evacuation() {
+    attrd_updater -p -n evacute -Q -N ${1} -v ${2}
+    arc=$?
+    if [ ${arc} != 0 ]; then
+	ocf_log warn "Can not set evacuation state of ${1} to ${2}: ${arc}"
+    fi
+    return ${arc}
+}
+
+handle_evacuations() {
+    while [ $# -gt 0 ]; do
+	node=$1 
+	state=$2
+	shift; shift;
+	need_evacuate=0
+
+	case $state in
+	    "") ;;
+	    no)  ocf_log debug "$node is either fine or already handled";;
+	    yes) need_evacuate=1;; 
+	    *@*)
+		where=$(echo $state | awk -F@ '{print $1}')
+		when=$(echo $state | awk -F@ '{print $2}')
+		now=$(date +%s)
+
+		if [ $(($now - $when)) -gt 60 ]; then
+		    ocf_log info "Processing partial evacuation of $node by $where at $when"		
+		    need_evacuate=1
+		else
+		    # Give some time for any in-flight evacuations to either complete or fail
+		    # Nova won't react well if there are two overlapping requests 
+		    ocf_log info "Deferring processing partial evacuation of $node by $where at $when"
+		fi
+		;;
+	esac
+
+	if [ $need_evacuate = 1 ]; then
+	    found=0
+	    ocf_log notice "Initiating evacuation of $node"
+
+	    for known in $(fence_compute ${fence_options} -o list | tr -d ','); do
+		if [ ${known} = ${node} ]; then
+		    found=1
+		    break
+		fi
+	    done
+
+	    if [ $found = 0 ]; then
+		ocf_log info "Nova does not know about ${node}"
+		# Dont mark as no because perhaps nova is unavailable right now
+		continue
+	    fi
+
+	    update_evacuation ${node} "$(uname -n)@$(date +%s)"
+	    if [ $? != 0 ]; then
+		return $OCF_SUCCESS
+	    fi
+
+	    fence_compute ${fence_options} -o reboot -n $node
+	    rc=$?
+
+	    if [ $rc = 0 ]; then
+		update_evacuation ${node} no
+		ocf_log notice "Completed evacuation of $node"
+	    else
+		ocf_log warn "Evacuation of $node failed: $rc"
+		update_evacuation ${node} yes
+	    fi
+	fi
+    done
+
+    return $OCF_SUCCESS
+}
+
+evacuate_monitor() {
+    if [ ! -f "$statefile" ]; then
+	return $OCF_NOT_RUNNING
+    fi
+
+    handle_evacuations $(attrd_updater -n evacute -A | tr '="' '  ' | awk '{print $4" "$6}')
+    return $OCF_SUCCESS
+}
+
+evacuate_validate() {
+    rc=$OCF_SUCCESS
+    fence_options=""
+
+    check_binary fence_compute
+
+    # Is the state directory writable? 
+    state_dir=$(dirname $statefile)
+    touch "$state_dir/$$"
+    if [ $? != 0 ]; then
+	ocf_exit_reason "Invalid state directory: $state_dir"
+	return $OCF_ERR_ARGS
+    fi
+    rm -f "$state_dir/$$"
+
+    if [ -z "${OCF_RESKEY_auth_url}" ]; then
+	   ocf_exit_reason "auth_url not configured"
+	   exit $OCF_ERR_CONFIGURED
+    fi
+
+    fence_options="${fence_options} -k ${OCF_RESKEY_auth_url}"
+
+    if [ -z "${OCF_RESKEY_username}" ]; then
+	   ocf_exit_reason "username not configured"
+	   exit $OCF_ERR_CONFIGURED
+    fi
+
+    fence_options="${fence_options} -l ${OCF_RESKEY_username}"
+
+    if [ -z "${OCF_RESKEY_password}" ]; then
+	   ocf_exit_reason "password not configured"
+	   exit $OCF_ERR_CONFIGURED
+    fi
+
+    fence_options="${fence_options} -p ${OCF_RESKEY_password}"
+
+    if [ -z "${OCF_RESKEY_tenant_name}" ]; then
+	   ocf_exit_reason "tenant_name not configured"
+	   exit $OCF_ERR_CONFIGURED
+    fi
+
+    fence_options="${fence_options} -t ${OCF_RESKEY_tenant_name}"
+
+    if [ -n "${OCF_RESKEY_no_shared_storage}" ]; then
+	if ocf_is_true "${OCF_RESKEY_no_shared_storage}"; then
+	    fence_options="${fence_options} --no-shared-storage"
+	fi
+    fi
+
+    if [ -n "${OCF_RESKEY_endpoint_type}" ]; then
+	case ${OCF_RESKEY_endpoint_type} in
+	    adminURL|publicURL|internalURL) ;;
+	    *)
+		ocf_exit_reason "endpoint_type ${OCF_RESKEY_endpoint_type} not valid. Use adminURL or publicURL or internalURL"
+		exit $OCF_ERR_CONFIGURED
+	    ;;
+	esac
+	fence_options="${fence_options} -e ${OCF_RESKEY_endpoint_type}"
+    fi
+
+    if [ $rc != $OCF_SUCCESS ]; then
+	exit $rc
+    fi
+    return $rc
+}
+
+statefile="${HA_RSCTMP}/${OCF_RESOURCE_INSTANCE}.active"
+
+case $__OCF_ACTION in
+start)		evacuate_validate; evacuate_start;;
+stop)		evacuate_stop;;
+monitor)	evacuate_validate; evacuate_monitor;;
+meta-data)	meta_data
+		exit $OCF_SUCCESS
+		;;
+usage|help)	evacuate_usage
+		exit $OCF_SUCCESS
+		;;
+validate-all)	exit $OCF_SUCCESS;;
+*)		evacuate_usage
+		exit $OCF_ERR_UNIMPLEMENTED
+		;;
+esac
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc
-- 
1.8.4.2