From 8c92227bce9cc4fe177eea5b2f7c9016e96434f9 Mon Sep 17 00:00:00 2001
From: David Vossel <dvossel@redhat.com>
Date: Mon, 29 Jun 2015 13:03:17 -0500
Subject: [PATCH 1/3] bz1214360-NovaCompute-update1.patch
---
doc/man/Makefile.am | 1 +
heartbeat/Makefile.am | 3 +-
heartbeat/NovaCompute | 73 ++++++------
heartbeat/NovaEvacuate | 311 +++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 352 insertions(+), 36 deletions(-)
create mode 100755 heartbeat/NovaEvacuate
diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
index 42a57fe..d32426b 100644
--- a/doc/man/Makefile.am
+++ b/doc/man/Makefile.am
@@ -74,6 +74,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \
ocf_heartbeat_ManageRAID.7 \
ocf_heartbeat_ManageVE.7 \
ocf_heartbeat_NovaCompute.7 \
+ ocf_heartbeat_NovaEvacuate.7 \
ocf_heartbeat_Pure-FTPd.7 \
ocf_heartbeat_Raid1.7 \
ocf_heartbeat_Route.7 \
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
index 0bebf97..1034632 100644
--- a/heartbeat/Makefile.am
+++ b/heartbeat/Makefile.am
@@ -52,7 +52,8 @@ send_ua_SOURCES = send_ua.c IPv6addr_utils.c
IPv6addr_LDADD = -lplumb $(LIBNETLIBS)
send_ua_LDADD = $(LIBNETLIBS)
-osp_SCRIPTS = NovaCompute
+osp_SCRIPTS = NovaCompute \
+ NovaEvacuate
ocf_SCRIPTS = ClusterMon \
CTDB \
diff --git a/heartbeat/NovaCompute b/heartbeat/NovaCompute
index f71abeb..09eee38 100644
--- a/heartbeat/NovaCompute
+++ b/heartbeat/NovaCompute
@@ -107,15 +107,26 @@ Disable shared storage recovery for instances. Use at your own risk!
<content type="boolean" default="0" />
</parameter>
+<parameter name="evacuation_delay" unique="0" required="0">
+<longdesc lang="en">
+How long to wait for nova to finish evacuating instances elsewhere
+before starting nova-compute. Only used when the agent detects
+evacuations might be in progress.
+
+You may need to increase the start timeout when increasing this value.
+</longdesc>
+<shortdesc lang="en">Delay to allow evacuations time to complete</shortdesc>
+<content type="integer" default="120" />
+</parameter>
+
</parameters>
<actions>
-<action name="start" timeout="120" />
+<action name="start" timeout="600" />
<action name="stop" timeout="300" />
<action name="monitor" timeout="20" interval="10" depth="0"/>
<action name="validate-all" timeout="20" />
<action name="meta-data" timeout="5" />
-<action name="notify" timeout="600" />
</actions>
</resource-agent>
END
@@ -132,7 +143,7 @@ sigterm_handler() {
nova_usage() {
cat <<END
-usage: $0 {start|stop|monitor|notify|validate-all|meta-data}
+usage: $0 {start|stop|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
@@ -148,6 +159,26 @@ nova_start() {
return $OCF_SUCCESS
fi
+ state=$(attrd_updater -p -n evacute -N ${NOVA_HOST} | sed -e 's/.*value=//' | tr -d '"' )
+ if [ "x$state" = x ]; then
+ : never been fenced
+
+ elif [ "x$state" = xno ]; then
+ : has been evacuated, however it could have been 1s ago
+ ocf_log info "Pausing to give evacuations from ${NOVA_HOST} time to complete"
+ sleep ${OCF_RESKEY_evacuation_delay}
+
+ else
+ ocf_log info "Waiting for pending evacuations from ${NOVA_HOST}"
+ while [ "x$state" != "xno" ]; do
+ state=$(attrd_updater -p -n evacute -N ${NOVA_HOST} | sed -e 's/.*value=//' | tr -d '"' )
+ sleep 5
+ done
+
+ ocf_log info "Pausing to give evacuations from ${NOVA_HOST} time to complete"
+ sleep ${OCF_RESKEY_evacuation_delay}
+ fi
+
export LIBGUESTFS_ATTACH_METHOD=appliance
su nova -s /bin/sh -c /usr/bin/nova-compute &
@@ -212,33 +243,7 @@ nova_monitor() {
}
nova_notify() {
- if [ "x${OCF_RESKEY_CRM_meta_notify_operation}" != "xstop" ]; then
- return $OCF_SUCCESS
- elif [ "x${OCF_RESKEY_CRM_meta_notify_type}" != "xpost" ]; then
- return $OCF_SUCCESS
- fi
-
- # Only the first node not stopping performs evacuates for now
- # Can we allow all of them to do it? It would make this block much simpler.
- for host in ${OCF_RESKEY_CRM_meta_notify_active_uname}; do
- for stop in ${OCF_RESKEY_CRM_meta_notify_stop_uname}; do
- if [ "$stop" = "$host" ]; then
- : $host is one of the nodes that is stopping
-
- elif [ "x$(echo ${host} | awk -F. '{print $1}')" != "x$(uname -n | awk -F. '{print $1}')" ]; then
- : We are not the first non-stopping node
- return $OCF_SUCCESS
-
- else
- # Also repeat for any peer NOT in active_uname somehow?
- for node in $OCF_RESKEY_CRM_meta_notify_stop_uname; do
- ocf_log info "Performing evacuations for $node"
- fence_compute ${fence_options} -o reboot -n $node
- done
- return $OCF_SUCCESS
- fi
- done
- done
+ return $OCF_SUCCESS
}
nova_validate() {
@@ -246,7 +251,6 @@ nova_validate() {
fence_options=""
check_binary openstack-config
- check_binary fence_compute
check_binary nova-compute
if [ ! -f /etc/nova/nova.conf ]; then
@@ -337,6 +341,7 @@ nova_validate() {
return $rc
}
+: ${OCF_RESKEY_evacuation_delay=120}
case $__OCF_ACTION in
meta-data) meta_data
exit $OCF_SUCCESS
@@ -346,12 +351,10 @@ usage|help) nova_usage
;;
esac
-nova_validate
-
case $__OCF_ACTION in
-start) nova_start;;
+start) nova_validate; nova_start;;
stop) nova_stop;;
-monitor) nova_monitor;;
+monitor) nova_validate; nova_monitor;;
notify) nova_notify;;
validate-all) exit $OCF_SUCCESS;;
*) nova_usage
diff --git a/heartbeat/NovaEvacuate b/heartbeat/NovaEvacuate
new file mode 100755
index 0000000..f9a24f1
--- /dev/null
+++ b/heartbeat/NovaEvacuate
@@ -0,0 +1,311 @@
+#!/bin/sh
+#
+#
+# NovaEvacuate agent evacuates compute nodes that have been flagged for evacuation.
+#
+# Copyright (c) 2015
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like. Any license provided herein, whether implied or
+# otherwise, applies only to this software file. Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+
+#######################################################################
+# Initialization:
+
+# Default the OCF function library location (only if not already set) and source it.
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+# Use the first command-line argument as the action unless __OCF_ACTION is
+# already set.
+: ${__OCF_ACTION=$1}
+
+#######################################################################
+# Print the resource agent metadata (OCF XML) on stdout.
+meta_data() {
+    cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="NovaEvacuate" version="1.0">
+<version>1.0</version>
+
+<longdesc lang="en">
+Facility for tracking a list of compute nodes and reliably evacuating the ones that fence_evacuate has flagged.
+</longdesc>
+<shortdesc lang="en">Evacuator for OpenStack Nova Compute Server</shortdesc>
+
+<parameters>
+
+<parameter name="auth_url" unique="0" required="1">
+<longdesc lang="en">
+Authorization URL for connecting to keystone in admin context
+</longdesc>
+<shortdesc lang="en">Authorization URL</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="username" unique="0" required="1">
+<longdesc lang="en">
+Username for connecting to keystone in admin context
+</longdesc>
+<shortdesc lang="en">Username</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="password" unique="0" required="1">
+<longdesc lang="en">
+Password for connecting to keystone in admin context
+</longdesc>
+<shortdesc lang="en">Password</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="tenant_name" unique="0" required="1">
+<longdesc lang="en">
+Tenant name for connecting to keystone in admin context.
+Note that with Keystone V3 tenant names are only unique within a domain.
+</longdesc>
+<shortdesc lang="en">Tenant name</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="endpoint_type" unique="0" required="0">
+<longdesc lang="en">
+Nova API location (internal, public or admin URL)
+</longdesc>
+<shortdesc lang="en">Nova API location (internal, public or admin URL)</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="no_shared_storage" unique="0" required="0">
+<longdesc lang="en">
+Disable shared storage recovery for instances. Use at your own risk!
+</longdesc>
+<shortdesc lang="en">Disable shared storage recovery for instances</shortdesc>
+<content type="boolean" default="0" />
+</parameter>
+
+</parameters>
+<actions>
+<action name="start" timeout="20" />
+<action name="stop" timeout="20" />
+<action name="monitor" timeout="600" interval="10" depth="0"/>
+<action name="validate-all" timeout="20" />
+<action name="meta-data" timeout="5" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+
+# Do not exit on TERM: log it and carry on (tests that lrmd makes sure we do exit).
+sigterm_handler() {
+    ocf_log info "They use TERM to bring us down. No such luck."
+    return
+}
+trap sigterm_handler TERM
+
+# Print the list of supported actions to stdout.
+evacuate_usage() {
+    printf '%s\n' \
+        "usage: $0 {start|stop|monitor|validate-all|meta-data}" \
+        "" \
+        "Expects to have a fully populated OCF RA-compliant environment set."
+}
+# Stop action: remove the state file that evacuate_monitor checks for; returns $OCF_SUCCESS.
+evacuate_stop() {
+ rm -f "$statefile"
+ return $OCF_SUCCESS
+}
+# Start action: create the state file and return the exit status of touch.
+evacuate_start() {
+ touch "$statefile"
+ # Do not invoke monitor here so that the start timeout can be low
+ return $?
+}
+# Set the "evacute" attribute of node $1 to value $2; returns attrd_updater's exit status.
+update_evacuation() {
+    attrd_updater -p -n evacute -Q -N "${1}" -v "${2}"
+    arc=$?
+    if [ "${arc}" -ne 0 ]; then
+        ocf_log warn "Can not set evacuation state of ${1} to ${2}: ${arc}"
+    fi
+    return "${arc}"
+}
+# Process (node, state) argument pairs, evacuating nodes whose state requires it; returns $OCF_SUCCESS.
+handle_evacuations() {
+ while [ $# -gt 0 ]; do
+ node=$1
+ state=$2
+ shift; shift;
+ need_evacuate=0
+ # Decide from the state whether this node needs evacuating
+ case $state in
+ "") ;;
+ no) ocf_log debug "$node is either fine or already handled";;
+ yes) need_evacuate=1;;
+ *@*)
+ where=$(echo $state | awk -F@ '{print $1}')
+ when=$(echo $state | awk -F@ '{print $2}')
+ now=$(date +%s)
+ # A <host>@<timestamp> state is only retried once it is more than 60 seconds old
+ if [ $(($now - $when)) -gt 60 ]; then
+ ocf_log info "Processing partial evacuation of $node by $where at $when"
+ need_evacuate=1
+ else
+ # Give some time for any in-flight evacuations to either complete or fail
+ # Nova won't react well if there are two overlapping requests
+ ocf_log info "Deferring processing partial evacuation of $node by $where at $when"
+ fi
+ ;;
+ esac
+
+ if [ $need_evacuate = 1 ]; then
+ found=0
+ ocf_log notice "Initiating evacuation of $node"
+ # Only proceed if the node appears in the output of "fence_compute -o list"
+ for known in $(fence_compute ${fence_options} -o list | tr -d ','); do
+ if [ ${known} = ${node} ]; then
+ found=1
+ break
+ fi
+ done
+
+ if [ $found = 0 ]; then
+ ocf_log info "Nova does not know about ${node}"
+ # Don't mark as no because perhaps nova is unavailable right now
+ continue
+ fi
+ # Record <this host>@<epoch seconds> first; if that fails, stop processing and return
+ update_evacuation ${node} "$(uname -n)@$(date +%s)"
+ if [ $? != 0 ]; then
+ return $OCF_SUCCESS
+ fi
+
+ fence_compute ${fence_options} -o reboot -n $node
+ rc=$?
+ # On success set the state to "no"; on failure set it back to "yes"
+ if [ $rc = 0 ]; then
+ update_evacuation ${node} no
+ ocf_log notice "Completed evacuation of $node"
+ else
+ ocf_log warn "Evacuation of $node failed: $rc"
+ update_evacuation ${node} yes
+ fi
+ fi
+ done
+
+ return $OCF_SUCCESS
+}
+# Monitor action: report not-running without the state file, else process evacuations.
+evacuate_monitor() {
+    [ -f "$statefile" ] || return $OCF_NOT_RUNNING
+
+    # Reduce "attrd_updater -A" output to fields 4 and 6 of each line
+    # (presumably node name and attribute value) and pass them on as arguments.
+    handle_evacuations $(attrd_updater -n evacute -A | tr '="' ' ' | awk '{print $4" "$6}')
+    return $OCF_SUCCESS
+}
+# Check the state directory and required parameters and build fence_options; returns $rc.
+evacuate_validate() {
+    rc=$OCF_SUCCESS
+    fence_options=""
+
+    check_binary fence_compute
+
+    # Is the state directory writable?
+    state_dir=$(dirname "$statefile")
+    if ! touch "$state_dir/$$"; then
+        ocf_exit_reason "Invalid state directory: $state_dir"
+        # Callers ignore the return value, so exit like every other failure here
+        exit $OCF_ERR_ARGS
+    fi
+    rm -f "$state_dir/$$"
+
+    if [ -z "${OCF_RESKEY_auth_url}" ]; then
+        ocf_exit_reason "auth_url not configured"
+        exit $OCF_ERR_CONFIGURED
+    fi
+
+    fence_options="${fence_options} -k ${OCF_RESKEY_auth_url}"
+
+    if [ -z "${OCF_RESKEY_username}" ]; then
+        ocf_exit_reason "username not configured"
+        exit $OCF_ERR_CONFIGURED
+    fi
+
+    fence_options="${fence_options} -l ${OCF_RESKEY_username}"
+
+    if [ -z "${OCF_RESKEY_password}" ]; then
+        ocf_exit_reason "password not configured"
+        exit $OCF_ERR_CONFIGURED
+    fi
+    # NOTE(review): the password is passed to fence_compute as a command-line argument
+    fence_options="${fence_options} -p ${OCF_RESKEY_password}"
+
+    if [ -z "${OCF_RESKEY_tenant_name}" ]; then
+        ocf_exit_reason "tenant_name not configured"
+        exit $OCF_ERR_CONFIGURED
+    fi
+
+    fence_options="${fence_options} -t ${OCF_RESKEY_tenant_name}"
+
+    if [ -n "${OCF_RESKEY_no_shared_storage}" ]; then
+        if ocf_is_true "${OCF_RESKEY_no_shared_storage}"; then
+            fence_options="${fence_options} --no-shared-storage"
+        fi
+    fi
+
+    if [ -n "${OCF_RESKEY_endpoint_type}" ]; then
+        case ${OCF_RESKEY_endpoint_type} in
+            adminURL|publicURL|internalURL) ;;
+            *)
+                ocf_exit_reason "endpoint_type ${OCF_RESKEY_endpoint_type} not valid. Use adminURL or publicURL or internalURL"
+                exit $OCF_ERR_CONFIGURED
+                ;;
+        esac
+        fence_options="${fence_options} -e ${OCF_RESKEY_endpoint_type}"
+    fi
+
+    if [ $rc != $OCF_SUCCESS ]; then
+        exit $rc
+    fi
+    return $rc
+}
+# State file whose presence marks this resource instance as started.
+statefile="${HA_RSCTMP}/${OCF_RESOURCE_INSTANCE}.active"
+# Dispatch the requested action; stop, meta-data and usage skip validation.
+case $__OCF_ACTION in
+start)          evacuate_validate; evacuate_start;;
+stop)           evacuate_stop;;
+monitor)        evacuate_validate; evacuate_monitor;;
+meta-data)      meta_data
+                exit $OCF_SUCCESS
+                ;;
+usage|help)     evacuate_usage
+                exit $OCF_SUCCESS
+                ;;
+validate-all)   evacuate_validate;;
+*)              evacuate_usage
+                exit $OCF_ERR_UNIMPLEMENTED
+                ;;
+esac
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc
--
1.8.4.2