|
|
919373 |
diff -uNr a/configure.ac b/configure.ac
|
|
|
919373 |
--- a/configure.ac 2020-04-16 11:54:08.466619607 +0200
|
|
|
919373 |
+++ b/configure.ac 2020-04-16 12:05:17.241352586 +0200
|
|
|
919373 |
@@ -30,6 +30,8 @@
|
|
|
919373 |
PKG_FEATURES=""
|
|
|
919373 |
|
|
|
919373 |
AC_CONFIG_AUX_DIR(.)
|
|
|
919373 |
+AC_CONFIG_MACRO_DIR([m4])
|
|
|
919373 |
+
|
|
|
919373 |
AC_CANONICAL_HOST
|
|
|
919373 |
|
|
|
919373 |
dnl Where #defines go (e.g. `AC_CHECK_HEADERS' below)
|
|
|
919373 |
@@ -72,6 +74,11 @@
|
|
|
919373 |
[AC_MSG_ERROR([systemd support requested but pkg-config unable to query systemd package])])
|
|
|
919373 |
with_systemdsystemunitdir=no],
|
|
|
919373 |
[with_systemdsystemunitdir="$def_systemdsystemunitdir"])])
|
|
|
919373 |
+if test "x$with_systemdsystemunitdir" != "xno" && \
|
|
|
919373 |
+ test "x${prefix}" != "xNONE" && \
|
|
|
919373 |
+ test "x${prefix}" != "x/usr"; then
|
|
|
919373 |
+ with_systemdsystemunitdir="${prefix}/$with_systemdsystemunitdir"
|
|
|
919373 |
+fi
|
|
|
919373 |
AS_IF([test "x$with_systemdsystemunitdir" != "xno"],
|
|
|
919373 |
[AC_SUBST([systemdsystemunitdir], [$with_systemdsystemunitdir])])
|
|
|
919373 |
AM_CONDITIONAL([HAVE_SYSTEMD], [test "x$with_systemdsystemunitdir" != "xno"])
|
|
|
919373 |
@@ -79,6 +86,11 @@
|
|
|
919373 |
AC_ARG_WITH([systemdtmpfilesdir],
|
|
|
919373 |
AS_HELP_STRING([--with-systemdtmpfilesdir=DIR], [Directory for systemd tmp files]),
|
|
|
919373 |
[], [with_systemdtmpfilesdir=$($PKGCONFIG --variable=tmpfilesdir systemd)])
|
|
|
919373 |
+ if test "x$with_systemdtmpfilesdir" != xno && \
|
|
|
919373 |
+ test "x${prefix}" != "xNONE" && \
|
|
|
919373 |
+ test "x${prefix}" != "x/usr"; then
|
|
|
919373 |
+ with_systemdtmpfilesdir="${prefix}/$with_systemdtmpfilesdir"
|
|
|
919373 |
+ fi
|
|
|
919373 |
if test "x$with_systemdtmpfilesdir" != xno; then
|
|
|
919373 |
AC_SUBST([systemdtmpfilesdir], [$with_systemdtmpfilesdir])
|
|
|
919373 |
fi
|
|
|
919373 |
@@ -501,12 +513,35 @@
|
|
|
919373 |
AC_SUBST(RM)
|
|
|
919373 |
AC_SUBST(TEST)
|
|
|
919373 |
|
|
|
919373 |
+dnl Ensure PYTHON is an absolute path
|
|
|
919373 |
+AC_PATH_PROG([PYTHON], [$PYTHON])
|
|
|
919373 |
+
|
|
|
919373 |
AM_PATH_PYTHON
|
|
|
919373 |
if test -z "$PYTHON"; then
|
|
|
919373 |
echo "*** Essential program python not found" 1>&2
|
|
|
919373 |
- exit 1
|
|
|
919373 |
fi
|
|
|
919373 |
|
|
|
919373 |
+AC_PYTHON_MODULE(googleapiclient)
|
|
|
919373 |
+AC_PYTHON_MODULE(pyroute2)
|
|
|
919373 |
+
|
|
|
919373 |
+AS_VERSION_COMPARE([$PYTHON_VERSION], [2.7], [BUILD_OCF_PY=0], [BUILD_OCF_PY=1], [BUILD_OCF_PY=1])
|
|
|
919373 |
+
|
|
|
919373 |
+BUILD_AZURE_EVENTS=1
|
|
|
919373 |
+if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then
|
|
|
919373 |
+ BUILD_AZURE_EVENTS=0
|
|
|
919373 |
+ AC_MSG_WARN("Not building azure-events")
|
|
|
919373 |
+fi
|
|
|
919373 |
+AM_CONDITIONAL(BUILD_AZURE_EVENTS, test $BUILD_AZURE_EVENTS -eq 1)
|
|
|
919373 |
+
|
|
|
919373 |
+BUILD_GCP_PD_MOVE=1
|
|
|
919373 |
+AM_CONDITIONAL(BUILD_GCP_PD_MOVE, test $BUILD_GCP_PD_MOVE -eq 1)
|
|
|
919373 |
+
|
|
|
919373 |
+BUILD_GCP_VPC_MOVE_ROUTE=1
|
|
|
919373 |
+AM_CONDITIONAL(BUILD_GCP_VPC_MOVE_ROUTE, test $BUILD_GCP_VPC_MOVE_ROUTE -eq 1)
|
|
|
919373 |
+
|
|
|
919373 |
+BUILD_GCP_VPC_MOVE_VIP=1
|
|
|
919373 |
+AM_CONDITIONAL(BUILD_GCP_VPC_MOVE_VIP, test $BUILD_GCP_VPC_MOVE_VIP -eq 1)
|
|
|
919373 |
+
|
|
|
919373 |
AC_PATH_PROGS(ROUTE, route)
|
|
|
919373 |
AC_DEFINE_UNQUOTED(ROUTE, "$ROUTE", path to route command)
|
|
|
919373 |
|
|
|
919373 |
@@ -541,6 +576,12 @@
|
|
|
919373 |
if test x"${STYLESHEET_PREFIX}" = x""; then
|
|
|
919373 |
DIRS=$(find "${datadir}" -name $(basename $(dirname ${DOCBOOK_XSL_PATH})) \
|
|
|
919373 |
-type d | LC_ALL=C sort)
|
|
|
919373 |
+ if test x"${DIRS}" = x""; then
|
|
|
919373 |
+ # when datadir is not a standard OS path, docbook.xsl cannot be found under it
|
|
|
919373 |
+ # fall back to the standard OS path (/usr/share)
|
|
|
919373 |
+ DIRS=$(find "/usr/share" -name $(basename $(dirname ${DOCBOOK_XSL_PATH})) \
|
|
|
919373 |
+ -type d | LC_ALL=C sort)
|
|
|
919373 |
+ fi
|
|
|
919373 |
XSLT=$(basename ${DOCBOOK_XSL_PATH})
|
|
|
919373 |
for d in ${DIRS}; do
|
|
|
919373 |
if test -f "${d}/${XSLT}"; then
|
|
|
919373 |
@@ -948,6 +989,7 @@
|
|
|
919373 |
)
|
|
|
919373 |
|
|
|
919373 |
dnl Files we output that need to be executable
|
|
|
919373 |
+AC_CONFIG_FILES([heartbeat/azure-events], [chmod +x heartbeat/azure-events])
|
|
|
919373 |
AC_CONFIG_FILES([heartbeat/AoEtarget], [chmod +x heartbeat/AoEtarget])
|
|
|
919373 |
AC_CONFIG_FILES([heartbeat/ManageRAID], [chmod +x heartbeat/ManageRAID])
|
|
|
919373 |
AC_CONFIG_FILES([heartbeat/ManageVE], [chmod +x heartbeat/ManageVE])
|
|
|
919373 |
@@ -1021,7 +1063,7 @@
|
|
|
919373 |
AC_MSG_RESULT([])
|
|
|
919373 |
AC_MSG_RESULT([$PACKAGE configuration:])
|
|
|
919373 |
AC_MSG_RESULT([ Version = ${VERSION}])
|
|
|
919373 |
-AC_MSG_RESULT([ Build Version = e711383fd5c7bef9c24ff6bc85465e59f91080f9])
|
|
|
919373 |
+AC_MSG_RESULT([ Build Version = $Format:%H$])
|
|
|
919373 |
AC_MSG_RESULT([ Features =${PKG_FEATURES}])
|
|
|
919373 |
AC_MSG_RESULT([])
|
|
|
919373 |
AC_MSG_RESULT([ Prefix = ${prefix}])
|
|
|
919373 |
diff -uNr a/doc/man/Makefile.am b/doc/man/Makefile.am
|
|
|
919373 |
--- a/doc/man/Makefile.am 2020-04-16 11:54:08.466619607 +0200
|
|
|
919373 |
+++ b/doc/man/Makefile.am 2020-04-16 12:08:34.913726440 +0200
|
|
|
919373 |
@@ -55,7 +55,7 @@
|
|
|
919373 |
# 12126 on savannah.gnu.org. But, maybe it gets fixed soon, it was
|
|
|
919373 |
# first reported in 1995 and added to Savannah in in 2005...
|
|
|
919373 |
if BUILD_DOC
|
|
|
919373 |
-man_MANS = ocf_heartbeat_AoEtarget.7 \
|
|
|
919373 |
+man_MANS = ocf_heartbeat_AoEtarget.7 \
|
|
|
919373 |
ocf_heartbeat_AudibleAlarm.7 \
|
|
|
919373 |
ocf_heartbeat_ClusterMon.7 \
|
|
|
919373 |
ocf_heartbeat_CTDB.7 \
|
|
|
919373 |
@@ -183,6 +183,22 @@
|
|
|
919373 |
man_MANS += ocf_heartbeat_IPv6addr.7
|
|
|
919373 |
endif
|
|
|
919373 |
|
|
|
919373 |
+if BUILD_AZURE_EVENTS
|
|
|
919373 |
+man_MANS += ocf_heartbeat_azure-events.7
|
|
|
919373 |
+endif
|
|
|
919373 |
+
|
|
|
919373 |
+if BUILD_GCP_PD_MOVE
|
|
|
919373 |
+man_MANS += ocf_heartbeat_gcp-pd-move.7
|
|
|
919373 |
+endif
|
|
|
919373 |
+
|
|
|
919373 |
+if BUILD_GCP_VPC_MOVE_ROUTE
|
|
|
919373 |
+man_MANS += ocf_heartbeat_gcp-vpc-move-route.7
|
|
|
919373 |
+endif
|
|
|
919373 |
+
|
|
|
919373 |
+if BUILD_GCP_VPC_MOVE_VIP
|
|
|
919373 |
+man_MANS += ocf_heartbeat_gcp-vpc-move-vip.7
|
|
|
919373 |
+endif
|
|
|
919373 |
+
|
|
|
919373 |
xmlfiles = $(man_MANS:.7=.xml)
|
|
|
919373 |
|
|
|
919373 |
%.1 %.5 %.7 %.8: %.xml
|
|
|
919373 |
diff -uNr a/heartbeat/azure-events.in b/heartbeat/azure-events.in
|
|
|
919373 |
--- a/heartbeat/azure-events.in 1970-01-01 01:00:00.000000000 +0100
|
|
|
919373 |
+++ b/heartbeat/azure-events.in 2020-04-16 12:02:15.114693551 +0200
|
|
|
919373 |
@@ -0,0 +1,824 @@
|
|
|
919373 |
+#!@PYTHON@ -tt
|
|
|
919373 |
+#
|
|
|
919373 |
+# Resource agent for monitoring Azure Scheduled Events
|
|
|
919373 |
+#
|
|
|
919373 |
+# License: GNU General Public License (GPL)
|
|
|
919373 |
+# (c) 2018 Tobias Niekamp, Microsoft Corp.
|
|
|
919373 |
+# and Linux-HA contributors
|
|
|
919373 |
+
|
|
|
919373 |
+import os
|
|
|
919373 |
+import sys
|
|
|
919373 |
+import time
|
|
|
919373 |
+import subprocess
|
|
|
919373 |
+import json
|
|
|
919373 |
+try:
|
|
|
919373 |
+ import urllib2
|
|
|
919373 |
+except ImportError:
|
|
|
919373 |
+ import urllib.request as urllib2
|
|
|
919373 |
+import socket
|
|
|
919373 |
+from collections import defaultdict
|
|
|
919373 |
+
|
|
|
919373 |
+OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT"))
|
|
|
919373 |
+sys.path.append(OCF_FUNCTIONS_DIR)
|
|
|
919373 |
+import ocf
|
|
|
919373 |
+
|
|
|
919373 |
+##############################################################################
|
|
|
919373 |
+
|
|
|
919373 |
+
|
|
|
919373 |
# Agent version; it is embedded in the User-Agent header sent with every
# metadata request (see azHelper._sendMetadataRequest).
VERSION = "0.10"
USER_AGENT = "Pacemaker-ResourceAgent/%s %s" % (VERSION, ocf.distro())

# Names of the cluster attributes this agent stores through crm_attribute.
# NOTE(review): globalPullState / lastDocVersion are presumably cluster-wide,
# curNodeState / pendingEventIDs are set per node -- confirm against the callers.
attr_globalPullState = "azure-events_globalPullState"
attr_lastDocVersion = "azure-events_lastDocVersion"
attr_curNodeState = "azure-events_curNodeState"
attr_pendingEventIDs = "azure-events_pendingEventIDs"

# Defaults for the log level and for the set of Azure event types acted upon.
default_loglevel = ocf.logging.INFO
default_relevantEventTypes = {"Reboot", "Redeploy"}

# NOTE(review): presumably the retry budget / delay (seconds) used when
# pulling events -- the consumer is outside this part of the file.
global_pullMaxAttempts = 3
global_pullDelaySecs = 1
|
|
|
919373 |
+
|
|
|
919373 |
+##############################################################################
|
|
|
919373 |
+
|
|
|
919373 |
class attrDict(defaultdict):
    """
    A wrapper for accessing dict keys like an attribute

    Missing keys are created on first access as empty attrDict instances
    (the class itself is the defaultdict factory), so chained lookups such
    as ``d.a.b`` do not raise.
    """
    def __init__(self, data=None):
        """
        data -- optional mapping whose items are copied into the new object.
                It must be optional: defaultdict invokes the factory without
                arguments when a missing key is looked up.
        """
        super(attrDict, self).__init__(attrDict)
        if data:
            for d in data.keys():
                self.__setattr__(d, data[d])

    def __getattr__(self, key):
        # Protocol probes (copy, pickle, hasattr(obj, "__foo__"), ...) must
        # see a plain AttributeError instead of silently creating a key.
        if key.startswith("__") and key.endswith("__"):
            raise AttributeError(key)
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

    def __setattr__(self, key, value):
        # attributes are stored as dict items, never in the instance __dict__
        self[key] = value
|
|
|
919373 |
+
|
|
|
919373 |
+##############################################################################
|
|
|
919373 |
+
|
|
|
919373 |
class azHelper:
    """
    Helper class for Azure's metadata API (including Scheduled Events)
    """
    metadata_host = "http://169.254.169.254/metadata"
    instance_api = "instance"
    events_api = "scheduledevents"
    api_version = "2017-08-01"

    @staticmethod
    def _sendMetadataRequest(endpoint, postData=None):
        """
        Send a request to the Azure Metadata Service API

        endpoint -- API name appended to metadata_host
        postData -- optional request body; when given the request is a POST

        Returns the decoded JSON document, or the raw (empty) response when
        the service sent no body. Errors raised by urllib/json propagate.
        """
        url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, azHelper.api_version)
        ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s" % (endpoint, postData))
        ocf.logger.debug("_sendMetadataRequest: url = %s" % url)

        # Python 3's urllib only accepts bytes as request body; on Python 2
        # "bytes" is "str", so plain strings pass through untouched there.
        if postData is not None and not isinstance(postData, bytes):
            postData = postData.encode("utf-8")

        req = urllib2.Request(url, postData)
        req.add_header("Metadata", "true")
        req.add_header("User-Agent", USER_AGENT)
        resp = urllib2.urlopen(req)
        try:
            data = resp.read()
        finally:
            # do not leak the connection, even if reading fails
            resp.close()
        ocf.logger.debug("_sendMetadataRequest: response = %s" % data)
        if data:
            # json.loads() only accepts bytes from Python 3.6 on
            if isinstance(data, bytes) and not isinstance(data, str):
                data = data.decode("utf-8")
            data = json.loads(data)

        ocf.logger.debug("_sendMetadataRequest: finished")
        return data

    @staticmethod
    def getInstanceInfo():
        """
        Fetch details about the current VM from the Azure Metadata Service API

        Returns the "compute" section of the instance document as attrDict.
        """
        ocf.logger.debug("getInstanceInfo: begin")

        jsondata = azHelper._sendMetadataRequest(azHelper.instance_api)
        ocf.logger.debug("getInstanceInfo: json = %s" % jsondata)

        ocf.logger.debug("getInstanceInfo: finished, returning {}".format(jsondata["compute"]))
        return attrDict(jsondata["compute"])

    @staticmethod
    def pullScheduledEvents():
        """
        Retrieve all currently scheduled events via Azure Metadata Service API

        Returns the scheduled-events document wrapped in an attrDict.
        """
        ocf.logger.debug("pullScheduledEvents: begin")

        jsondata = azHelper._sendMetadataRequest(azHelper.events_api)
        ocf.logger.debug("pullScheduledEvents: json = %s" % jsondata)

        ocf.logger.debug("pullScheduledEvents: finished")
        return attrDict(jsondata)

    @staticmethod
    def forceEvents(eventIDs):
        """
        Force a set of events to start immediately

        eventIDs -- iterable of event IDs; each one is posted as a
                    {"EventId": id} entry of "StartRequests"
        """
        ocf.logger.debug("forceEvents: begin")

        events = [{"EventId": e} for e in eventIDs]
        postData = {
            "StartRequests" : events
        }
        ocf.logger.info("forceEvents: postData = %s" % postData)
        # the response body is not used
        azHelper._sendMetadataRequest(azHelper.events_api, postData=json.dumps(postData))

        ocf.logger.debug("forceEvents: finished")
        return
|
|
|
919373 |
+
|
|
|
919373 |
+##############################################################################
|
|
|
919373 |
+
|
|
|
919373 |
class clusterHelper:
    """
    Helper functions for Pacemaker control via crm
    """
    @staticmethod
    def _toText(value):
        """
        Return value as native str: bytes (what subprocess yields on
        Python 3) are decoded, anything else is passed through unchanged
        """
        if isinstance(value, bytes) and not isinstance(value, str):
            return value.decode()
        return value

    @staticmethod
    def _getLocation(node):
        """
        Helper function to retrieve local/global attributes

        Returns the crm_attribute arguments selecting either the given node
        or, without a node, the cluster-wide crm_config section.
        """
        if node:
            return ["--node", node]
        else:
            return ["--type", "crm_config"]

    @staticmethod
    def _exec(command, *args):
        """
        Helper function to execute a UNIX command

        Arguments may be nested lists/tuples; they are flattened and
        stringified. Returns the right-stripped raw output of the command
        (bytes on Python 3), or None if running it failed for any reason.
        """
        args = list(args)
        ocf.logger.debug("_exec: begin; command = %s, args = %s" % (command, str(args)))

        def flatten(*n):
            return (str(e) for a in n
                for e in (flatten(*a) if isinstance(a, (tuple, list)) else (str(a),)))
        command = list(flatten([command] + args))
        ocf.logger.debug("_exec: cmd = %s" % " ".join(command))
        try:
            ret = subprocess.check_output(command)
            ocf.logger.debug("_exec: return = %s" % ret)
            return ret.rstrip()
        except Exception as err:
            # best effort: callers get None and decide how to proceed
            ocf.logger.exception(err)
            return None

    @staticmethod
    def setAttr(key, value, node=None):
        """
        Set the value of a specific global/local attribute in the Pacemaker cluster

        A false value (None, "", 0) deletes the attribute. Returns True when
        crm_attribute ran successfully without printing anything.
        """
        ocf.logger.debug("setAttr: begin; key = %s, value = %s, node = %s" % (key, value, node))

        if value:
            ret = clusterHelper._exec("crm_attribute",
                                      "--name", key,
                                      "--update", value,
                                      clusterHelper._getLocation(node))
        else:
            ret = clusterHelper._exec("crm_attribute",
                                      "--name", key,
                                      "--delete",
                                      clusterHelper._getLocation(node))

        ocf.logger.debug("setAttr: finished")
        # _exec() yields None on failure; report that as False, do not crash
        return ret is not None and len(ret) == 0

    @staticmethod
    def getAttr(key, node=None):
        """
        Retrieve a global/local attribute from the Pacemaker cluster

        Returns None when the attribute is empty or the query failed, an int
        when the value consists of digits only, otherwise the raw value as
        returned by _exec() (bytes on Python 3).
        """
        ocf.logger.debug("getAttr: begin; key = %s, node = %s" % (key, node))

        val = clusterHelper._exec("crm_attribute",
                                  "--name", key,
                                  "--query", "--quiet",
                                  "--default", "",
                                  clusterHelper._getLocation(node))
        ocf.logger.debug("getAttr: finished")
        if not val:
            return None
        return val if not val.isdigit() else int(val)

    @staticmethod
    def getAllNodes():
        """
        Get a list of hostnames for all nodes in the Pacemaker cluster

        Returns an empty list when crm_node could not be queried.
        """
        ocf.logger.debug("getAllNodes: begin")

        nodes = []
        nodeList = clusterHelper._toText(clusterHelper._exec("crm_node", "--list"))
        if nodeList:
            for n in nodeList.split("\n"):
                fields = n.split()
                # the node name is the second column; skip blank/short lines
                if len(fields) > 1:
                    nodes.append(fields[1])
        ocf.logger.debug("getAllNodes: finished; return %s" % str(nodes))

        return nodes

    @staticmethod
    def getHostNameFromAzName(azName):
        """
        Helper function to get the actual host name from an Azure node name
        """
        return clusterHelper.getAttr("hostName_%s" % azName)

    @staticmethod
    def removeHoldFromNodes():
        """
        Remove the ON_HOLD state from all nodes in the Pacemaker cluster

        Always returns False.
        """
        ocf.logger.debug("removeHoldFromNodes: begin")

        for n in clusterHelper.getAllNodes():
            # decode first: a bytes value never compares equal to a str
            state = clusterHelper._toText(clusterHelper.getAttr(attr_curNodeState, node=n))
            if state == "ON_HOLD":
                clusterHelper.setAttr(attr_curNodeState, "AVAILABLE", node=n)
                ocf.logger.info("removeHoldFromNodes: removed ON_HOLD from node %s" % n)

        ocf.logger.debug("removeHoldFromNodes: finished")
        return False

    @staticmethod
    def otherNodesAvailable(exceptNode):
        """
        Check if there are any nodes (except a given node) in the Pacemaker cluster that have state AVAILABLE

        exceptNode -- object with a hostName attribute naming the node to ignore
        """
        ocf.logger.debug("otherNodesAvailable: begin; exceptNode = %s" % exceptNode)

        for n in clusterHelper.getAllNodes():
            state = clusterHelper._toText(clusterHelper.getAttr(attr_curNodeState, node=n))
            # a node without stored state counts as AVAILABLE
            state = stringToNodeState(state) if state else AVAILABLE
            if state == AVAILABLE and n != exceptNode.hostName:
                ocf.logger.info("otherNodesAvailable: at least %s is available" % n)
                ocf.logger.debug("otherNodesAvailable: finished")
                return True
        ocf.logger.info("otherNodesAvailable: no other nodes are available")
        ocf.logger.debug("otherNodesAvailable: finished")

        return False

    @staticmethod
    def transitionSummary():
        """
        Get the current Pacemaker transition summary (used to check if all resources are stopped when putting a node standby)

        Returns the pending actions listed after "Transition Summary:" as one
        newline-joined string ("" when there are none), or False when the
        summary could not be obtained.
        """
        # <tniek> Is a global crm_simulate "too much"? Or would it be sufficient if there are no planned transitions for a particular node?
        # # crm_simulate -Ls
        #     Transition Summary:
        #      * Promote rsc_SAPHana_HN1_HDB03:0 (Slave -> Master hsr3-db1)
        #      * Stop    rsc_SAPHana_HN1_HDB03:1 (hsr3-db0)
        #      * Move    rsc_ip_HN1_HDB03 (Started hsr3-db0 -> hsr3-db1)
        #      * Start   rsc_nc_HN1_HDB03 (hsr3-db1)
        # # Expected result when there are no pending actions:
        #     Transition Summary:
        ocf.logger.debug("transitionSummary: begin")

        marker = "Transition Summary:"
        # decode once so that all string operations below work on text
        summary = clusterHelper._toText(clusterHelper._exec("crm_simulate", "-Ls"))
        if not summary:
            ocf.logger.warning("transitionSummary: could not load transition summary")
            return False
        if summary.find(marker) < 0:
            ocf.logger.warning("transitionSummary: received unexpected transition summary: %s" % summary)
            return False
        # The actions start on the line AFTER the marker, so the rest of the
        # marker line (always empty) must not be mistaken for the summary.
        actions = [line.strip() for line in summary.split(marker)[1].split("\n") if line.strip()]
        ret = "\n".join(actions)

        ocf.logger.debug("transitionSummary: finished; return = %s" % str(ret))
        return ret

    @staticmethod
    def listOperationsOnNode(node):
        """
        Get a list of all current operations for a given node (used to check if any resources are pending)

        Returns one string per output line of crm_resource; an empty list
        when there is no output or the command failed.
        """
        # hsr3-db1:/home/tniek # crm_resource --list-operations -N hsr3-db0
        # rsc_azure-events    (ocf::heartbeat:azure-events):  Started: rsc_azure-events_start_0 (node=hsr3-db0, call=91, rc=0, last-rc-change=Fri Jun  8 22:37:46 2018, exec=115ms): complete
        # rsc_azure-events    (ocf::heartbeat:azure-events):  Started: rsc_azure-events_monitor_10000 (node=hsr3-db0, call=93, rc=0, last-rc-change=Fri Jun  8 22:37:47 2018, exec=197ms): complete
        # rsc_SAPHana_HN1_HDB03   (ocf::suse:SAPHana):    Master: rsc_SAPHana_HN1_HDB03_start_0 (node=hsr3-db0, call=-1, rc=193, last-rc-change=Fri Jun  8 22:37:46 2018, exec=0ms): pending
        # rsc_SAPHanaTopology_HN1_HDB03   (ocf::suse:SAPHanaTopology):    Started: rsc_SAPHanaTopology_HN1_HDB03_start_0 (node=hsr3-db0, call=90, rc=0, last-rc-change=Fri Jun  8 22:37:46 2018, exec=3214ms): complete
        ocf.logger.debug("listOperationsOnNode: begin; node = %s" % node)

        resources = clusterHelper._toText(clusterHelper._exec("crm_resource", "--list-operations", "-N", node))
        if not resources:
            # covers both "no output" and a failed command (None)
            ret = []
        else:
            ret = resources.split("\n")

        ocf.logger.debug("listOperationsOnNode: finished; return = %s" % str(ret))
        return ret

    @staticmethod
    def noPendingResourcesOnNode(node):
        """
        Check that there are no pending resources on a given node
        """
        ocf.logger.debug("noPendingResourcesOnNode: begin; node = %s" % node)

        for r in clusterHelper.listOperationsOnNode(node):
            ocf.logger.debug("noPendingResourcesOnNode: * %s" % r)
            fields = r.split()
            # last column is the operation status, first one the resource name
            if fields and fields[-1] == "pending":
                ocf.logger.info("noPendingResourcesOnNode: found resource %s that is still pending" % fields[0])
                ocf.logger.debug("noPendingResourcesOnNode: finished; return = False")
                return False
        ocf.logger.info("noPendingResourcesOnNode: no pending resources on node %s" % node)
        ocf.logger.debug("noPendingResourcesOnNode: finished; return = True")

        return True

    @staticmethod
    def allResourcesStoppedOnNode(node):
        """
        Check that all resources on a given node are stopped

        True only if no operation on the node is pending AND the transition
        summary lists no further actions.
        """
        ocf.logger.debug("allResourcesStoppedOnNode: begin; node = %s" % node)

        if clusterHelper.noPendingResourcesOnNode(node):
            summary = clusterHelper.transitionSummary()
            if summary is False:
                # summary unavailable: stay on the safe side, report "not stopped"
                ocf.logger.info("allResourcesStoppedOnNode: transition summary is not available")
                ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False")
                return False
            if len(summary) == 0:
                ocf.logger.info("allResourcesStoppedOnNode: no pending resources on node %s and empty transition summary" % node)
                ocf.logger.debug("allResourcesStoppedOnNode: finished; return = True")
                return True
            ocf.logger.info("allResourcesStoppedOnNode: transition summary is not empty")
            ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False")
            return False

        ocf.logger.info("allResourcesStoppedOnNode: still pending resources on node %s" % node)
        ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False")
        return False
|
|
|
919373 |
+
|
|
|
919373 |
+##############################################################################
|
|
|
919373 |
+
|
|
|
919373 |
AVAILABLE = 0	# Node is online and ready to handle events
STOPPING = 1	# Standby has been triggered, but some resources are still running
IN_EVENT = 2	# All resources are stopped, and event has been initiated via Azure Metadata Service
ON_HOLD = 3	# Node has a pending event that cannot be started because there are no other nodes available

def stringToNodeState(name):
    """
    Convert a stored state name into its numeric state

    Integers are returned unchanged; bytes (as read from a subprocess on
    Python 3) are decoded first. Anything unknown maps to AVAILABLE.
    """
    if isinstance(name, int):
        return name
    if isinstance(name, bytes) and not isinstance(name, str):
        name = name.decode("utf-8", "replace")
    if name == "STOPPING":
        return STOPPING
    if name == "IN_EVENT":
        return IN_EVENT
    if name == "ON_HOLD":
        return ON_HOLD
    return AVAILABLE

def nodeStateToString(state):
    """
    Convert a numeric state into its name; unknown values map to "AVAILABLE"
    """
    if state == STOPPING:
        return "STOPPING"
    if state == IN_EVENT:
        return "IN_EVENT"
    if state == ON_HOLD:
        return "ON_HOLD"
    return "AVAILABLE"
|
|
|
919373 |
+
|
|
|
919373 |
+##############################################################################
|
|
|
919373 |
+
|
|
|
919373 |
+class Node:
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Core class implementing logic for a cluster node
|
|
|
919373 |
+ """
|
|
|
919373 |
+ def __init__(self, ra):
|
|
|
919373 |
+ self.raOwner = ra
|
|
|
919373 |
+ self.azInfo = azHelper.getInstanceInfo()
|
|
|
919373 |
+ self.azName = self.azInfo.name
|
|
|
919373 |
+ self.hostName = socket.gethostname()
|
|
|
919373 |
+ self.setAttr("azName", self.azName)
|
|
|
919373 |
+ clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName)
|
|
|
919373 |
+
|
|
|
919373 |
+ def getAttr(self, key):
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Get a local attribute
|
|
|
919373 |
+ """
|
|
|
919373 |
+ return clusterHelper.getAttr(key, node=self.hostName)
|
|
|
919373 |
+
|
|
|
919373 |
+ def setAttr(self, key, value):
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Set a local attribute
|
|
|
919373 |
+ """
|
|
|
919373 |
+ return clusterHelper.setAttr(key, value, node=self.hostName)
|
|
|
919373 |
+
|
|
|
919373 |
+ def selfOrOtherNode(self, node):
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Helper function to distinguish self/other node
|
|
|
919373 |
+ """
|
|
|
919373 |
+ return node if node else self.hostName
|
|
|
919373 |
+
|
|
|
919373 |
+ def setState(self, state, node=None):
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Set the state for a given node (or self)
|
|
|
919373 |
+ """
|
|
|
919373 |
+ node = self.selfOrOtherNode(node)
|
|
|
919373 |
+ ocf.logger.debug("setState: begin; node = %s, state = %s" % (node, nodeStateToString(state)))
|
|
|
919373 |
+
|
|
|
919373 |
+ clusterHelper.setAttr(attr_curNodeState, nodeStateToString(state), node=node)
|
|
|
919373 |
+
|
|
|
919373 |
+ ocf.logger.debug("setState: finished")
|
|
|
919373 |
+
|
|
|
919373 |
+ def getState(self, node=None):
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Get the state for a given node (or self)
|
|
|
919373 |
+ """
|
|
|
919373 |
+ node = self.selfOrOtherNode(node)
|
|
|
919373 |
+ ocf.logger.debug("getState: begin; node = %s" % node)
|
|
|
919373 |
+
|
|
|
919373 |
+ state = clusterHelper.getAttr(attr_curNodeState, node=node)
|
|
|
919373 |
+ ocf.logger.debug("getState: state = %s" % state)
|
|
|
919373 |
+ ocf.logger.debug("getState: finished")
|
|
|
919373 |
+ if not state:
|
|
|
919373 |
+ return AVAILABLE
|
|
|
919373 |
+ return stringToNodeState(state)
|
|
|
919373 |
+
|
|
|
919373 |
+ def setEventIDs(self, eventIDs, node=None):
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Set pending EventIDs for a given node (or self)
|
|
|
919373 |
+ """
|
|
|
919373 |
+ node = self.selfOrOtherNode(node)
|
|
|
919373 |
+ ocf.logger.debug("setEventIDs: begin; node = %s, eventIDs = %s" % (node, str(eventIDs)))
|
|
|
919373 |
+
|
|
|
919373 |
+ if eventIDs:
|
|
|
919373 |
+ eventIDStr = ",".join(eventIDs)
|
|
|
919373 |
+ else:
|
|
|
919373 |
+ eventIDStr = None
|
|
|
919373 |
+ clusterHelper.setAttr(attr_pendingEventIDs, eventIDStr, node=node)
|
|
|
919373 |
+
|
|
|
919373 |
+ ocf.logger.debug("setEventIDs: finished")
|
|
|
919373 |
+ return
|
|
|
919373 |
+
|
|
|
919373 |
+ def getEventIDs(self, node=None):
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Get pending EventIDs for a given node (or self)
|
|
|
919373 |
+ """
|
|
|
919373 |
+ node = self.selfOrOtherNode(node)
|
|
|
919373 |
+ ocf.logger.debug("getEventIDs: begin; node = %s" % node)
|
|
|
919373 |
+
|
|
|
919373 |
+ eventIDStr = clusterHelper.getAttr(attr_pendingEventIDs, node=node)
|
|
|
919373 |
+ if eventIDStr:
|
|
|
919373 |
+ eventIDs = eventIDStr.decode().split(",")
|
|
|
919373 |
+ else:
|
|
|
919373 |
+ eventIDs = None
|
|
|
919373 |
+
|
|
|
919373 |
+ ocf.logger.debug("getEventIDs: finished; eventIDs = %s" % str(eventIDs))
|
|
|
919373 |
+ return eventIDs
|
|
|
919373 |
+
|
|
|
919373 |
+ def updateNodeStateAndEvents(self, state, eventIDs, node=None):
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Set the state and pending EventIDs for a given node (or self)
|
|
|
919373 |
+ """
|
|
|
919373 |
+ ocf.logger.debug("updateNodeStateAndEvents: begin; node = %s, state = %s, eventIDs = %s" % (node, nodeStateToString(state), str(eventIDs)))
|
|
|
919373 |
+
|
|
|
919373 |
+ self.setState(state, node=node)
|
|
|
919373 |
+ self.setEventIDs(eventIDs, node=node)
|
|
|
919373 |
+
|
|
|
919373 |
+ ocf.logger.debug("updateNodeStateAndEvents: finished")
|
|
|
919373 |
+ return state
|
|
|
919373 |
+
|
|
|
919373 |
+ def putNodeStandby(self, node=None):
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Put self to standby
|
|
|
919373 |
+ """
|
|
|
919373 |
+ node = self.selfOrOtherNode(node)
|
|
|
919373 |
+ ocf.logger.debug("putNodeStandby: begin; node = %s" % node)
|
|
|
919373 |
+
|
|
|
919373 |
+ clusterHelper._exec("crm_attribute",
|
|
|
919373 |
+ "-t", "nodes",
|
|
|
919373 |
+ "-N", node,
|
|
|
919373 |
+ "-n", "standby",
|
|
|
919373 |
+ "-v", "on",
|
|
|
919373 |
+ "--lifetime=forever")
|
|
|
919373 |
+
|
|
|
919373 |
+ ocf.logger.debug("putNodeStandby: finished")
|
|
|
919373 |
+
|
|
|
919373 |
+ def putNodeOnline(self, node=None):
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Put self back online
|
|
|
919373 |
+ """
|
|
|
919373 |
+ node = self.selfOrOtherNode(node)
|
|
|
919373 |
+ ocf.logger.debug("putNodeOnline: begin; node = %s" % node)
|
|
|
919373 |
+
|
|
|
919373 |
+ clusterHelper._exec("crm_attribute",
|
|
|
919373 |
+ "-t", "nodes",
|
|
|
919373 |
+ "-N", node,
|
|
|
919373 |
+ "-n", "standby",
|
|
|
919373 |
+ "-v", "off",
|
|
|
919373 |
+ "--lifetime=forever")
|
|
|
919373 |
+
|
|
|
919373 |
+ ocf.logger.debug("putNodeOnline: finished")
|
|
|
919373 |
+
|
|
|
919373 |
+ def separateEvents(self, events):
|
|
|
919373 |
+ """
|
|
|
919373 |
+ Split own/other nodes' events
|
|
|
919373 |
+ """
|
|
|
919373 |
+ ocf.logger.debug("separateEvents: begin; events = %s" % str(events))
|
|
|
919373 |
+
|
|
|
919373 |
+ localEvents = []
|
|
|
919373 |
+ remoteEvents = []
|
|
|
919373 |
+ for e in events:
|
|
|
919373 |
+ e = attrDict(e)
|
|
|
919373 |
+ if e.EventType not in self.raOwner.relevantEventTypes:
|
|
|
919373 |
+ continue
|
|
|
919373 |
+ if self.azName in e.Resources:
|
|
|
919373 |
+ localEvents.append(e)
|
|
|
919373 |
+ else:
|
|
|
919373 |
+ remoteEvents.append(e)
|
|
|
919373 |
+ ocf.logger.debug("separateEvents: finished; localEvents = %s, remoteEvents = %s" % (str(localEvents), str(remoteEvents)))
|
|
|
919373 |
+ return (localEvents, remoteEvents)
|
|
|
919373 |
+
|
|
|
919373 |
+    def removeOrphanedEvents(self, azEvents):
+        """
+        Put other nodes back online once none of the events recorded for
+        them is reported by Azure any more.
+
+        azEvents -- events currently reported by Azure; only their EventId
+                    is read here.
+        """
+        ocf.logger.debug("removeOrphanedEvents: begin; azEvents = %s" % str(azEvents))
+
+        azEventIDs = set(event.EventId for event in azEvents)
+
+        for nodeName in clusterHelper.getAllNodes():
+            # the local node is never touched here
+            if nodeName == self.hostName:
+                continue
+            nodeState = self.getState(node=nodeName)
+            # only nodes still marked as stopping or inside an event are candidates
+            if nodeState not in (STOPPING, IN_EVENT):
+                continue
+
+            ocf.logger.info("removeOrphanedEvents: node %s has state %s" % (nodeName, nodeState))
+            clusterEventIDs = self.getEventIDs(node=nodeName)
+            for eventId in clusterEventIDs:
+                if eventId in azEventIDs:
+                    # a single event still known to Azure keeps the node as it is
+                    ocf.logger.info("removeOrphanedEvents: (at least) event %s on node %s has not yet finished" % (str(eventId), nodeName))
+                    break
+            else:
+                # loop ended without a match: nothing recorded is still running
+                ocf.logger.info("removeOrphanedEvents: clusterEvents %s on node %s are not in azEvents %s -> bring node back online" % (str(clusterEventIDs), nodeName, str(azEventIDs)))
+                self.putNodeOnline(node=nodeName)
+
+        ocf.logger.debug("removeOrphanedEvents: finished")
|
|
|
919373 |
+
|
|
|
919373 |
+    def handleRemoteEvents(self, azEvents):
+        """
+        Handle a list of events (as provided by Azure Metadata Service) for other nodes
+
+        azEvents -- events that do not name the local node.
+
+        A "Scheduled" event is put on the force list unless one of the nodes
+        it names is STOPPING with resources still running, or is AVAILABLE,
+        IN_EVENT or ON_HOLD.  Nodes of forced events are switched to IN_EVENT
+        and the event IDs are passed to azHelper.forceEvents().
+        """
+        ocf.logger.debug("handleRemoteEvents: begin; hostName = %s, events = %s" % (self.hostName, str(azEvents)))
+
+        if len(azEvents) == 0:
+            ocf.logger.debug("handleRemoteEvents: no remote events to handle")
+            ocf.logger.debug("handleRemoteEvents: finished")
+            return
+
+        # host name -> list of event IDs that are ready to be forced
+        eventIDsForNode = {}
+
+        for event in azEvents:
+            ocf.logger.info("handleRemoteEvents: handling remote event %s (%s; nodes = %s)" % (event.EventId, event.EventType, str(event.Resources)))
+            if event.EventStatus == "Started":
+                ocf.logger.info("handleRemoteEvents: remote event already started")
+                continue
+            if event.EventStatus != "Scheduled":
+                continue
+
+            # a scheduled event may only be forced once no involved node blocks it
+            blocked = False
+            for azName in event.Resources:
+                hostName = clusterHelper.getHostNameFromAzName(azName)
+                state = self.getState(node=hostName)
+                if state == STOPPING:
+                    # STOPPING is acceptable only when nothing runs on the node any more
+                    if not clusterHelper.allResourcesStoppedOnNode(hostName):
+                        ocf.logger.info("handleRemoteEvents: (at least) node %s has still resources running -> wait" % hostName)
+                        blocked = True
+                        break
+                elif state in (AVAILABLE, IN_EVENT, ON_HOLD):
+                    ocf.logger.info("handleRemoteEvents: node %s is still %s -> remote event needs to be picked up locally" % (hostName, nodeStateToString(state)))
+                    blocked = True
+                    break
+            if blocked:
+                continue
+
+            ocf.logger.info("handleRemoteEvents: nodes %s are stopped -> add remote event %s to force list" % (str(event.Resources), event.EventId))
+            for azName in event.Resources:
+                hostName = clusterHelper.getHostNameFromAzName(azName)
+                eventIDsForNode.setdefault(hostName, []).append(event.EventId)
+
+        # force the start of all events whose nodes are ready
+        if eventIDsForNode:
+            eventIDsToForce = set(eventId for eventIds in eventIDsForNode.values() for eventId in eventIds)
+            ocf.logger.info("handleRemoteEvents: set nodes %s to IN_EVENT; force remote events %s" % (str(eventIDsForNode.keys()), str(eventIDsToForce)))
+            for node, eventIds in eventIDsForNode.items():
+                self.updateNodeStateAndEvents(IN_EVENT, eventIds, node=node)
+            azHelper.forceEvents(eventIDsToForce)
+
+        ocf.logger.debug("handleRemoteEvents: finished")
|
|
|
919373 |
+
|
|
|
919373 |
+    def handleLocalEvents(self, azEvents):
+        """
+        Handle a list of own events (as provided by Azure Metadata Service)
+
+        azEvents -- events naming the local node; only their EventId is read.
+
+        Returns mayUpdateDocVersion, which is True only when this call put
+        the node into standby and False in every other case.
+
+        NOTE(review): this copy of the patch has lost its indentation.  The
+        final "curState == STOPPING" check is assumed to belong to the
+        "no pending events" branch -- confirm against the pristine patch.
+        """
+        ocf.logger.debug("handleLocalEvents: begin; hostName = %s, azEvents = %s" % (self.hostName, str(azEvents)))
+
+        azEventIDs = set(event.EventId for event in azEvents)
+
+        curState = self.getState()
+        clusterEventIDs = self.getEventIDs()
+        mayUpdateDocVersion = False
+        ocf.logger.info("handleLocalEvents: current state = %s; pending local clusterEvents = %s" % (nodeStateToString(curState), str(clusterEventIDs)))
+
+        # check if there are currently/still events set for the node
+        if clusterEventIDs:
+            # there are pending events set, so our state must be STOPPING or IN_EVENT;
+            # clean up pending events that are already finished according to AZ
+            stillPending = []
+            for eventId in clusterEventIDs:
+                if eventId in azEventIDs:
+                    stillPending.append(eventId)
+                else:
+                    ocf.logger.info("handleLocalEvents: remove finished local clusterEvent %s" % (eventId))
+            touchedEventIDs = len(stillPending) != len(clusterEventIDs)
+            clusterEventIDs = stillPending
+
+            if len(clusterEventIDs) > 0:
+                # there are still pending events (either because we're still stopping,
+                # or because the event is still in place); either way, we need to wait
+                if touchedEventIDs:
+                    # events were removed, not added: store and report what is left
+                    ocf.logger.info("handleLocalEvents: updated pending local clusterEvents to %s" % str(clusterEventIDs))
+                    self.setEventIDs(clusterEventIDs)
+                else:
+                    ocf.logger.info("handleLocalEvents: no local clusterEvents were updated")
+            else:
+                # there are no more pending events left after cleanup
+                if clusterHelper.noPendingResourcesOnNode(self.hostName):
+                    # and no pending resources on the node -> set it back online
+                    ocf.logger.info("handleLocalEvents: all local events finished -> clean up, put node online and AVAILABLE")
+                    curState = self.updateNodeStateAndEvents(AVAILABLE, None)
+                    self.putNodeOnline()
+                    clusterHelper.removeHoldFromNodes()
+                else:
+                    ocf.logger.info("handleLocalEvents: all local events finished, but some resources have not completed startup yet -> wait")
+        else:
+            # there are no pending events set for us (yet)
+            if curState == AVAILABLE:
+                if len(azEventIDs) > 0:
+                    if clusterHelper.otherNodesAvailable(self):
+                        ocf.logger.info("handleLocalEvents: can handle local events %s -> set state STOPPING" % (str(azEventIDs)))
+                        # curState becomes whatever updateNodeStateAndEvents() returns,
+                        # so the STOPPING check below can act within this same call
+                        curState = self.updateNodeStateAndEvents(STOPPING, azEventIDs)
+                    else:
+                        ocf.logger.info("handleLocalEvents: cannot handle azEvents %s (only node available) -> set state ON_HOLD" % str(azEventIDs))
+                        self.setState(ON_HOLD)
+                else:
+                    ocf.logger.debug("handleLocalEvents: no local azEvents to handle")
+            if curState == STOPPING:
+                if clusterHelper.noPendingResourcesOnNode(self.hostName):
+                    ocf.logger.info("handleLocalEvents: all local resources are started properly -> put node standby")
+                    self.putNodeStandby()
+                    mayUpdateDocVersion = True
+                else:
+                    ocf.logger.info("handleLocalEvents: some local resources are not clean yet -> wait")
+
+        ocf.logger.debug("handleLocalEvents: finished; mayUpdateDocVersion = %s" % str(mayUpdateDocVersion))
+        return mayUpdateDocVersion
|
|
|
919373 |
+
|
|
|
919373 |
+##############################################################################
|
|
|
919373 |
+
|
|
|
919373 |
+class raAzEvents:
+    """
+    Main class for resource agent
+
+    Holds the Node helper for the local node and the set of event types
+    the agent reacts to.
+    """
+    def __init__(self, relevantEventTypes):
+        """
+        relevantEventTypes -- collection of event type names; read by
+                              Node.separateEvents() through raOwner.
+        """
+        self.node = Node(self)
+        self.relevantEventTypes = relevantEventTypes
+
+    def monitor(self):
+        """
+        Pull the scheduled events once and process local and remote events.
+
+        While the cluster attribute attr_globalPullState reads "PULLING" the
+        call sleeps global_pullDelaySecs and retries, giving up after
+        global_pullMaxAttempts attempts.
+
+        Always returns ocf.OCF_SUCCESS; an exception raised by the pull
+        itself propagates after the pull state has been reset to "IDLE".
+        """
+        ocf.logger.debug("monitor: begin")
+
+        pullFailedAttempts = 0
+        while True:
+            # check if another node is pulling at the same time;
+            # this should only be a concern for the first pull, as setting up Scheduled Events may take up to 2 minutes.
+            if clusterHelper.getAttr(attr_globalPullState) == "PULLING":
+                pullFailedAttempts += 1
+                # ">=" also terminates when the limit is configured as 0 or less
+                if pullFailedAttempts >= global_pullMaxAttempts:
+                    ocf.logger.warning("monitor: exceeded maximum number of attempts (%d) to pull events" % global_pullMaxAttempts)
+                    ocf.logger.debug("monitor: finished")
+                    return ocf.OCF_SUCCESS
+                ocf.logger.info("monitor: another node is pulling; retry in %d seconds" % global_pullDelaySecs)
+                time.sleep(global_pullDelaySecs)
+                continue
+
+            # we can pull safely from Azure Metadata Service
+            clusterHelper.setAttr(attr_globalPullState, "PULLING")
+            try:
+                events = azHelper.pullScheduledEvents()
+            finally:
+                # reset the marker even if the pull raises; a value left at
+                # "PULLING" is what the check at the top of this loop waits on
+                clusterHelper.setAttr(attr_globalPullState, "IDLE")
+
+            # get current document version
+            curDocVersion = events.DocumentIncarnation
+            lastDocVersion = self.node.getAttr(attr_lastDocVersion)
+            ocf.logger.debug("monitor: lastDocVersion = %s; curDocVersion = %s" % (lastDocVersion, curDocVersion))
+
+            # split events local/remote
+            (localEvents, remoteEvents) = self.node.separateEvents(events.Events)
+
+            # ensure local events are only executing once
+            if curDocVersion != lastDocVersion:
+                ocf.logger.debug("monitor: curDocVersion has not been handled yet")
+                # handleLocalEvents() returns its mayUpdateDocVersion flag;
+                # the version is recorded only when that flag is True
+                if self.node.handleLocalEvents(localEvents):
+                    ocf.logger.info("monitor: handleLocalEvents completed successfully -> update curDocVersion")
+                    self.node.setAttr(attr_lastDocVersion, curDocVersion)
+                else:
+                    ocf.logger.debug("monitor: handleLocalEvents still waiting -> keep curDocVersion")
+            else:
+                ocf.logger.info("monitor: already handled curDocVersion, skip")
+
+            # remove orphaned remote events and then handle the remaining remote events
+            self.node.removeOrphanedEvents(remoteEvents)
+            self.node.handleRemoteEvents(remoteEvents)
+            break
+
+        ocf.logger.debug("monitor: finished")
+        return ocf.OCF_SUCCESS
|
|
|
919373 |
+
|
|
|
919373 |
+##############################################################################
|
|
|
919373 |
+
|
|
|
919373 |
+def setLoglevel(verbose):
+    """
+    Set the level of ocf.log.
+
+    verbose -- truthy: install a urllib2 opener whose HTTPHandler has
+               debuglevel=1 and log at ocf.logging.DEBUG;
+               falsy: log at default_loglevel.
+    """
+    if not verbose:
+        ocf.log.setLevel(default_loglevel)
+        return
+
+    # verbose mode also turns on the debug output of urllib2's HTTP handler
+    httpDebugHandler = urllib2.HTTPHandler(debuglevel=1)
+    urllib2.install_opener(urllib2.build_opener(httpDebugHandler))
+    ocf.log.setLevel(ocf.logging.DEBUG)
|
|
|
919373 |
+
|
|
|
919373 |
+# (shortdesc, longdesc) pair; main() passes the two items to ocf.Agent.
+# The line continuation in the "Deployment" example is written as "\\" so
+# that the backslash and the line break both survive in the text; a bare
+# backslash at the end of a line inside the literal would be swallowed
+# together with the newline.
+description = (
+    "Microsoft Azure Scheduled Events monitoring agent",
+    """This resource agent implements a monitor for scheduled
+(maintenance) events for a Microsoft Azure VM.
+
+If any relevant events are found, it moves all Pacemaker resources
+away from the affected node to allow for a graceful shutdown.
+
+    Usage:
+        [OCF_RESKEY_eventTypes=VAL] [OCF_RESKEY_verbose=VAL] azure-events ACTION
+
+        action (required): Supported values: monitor, help, meta-data
+        eventTypes (optional): List of event types to be considered
+                relevant by the resource agent (comma-separated).
+                Supported values: Freeze,Reboot,Redeploy
+                Default = Reboot,Redeploy
+        verbose (optional): If set to true, displays debug info.
+                Default = false
+
+    Deployment:
+        crm configure primitive rsc_azure-events ocf:heartbeat:azure-events \\
+            op monitor interval=10s
+        crm configure clone cln_azure-events rsc_azure-events
+
+For further information on Microsoft Azure Scheduled Events, please
+refer to the following documentation:
+https://docs.microsoft.com/en-us/azure/virtual-machines/linux/scheduled-events
+""")
|
|
|
919373 |
+
|
|
|
919373 |
+def monitor_action(eventTypes):
+    """
+    Handler of the "monitor" action.
+
+    eventTypes -- comma-separated event type names; a falsy value yields
+                  an empty set.
+
+    Returns the result of raAzEvents.monitor().
+    """
+    if eventTypes:
+        relevantEventTypes = set(eventTypes.split(","))
+    else:
+        relevantEventTypes = set()
+    return raAzEvents(relevantEventTypes).monitor()
|
|
|
919373 |
+
|
|
|
919373 |
+def validate_action(eventTypes):
+    """
+    Handler of the "validate-all" action.
+
+    eventTypes -- comma-separated event type names; a falsy value is
+                  accepted as is.
+
+    Returns ocf.OCF_SUCCESS when every entry is one of Freeze, Reboot or
+    Redeploy (compared verbatim).  On the first other entry an exit reason
+    is recorded and ocf.OCF_ERR_CONFIGURED is returned.
+    """
+    supported = ("Freeze", "Reboot", "Redeploy")
+    requested = eventTypes.split(",") if eventTypes else []
+    for eventType in requested:
+        if eventType in supported:
+            continue
+        ocf.ocf_exit_reason("Event type not one of Freeze, Reboot, Redeploy: " + eventTypes)
+        return ocf.OCF_ERR_CONFIGURED
+    return ocf.OCF_SUCCESS
|
|
|
919373 |
+
|
|
|
919373 |
+def main():
+    """
+    Declare the agent's parameters and actions, set the log level from the
+    "verbose" parameter and hand control to ocf.Agent.run().
+    """
+    agent = ocf.Agent("azure-events", shortdesc=description[0], longdesc=description[1])
+    agent.add_parameter(
+        "eventTypes",
+        # the parameter lists event types, not resources
+        shortdesc="List of event types to be considered",
+        longdesc="A comma-separated list of event types that will be handled by this resource agent. (Possible values: Freeze,Reboot,Redeploy)",
+        content_type="string",
+        default="Reboot,Redeploy")
+    agent.add_parameter(
+        "verbose",
+        shortdesc="Enable verbose agent logging",
+        longdesc="Set to true to enable verbose logging",
+        content_type="boolean",
+        default="false")
+    # start and stop do no work of their own; they only report success
+    agent.add_action("start", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
+    agent.add_action("stop", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
+    agent.add_action("validate-all", timeout=20, handler=validate_action)
+    agent.add_action("monitor", timeout=240, interval=10, handler=monitor_action)
+    setLoglevel(ocf.is_true(ocf.get_parameter("verbose", "false")))
+    agent.run()
+
+if __name__ == '__main__':
+    main()
|
|
|
919373 |
diff -uNr a/heartbeat/Makefile.am b/heartbeat/Makefile.am
|
|
|
919373 |
--- a/heartbeat/Makefile.am 2020-04-16 11:54:08.467619588 +0200
|
|
|
919373 |
+++ b/heartbeat/Makefile.am 2020-04-16 12:08:07.788224036 +0200
|
|
|
919373 |
@@ -55,7 +55,7 @@
|
|
|
919373 |
osp_SCRIPTS = nova-compute-wait \
|
|
|
919373 |
NovaEvacuate
|
|
|
919373 |
|
|
|
919373 |
-ocf_SCRIPTS = AoEtarget \
|
|
|
919373 |
+ocf_SCRIPTS = AoEtarget \
|
|
|
919373 |
AudibleAlarm \
|
|
|
919373 |
ClusterMon \
|
|
|
919373 |
CTDB \
|
|
|
919373 |
@@ -116,10 +116,7 @@
|
|
|
919373 |
fio \
|
|
|
919373 |
galera \
|
|
|
919373 |
garbd \
|
|
|
919373 |
- gcp-pd-move \
|
|
|
919373 |
gcp-vpc-move-ip \
|
|
|
919373 |
- gcp-vpc-move-vip \
|
|
|
919373 |
- gcp-vpc-move-route \
|
|
|
919373 |
iSCSILogicalUnit \
|
|
|
919373 |
iSCSITarget \
|
|
|
919373 |
ids \
|
|
|
919373 |
@@ -177,6 +174,22 @@
|
|
|
919373 |
vsftpd \
|
|
|
919373 |
zabbixserver
|
|
|
919373 |
|
|
|
919373 |
+if BUILD_AZURE_EVENTS
|
|
|
919373 |
+ocf_SCRIPTS += azure-events
|
|
|
919373 |
+endif
|
|
|
919373 |
+
|
|
|
919373 |
+if BUILD_GCP_PD_MOVE
|
|
|
919373 |
+ocf_SCRIPTS += gcp-pd-move
|
|
|
919373 |
+endif
|
|
|
919373 |
+
|
|
|
919373 |
+if BUILD_GCP_VPC_MOVE_ROUTE
|
|
|
919373 |
+ocf_SCRIPTS += gcp-vpc-move-route
|
|
|
919373 |
+endif
|
|
|
919373 |
+
|
|
|
919373 |
+if BUILD_GCP_VPC_MOVE_VIP
|
|
|
919373 |
+ocf_SCRIPTS += gcp-vpc-move-vip
|
|
|
919373 |
+endif
|
|
|
919373 |
+
|
|
|
919373 |
ocfcommondir = $(OCF_LIB_DIR_PREFIX)/heartbeat
|
|
|
919373 |
ocfcommon_DATA = ocf-shellfuncs \
|
|
|
919373 |
ocf-binaries \
|
|
|
919373 |
@@ -205,3 +218,13 @@
|
|
|
919373 |
|
|
|
919373 |
%.check: %
|
|
|
919373 |
OCF_ROOT=$(abs_srcdir) OCF_FUNCTIONS_DIR=$(abs_srcdir) ./$< meta-data | xmllint --path $(abs_srcdir) --noout --relaxng $(abs_srcdir)/metadata.rng -
|
|
|
919373 |
+
|
|
|
919373 |
+# For the agent named by $(agent): print its name in brackets, then every
+# word of its meta-data text that aspell reports as misspelled.
+do_spellcheck = printf '[%s]\n' "$(agent)"; \
+	OCF_ROOT=$(abs_srcdir) OCF_FUNCTIONS_DIR=$(abs_srcdir) \
+	./$(agent) meta-data 2>/dev/null \
+	| xsltproc $(top_srcdir)/make/extract_text.xsl - \
+	| aspell pipe list -d en_US --ignore-case \
+	--home-dir=$(top_srcdir)/make -p spellcheck-ignore \
+	| sed -n 's|^&\([^:]*\):.*|\1|p';
+
+# Not a file: keep the target runnable even if a file called "spellcheck" exists.
+.PHONY: spellcheck
+spellcheck:
+	@$(foreach agent,$(ocf_SCRIPTS), $(do_spellcheck))
|
|
|
919373 |
diff -uNr a/m4/ac_python_module.m4 b/m4/ac_python_module.m4
|
|
|
919373 |
--- a/m4/ac_python_module.m4 1970-01-01 01:00:00.000000000 +0100
|
|
|
919373 |
+++ b/m4/ac_python_module.m4 2020-04-14 11:11:26.325806378 +0200
|
|
|
919373 |
@@ -0,0 +1,30 @@
|
|
|
919373 |
+dnl @synopsis AC_PYTHON_MODULE(modname[, fatal])
|
|
|
919373 |
+dnl
|
|
|
919373 |
+dnl Checks for Python module.
|
|
|
919373 |
+dnl
|
|
|
919373 |
+dnl If fatal is non-empty then absence of a module will trigger an
|
|
|
919373 |
+dnl error.
|
|
|
919373 |
+dnl
|
|
|
919373 |
+dnl @category InstalledPackages
|
|
|
919373 |
+dnl @author Andrew Collier <colliera@nu.ac.za>.
|
|
|
919373 |
+dnl @version 2004-07-14
|
|
|
919373 |
+dnl @license AllPermissive
|
|
|
919373 |
+
|
|
|
919373 |
+AC_DEFUN([AC_PYTHON_MODULE],[
+    dnl $1 = module name, $2 = non-empty to make a missing module fatal.
+    dnl Result is stored in the shell variable HAVE_PYMOD_<MODULE> (yes/no).
+    AC_MSG_CHECKING([python module: $1])
+    if $PYTHON -c "import $1" 2>/dev/null
+    then
+        AC_MSG_RESULT([yes])
+        eval AS_TR_CPP([HAVE_PYMOD_$1])=yes
+    else
+        AC_MSG_RESULT([no])
+        eval AS_TR_CPP([HAVE_PYMOD_$1])=no
+        if test -n "$2"
+        then
+            dnl AC_MSG_ERROR ends configure by itself, no explicit exit needed
+            AC_MSG_ERROR([failed to find required module $1])
+        fi
+    fi
+])
|