diff -uNr a/configure.ac b/configure.ac
--- a/configure.ac	2020-04-08 13:20:19.895631628 +0200
+++ b/configure.ac	2020-04-14 11:23:31.169755539 +0200
@@ -30,6 +30,8 @@
 PKG_FEATURES=""
 
 AC_CONFIG_AUX_DIR(.)
+AC_CONFIG_MACRO_DIR([m4])
+
 AC_CANONICAL_HOST
 
 dnl Where #defines go (e.g. `AC_CHECK_HEADERS' below)
@@ -72,6 +74,11 @@
                 [AC_MSG_ERROR([systemd support requested but pkg-config unable to query systemd package])])
           with_systemdsystemunitdir=no],
          [with_systemdsystemunitdir="$def_systemdsystemunitdir"])])
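+dnl When configuring with a prefix other than /usr, relocate the systemd
+dnl unit directory below $prefix (the tmpfiles dir below gets the same treatment)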
+if test "x$with_systemdsystemunitdir" != "xno" && \
+   test "x${prefix}" != "xNONE" && \
+   test "x${prefix}" != "x/usr"; then
+       with_systemdsystemunitdir="${prefix}/$with_systemdsystemunitdir"
+fi
 AS_IF([test "x$with_systemdsystemunitdir" != "xno"],
       [AC_SUBST([systemdsystemunitdir], [$with_systemdsystemunitdir])])
 AM_CONDITIONAL([HAVE_SYSTEMD], [test "x$with_systemdsystemunitdir" != "xno"])
@@ -79,6 +86,11 @@
 AC_ARG_WITH([systemdtmpfilesdir],
      AS_HELP_STRING([--with-systemdtmpfilesdir=DIR], [Directory for systemd tmp files]),
      [], [with_systemdtmpfilesdir=$($PKGCONFIG --variable=tmpfilesdir systemd)])
+     if test "x$with_systemdtmpfilesdir" != xno && \
+        test "x${prefix}" != "xNONE" && \
+        test "x${prefix}" != "x/usr"; then
+            with_systemdtmpfilesdir="${prefix}/$with_systemdtmpfilesdir"
+     fi
      if test "x$with_systemdtmpfilesdir" != xno; then
          AC_SUBST([systemdtmpfilesdir], [$with_systemdtmpfilesdir])
      fi
@@ -431,7 +443,7 @@
 
 # Expand $prefix
 eval HA_RSCTMPDIR="`eval echo ${HA_RSCTMPDIR}`"
-AC_DEFINE_UNQUOTED(HA_RSCTMPDIR,"$HA_RSCTMPDIR", Where Resouce agents keep state files)
+AC_DEFINE_UNQUOTED(HA_RSCTMPDIR,"$HA_RSCTMPDIR", Where Resource agents keep state files)
 AC_SUBST(HA_RSCTMPDIR)
 
 dnl Eventually move out of the heartbeat dir tree and create symlinks when needed
@@ -501,12 +513,35 @@
 AC_SUBST(RM)
 AC_SUBST(TEST)
 
+dnl Ensure PYTHON is an absolute path
+AC_PATH_PROG([PYTHON], [$PYTHON])
+
 AM_PATH_PYTHON
 if test -z "$PYTHON"; then
     echo "*** Essential program python not found" 1>&2
-    exit 1
 fi
 
+AC_PYTHON_MODULE(googleapiclient)
+AC_PYTHON_MODULE(pyroute2)
+
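+dnl The ocf.py library used by azure-events needs Python >= 2.7, so only
+dnl build that agent when a suitable interpreter was found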
+AS_VERSION_COMPARE([$PYTHON_VERSION], [2.7], [BUILD_OCF_PY=0], [BUILD_OCF_PY=1], [BUILD_OCF_PY=1])
+
+BUILD_AZURE_EVENTS=1
+if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then
+    BUILD_AZURE_EVENTS=0
+    AC_MSG_WARN("Not building azure-events")
+fi
+AM_CONDITIONAL(BUILD_AZURE_EVENTS, test $BUILD_AZURE_EVENTS -eq 1)
+
+BUILD_GCP_PD_MOVE=1
+AM_CONDITIONAL(BUILD_GCP_PD_MOVE, test $BUILD_GCP_PD_MOVE -eq 1)
+
+BUILD_GCP_VPC_MOVE_ROUTE=1
+AM_CONDITIONAL(BUILD_GCP_VPC_MOVE_ROUTE, test $BUILD_GCP_VPC_MOVE_ROUTE -eq 1)
+
+BUILD_GCP_VPC_MOVE_VIP=1
+AM_CONDITIONAL(BUILD_GCP_VPC_MOVE_VIP, test $BUILD_GCP_VPC_MOVE_VIP -eq 1)
+
 AC_PATH_PROGS(ROUTE, route)
 AC_DEFINE_UNQUOTED(ROUTE, "$ROUTE", path to route command)
 
@@ -541,6 +576,12 @@
     if test x"${STYLESHEET_PREFIX}" = x""; then
         DIRS=$(find "${datadir}" -name $(basename $(dirname ${DOCBOOK_XSL_PATH})) \
                -type d | LC_ALL=C sort)
+        if test x"${DIRS}" = x""; then
+            # when datadir is not the standard OS path, docbook.xsl cannot
+            # be found there; use the standard OS path as a fallback
+            DIRS=$(find "/usr/share" -name $(basename $(dirname ${DOCBOOK_XSL_PATH})) \
+                   -type d | LC_ALL=C sort)
+        fi
         XSLT=$(basename ${DOCBOOK_XSL_PATH})
         for d in ${DIRS}; do
             if test -f "${d}/${XSLT}"; then
@@ -911,6 +952,7 @@
    heartbeat/ocf-shellfuncs					\
    heartbeat/shellfuncs						\
 systemd/Makefile						\
+   systemd/resource-agents.conf					\
 tools/Makefile							\
    tools/ocf-tester						\
    tools/ocft/Makefile						\
@@ -947,6 +989,7 @@
 		)
 
 dnl Files we output that need to be executable
+AC_CONFIG_FILES([heartbeat/azure-events], [chmod +x heartbeat/azure-events])
 AC_CONFIG_FILES([heartbeat/AoEtarget], [chmod +x heartbeat/AoEtarget])
 AC_CONFIG_FILES([heartbeat/ManageRAID], [chmod +x heartbeat/ManageRAID])
 AC_CONFIG_FILES([heartbeat/ManageVE], [chmod +x heartbeat/ManageVE])
@@ -1020,7 +1063,7 @@
 AC_MSG_RESULT([])
 AC_MSG_RESULT([$PACKAGE configuration:])
 AC_MSG_RESULT([  Version                  = ${VERSION}])
-AC_MSG_RESULT([  Build Version            = e711383fd5c7bef9c24ff6bc85465e59f91080f9])
+AC_MSG_RESULT([  Build Version            = $Format:%H$])
 AC_MSG_RESULT([  Features                 =${PKG_FEATURES}])
 AC_MSG_RESULT([])
 AC_MSG_RESULT([  Prefix                   = ${prefix}])
diff -uNr a/doc/man/Makefile.am b/doc/man/Makefile.am
--- a/doc/man/Makefile.am	2020-04-08 13:20:19.914631297 +0200
+++ b/doc/man/Makefile.am	2020-04-14 11:11:26.334806204 +0200
@@ -55,7 +55,7 @@
 # 12126 on savannah.gnu.org. But, maybe it gets fixed soon, it was
 # first reported in 1995 and added to Savannah in in 2005...
 if BUILD_DOC
-man_MANS	       = ocf_heartbeat_AoEtarget.7 \
+man_MANS                = ocf_heartbeat_AoEtarget.7 \
                           ocf_heartbeat_AudibleAlarm.7 \
                           ocf_heartbeat_ClusterMon.7 \
                           ocf_heartbeat_CTDB.7 \
@@ -186,6 +186,22 @@
 man_MANS           	+= ocf_heartbeat_IPv6addr.7
 endif
 
+if BUILD_AZURE_EVENTS
+man_MANS           	+= ocf_heartbeat_azure-events.7
+endif
+
+if BUILD_GCP_PD_MOVE
+man_MANS           	+= ocf_heartbeat_gcp-pd-move.7
+endif
+
+if BUILD_GCP_VPC_MOVE_ROUTE
+man_MANS           	+= ocf_heartbeat_gcp-vpc-move-route.7
+endif
+
+if BUILD_GCP_VPC_MOVE_VIP
+man_MANS           	+= ocf_heartbeat_gcp-vpc-move-vip.7
+endif
+
 xmlfiles		= $(man_MANS:.7=.xml)
 
 %.1 %.5 %.7 %.8: %.xml
diff -uNr a/heartbeat/azure-events.in b/heartbeat/azure-events.in
--- a/heartbeat/azure-events.in	1970-01-01 01:00:00.000000000 +0100
+++ b/heartbeat/azure-events.in	2020-04-14 11:11:26.325806378 +0200
@@ -0,0 +1,824 @@
+#!@PYTHON@ -tt
+#
+#	Resource agent for monitoring Azure Scheduled Events
+#
+# 	License:	GNU General Public License (GPL)
+#	(c) 2018 	Tobias Niekamp, Microsoft Corp.
+#				and Linux-HA contributors
+
+import os
+import sys
+import time
+import subprocess
+import json
+try:
+		import urllib2
+except ImportError:
+		import urllib.request as urllib2
+import socket
+from collections import defaultdict
+
+OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT"))
+sys.path.append(OCF_FUNCTIONS_DIR)
+import ocf
+
+##############################################################################
+
+
+VERSION = "0.10"
+USER_AGENT = "Pacemaker-ResourceAgent/%s %s" % (VERSION, ocf.distro())
+
+attr_globalPullState = "azure-events_globalPullState"
+attr_lastDocVersion  = "azure-events_lastDocVersion"
+attr_curNodeState = "azure-events_curNodeState"
+attr_pendingEventIDs = "azure-events_pendingEventIDs"
+
+default_loglevel = ocf.logging.INFO
+default_relevantEventTypes = set(["Reboot", "Redeploy"])
+
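+# monitor() serializes pulls from the Scheduled Events endpoint through
+# attr_globalPullState: if another node is PULLING, it retries up to
+# global_pullMaxAttempts times with global_pullDelaySecs between attempts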
+global_pullMaxAttempts = 3
+global_pullDelaySecs = 1
+
+##############################################################################
+
+class attrDict(defaultdict):
+	"""
+	A wrapper for accessing dict keys like an attribute
+	"""
+	def __init__(self, data):
+		super(attrDict, self).__init__(attrDict)
+		for d in data.keys():
+			self.__setattr__(d, data[d])
+
+	def __getattr__(self, key):
+		try:
+			return self[key]
+		except KeyError:
+			raise AttributeError(key)
+
+	def __setattr__(self, key, value):
+		self[key] = value
+
+##############################################################################
+
+class azHelper:
+	"""
+	Helper class for Azure's metadata API (including Scheduled Events)
+	"""
+	metadata_host = "http://169.254.169.254/metadata"
+	instance_api  = "instance"
+	events_api    = "scheduledevents"
+	api_version   = "2017-08-01"
+
+	@staticmethod
+	def _sendMetadataRequest(endpoint, postData=None):
+		"""
+		Send a request to the Azure Metadata Service API
+		"""
+		url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, azHelper.api_version)
+		ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s" % (endpoint, postData))
+		ocf.logger.debug("_sendMetadataRequest: url = %s" % url)
+
+		req = urllib2.Request(url, postData)
+		req.add_header("Metadata", "true")
+		req.add_header("User-Agent", USER_AGENT)
+		resp = urllib2.urlopen(req)
+		data = resp.read()
+		ocf.logger.debug("_sendMetadataRequest: response = %s" % data)
+		if data:
+			data = json.loads(data)
+
+		ocf.logger.debug("_sendMetadataRequest: finished")
+		return data
+
+	@staticmethod
+	def getInstanceInfo():
+		"""
+		Fetch details about the current VM from the Azure Metadata Service API
+		"""
+		ocf.logger.debug("getInstanceInfo: begin")
+
+		jsondata = azHelper._sendMetadataRequest(azHelper.instance_api)
+		ocf.logger.debug("getInstanceInfo: json = %s" % jsondata)
+
+		ocf.logger.debug("getInstanceInfo: finished, returning {}".format(jsondata["compute"]))
+		return attrDict(jsondata["compute"])
+
+	@staticmethod
+	def pullScheduledEvents():
+		"""
+		Retrieve all currently scheduled events via Azure Metadata Service API
+		"""
+		ocf.logger.debug("pullScheduledEvents: begin")
+
+		jsondata = azHelper._sendMetadataRequest(azHelper.events_api)
+		ocf.logger.debug("pullScheduledEvents: json = %s" % jsondata)
+
+		ocf.logger.debug("pullScheduledEvents: finished")
+		return attrDict(jsondata)
+
+	@staticmethod
+	def forceEvents(eventIDs):
+		"""
+		Force a set of events to start immediately
+		"""
+		ocf.logger.debug("forceEvents: begin")
+
+		events = []
+		for e in eventIDs:
+			events.append({
+				"EventId": e,
+			})
+		postData = {
+			"StartRequests" : events
+		}
+		ocf.logger.info("forceEvents: postData = %s" % postData)
+		resp = azHelper._sendMetadataRequest(azHelper.events_api, postData=json.dumps(postData))
+
+		ocf.logger.debug("forceEvents: finished")
+		return
+
+##############################################################################
+
+class clusterHelper:
+	"""
+	Helper functions for Pacemaker control via crm
+	"""
+	@staticmethod
+	def _getLocation(node):
+		"""
+		Helper function to retrieve local/global attributes
+		"""
+		if node:
+			return ["--node", node]
+		else:
+			return ["--type", "crm_config"]
+
+	@staticmethod
+	def _exec(command, *args):
+		"""
+		Helper function to execute a UNIX command
+		"""
+		args = list(args)
+		ocf.logger.debug("_exec: begin; command = %s, args = %s" % (command, str(args)))
+
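+		# flatten recursively expands nested lists/tuples (e.g. the list
+		# returned by _getLocation()) into a flat sequence of strings that
+		# subprocess.check_output() accepts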
+		def flatten(*n):
+			return (str(e) for a in n
+				for e in (flatten(*a) if isinstance(a, (tuple, list)) else (str(a),)))
+		command = list(flatten([command] + args))
+		ocf.logger.debug("_exec: cmd = %s" % " ".join(command))
+		try:
+			ret = subprocess.check_output(command)
+			ocf.logger.debug("_exec: return = %s" % ret)
+			return ret.rstrip()
+		except Exception as err:
+			ocf.logger.exception(err)
+			return None
+
+	@staticmethod
+	def setAttr(key, value, node=None):
+		"""
+		Set the value of a specific global/local attribute in the Pacemaker cluster
+		"""
+		ocf.logger.debug("setAttr: begin; key = %s, value = %s, node = %s" % (key, value, node))
+
+		if value:
+			ret = clusterHelper._exec("crm_attribute",
+									  "--name", key,
+									  "--update", value,
+									  clusterHelper._getLocation(node))
+		else:
+			ret = clusterHelper._exec("crm_attribute",
+									  "--name", key,
+									  "--delete",
+									  clusterHelper._getLocation(node))
+
+		ocf.logger.debug("setAttr: finished")
+		return len(ret) == 0
+
+	@staticmethod
+	def getAttr(key, node=None):
+		"""
+		Retrieve a global/local attribute from the Pacemaker cluster
+		"""
+		ocf.logger.debug("getAttr: begin; key = %s, node = %s" % (key, node))
+
+		val = clusterHelper._exec("crm_attribute",
+								  "--name", key,
+								  "--query", "--quiet",
+								  "--default", "",
+								  clusterHelper._getLocation(node))
+		ocf.logger.debug("getAttr: finished")
+		if not val:
+			return None
+		return val if not val.isdigit() else int(val)
+
+	@staticmethod
+	def getAllNodes():
+		"""
+		Get a list of hostnames for all nodes in the Pacemaker cluster
+		"""
+		ocf.logger.debug("getAllNodes: begin")
+
+		nodes = []
+		nodeList = clusterHelper._exec("crm_node", "--list")
+		for n in nodeList.decode().split("\n"):
+			nodes.append(n.split()[1])
+		ocf.logger.debug("getAllNodes: finished; return %s" % str(nodes))
+
+		return nodes
+
+	@staticmethod
+	def getHostNameFromAzName(azName):
+		"""
+		Helper function to get the actual host name from an Azure node name
+		"""
+		return clusterHelper.getAttr("hostName_%s" % azName)
+
+	@staticmethod
+	def removeHoldFromNodes():
+		"""
+		Remove the ON_HOLD state from all nodes in the Pacemaker cluster
+		"""
+		ocf.logger.debug("removeHoldFromNodes: begin")
+
+		for n in clusterHelper.getAllNodes():
+			if clusterHelper.getAttr(attr_curNodeState, node=n) == "ON_HOLD":
+				clusterHelper.setAttr(attr_curNodeState, "AVAILABLE", node=n)
+				ocf.logger.info("removeHoldFromNodes: removed ON_HOLD from node %s" % n)
+
+		ocf.logger.debug("removeHoldFromNodes: finished")
+		return False
+
+	@staticmethod
+	def otherNodesAvailable(exceptNode):
+		"""
+		Check if there are any nodes (except a given node) in the Pacemaker cluster that have state AVAILABLE
+		"""
+		ocf.logger.debug("otherNodesAvailable: begin; exceptNode = %s" % exceptNode)
+
+		for n in clusterHelper.getAllNodes():
+			state = clusterHelper.getAttr(attr_curNodeState, node=n)
+			state = stringToNodeState(state) if state else AVAILABLE
+			if state == AVAILABLE and n != exceptNode.hostName:
+				ocf.logger.info("otherNodesAvailable: at least %s is available" % n)
+				ocf.logger.debug("otherNodesAvailable: finished")
+				return True
+		ocf.logger.info("otherNodesAvailable: no other nodes are available")
+		ocf.logger.debug("otherNodesAvailable: finished")
+
+		return False
+
+	@staticmethod
+	def transitionSummary():
+		"""
+		Get the current Pacemaker transition summary (used to check if all resources are stopped when putting a node standby)
+		"""
+		# <tniek> Is a global crm_simulate "too much"? Or would it be sufficient if there are no planned transitions for a particular node?
+		# # crm_simulate -Ls
+		# 	Transition Summary:
+		# 	 * Promote rsc_SAPHana_HN1_HDB03:0      (Slave -> Master hsr3-db1)
+		# 	 * Stop    rsc_SAPHana_HN1_HDB03:1      (hsr3-db0)
+		# 	 * Move    rsc_ip_HN1_HDB03     (Started hsr3-db0 -> hsr3-db1)
+		# 	 * Start   rsc_nc_HN1_HDB03     (hsr3-db1)
+		# # Expected result when there are no pending actions:
+		# 	Transition Summary:
+		ocf.logger.debug("transitionSummary: begin")
+
+		summary = clusterHelper._exec("crm_simulate", "-Ls")
+		if not summary:
+			ocf.logger.warning("transitionSummary: could not load transition summary")
+			return False
+		if summary.find("Transition Summary:") < 0:
+			ocf.logger.warning("transitionSummary: received unexpected transition summary: %s" % summary)
+			return False
+		summary = summary.split("Transition Summary:")[1]
+		ret = summary.decode().split("\n").pop(0)
+
+		ocf.logger.debug("transitionSummary: finished; return = %s" % str(ret))
+		return ret
+
+	@staticmethod
+	def listOperationsOnNode(node):
+		"""
+		Get a list of all current operations for a given node (used to check if any resources are pending)
+		"""
+		# hsr3-db1:/home/tniek # crm_resource --list-operations -N hsr3-db0
+		# rsc_azure-events    (ocf::heartbeat:azure-events):      Started: rsc_azure-events_start_0 (node=hsr3-db0, call=91, rc=0, last-rc-change=Fri Jun  8 22:37:46 2018, exec=115ms): complete
+		# rsc_azure-events    (ocf::heartbeat:azure-events):      Started: rsc_azure-events_monitor_10000 (node=hsr3-db0, call=93, rc=0, last-rc-change=Fri Jun  8 22:37:47 2018, exec=197ms): complete
+		# rsc_SAPHana_HN1_HDB03   (ocf::suse:SAPHana):    Master: rsc_SAPHana_HN1_HDB03_start_0 (node=hsr3-db0, call=-1, rc=193, last-rc-change=Fri Jun  8 22:37:46 2018, exec=0ms): pending
+		# rsc_SAPHanaTopology_HN1_HDB03   (ocf::suse:SAPHanaTopology):    Started: rsc_SAPHanaTopology_HN1_HDB03_start_0 (node=hsr3-db0, call=90, rc=0, last-rc-change=Fri Jun  8 22:37:46 2018, exec=3214ms): complete
+		ocf.logger.debug("listOperationsOnNode: begin; node = %s" % node)
+
+		resources = clusterHelper._exec("crm_resource", "--list-operations", "-N", node)
+		if len(resources) == 0:
+			ret = []
+		else:
+			ret = resources.decode().split("\n")
+
+		ocf.logger.debug("listOperationsOnNode: finished; return = %s" % str(ret))
+		return ret
+
+	@staticmethod
+	def noPendingResourcesOnNode(node):
+		"""
+		Check that there are no pending resources on a given node
+		"""
+		ocf.logger.debug("noPendingResourcesOnNode: begin; node = %s" % node)
+
+		for r in clusterHelper.listOperationsOnNode(node):
+			ocf.logger.debug("noPendingResourcesOnNode: * %s" % r)
+			resource = r.split()[-1]
+			if resource == "pending":
+				ocf.logger.info("noPendingResourcesOnNode: found resource %s that is still pending" % resource)
+				ocf.logger.debug("noPendingResourcesOnNode: finished; return = False")
+				return False
+		ocf.logger.info("noPendingResourcesOnNode: no pending resources on node %s" % node)
+		ocf.logger.debug("noPendingResourcesOnNode: finished; return = True")
+
+		return True
+
+	@staticmethod
+	def allResourcesStoppedOnNode(node):
+		"""
+		Check that all resources on a given node are stopped
+		"""
+		ocf.logger.debug("allResourcesStoppedOnNode: begin; node = %s" % node)
+
+		if clusterHelper.noPendingResourcesOnNode(node):
+			if len(clusterHelper.transitionSummary()) == 0:
+				ocf.logger.info("allResourcesStoppedOnNode: no pending resources on node %s and empty transition summary" % node)
+				ocf.logger.debug("allResourcesStoppedOnNode: finished; return = True")
+				return True
+			ocf.logger.info("allResourcesStoppedOnNode: transition summary is not empty")
+			ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False")
+			return False
+
+		ocf.logger.info("allResourcesStoppedOnNode: still pending resources on node %s" % node)
+		ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False")
+		return False
+
+##############################################################################
+
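+# Node state machine: a node is AVAILABLE until it accepts a relevant event
+# and goes STOPPING; once all of its resources are stopped, the event is
+# forced via the metadata API and the node is IN_EVENT; after the event
+# finishes it is put back online and becomes AVAILABLE again. A node that
+# cannot stop (because no other node is AVAILABLE) is parked ON_HOLD.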
+AVAILABLE = 0	# Node is online and ready to handle events
+STOPPING = 1	# Standby has been triggered, but some resources are still running
+IN_EVENT = 2	# All resources are stopped, and event has been initiated via Azure Metadata Service
+ON_HOLD = 3		# Node has a pending event that cannot be started because there are no other nodes available
+
+def stringToNodeState(name):
+	if type(name) == int: return name
+	if name == "STOPPING": return STOPPING
+	if name == "IN_EVENT": return IN_EVENT
+	if name == "ON_HOLD": return ON_HOLD
+	return AVAILABLE
+
+def nodeStateToString(state):
+	if state == STOPPING: return "STOPPING"
+	if state == IN_EVENT: return "IN_EVENT"
+	if state == ON_HOLD: return "ON_HOLD"
+	return "AVAILABLE"
+
+##############################################################################
+
+class Node:
+	"""
+	Core class implementing logic for a cluster node
+	"""
+	def __init__(self, ra):
+		self.raOwner  = ra
+		self.azInfo   = azHelper.getInstanceInfo()
+		self.azName   = self.azInfo.name
+		self.hostName = socket.gethostname()
+		self.setAttr("azName", self.azName)
+		clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName)
+
+	def getAttr(self, key):
+		"""
+		Get a local attribute
+		"""
+		return clusterHelper.getAttr(key, node=self.hostName)
+
+	def setAttr(self, key, value):
+		"""
+		Set a local attribute
+		"""
+		return clusterHelper.setAttr(key, value, node=self.hostName)
+
+	def selfOrOtherNode(self, node):
+		"""
+		Helper function to distinguish self/other node
+		"""
+		return node if node else self.hostName
+
+	def setState(self, state, node=None):
+		"""
+		Set the state for a given node (or self)
+		"""
+		node = self.selfOrOtherNode(node)
+		ocf.logger.debug("setState: begin; node = %s, state = %s" % (node, nodeStateToString(state)))
+
+		clusterHelper.setAttr(attr_curNodeState, nodeStateToString(state), node=node)
+
+		ocf.logger.debug("setState: finished")
+
+	def getState(self, node=None):
+		"""
+		Get the state for a given node (or self)
+		"""
+		node = self.selfOrOtherNode(node)
+		ocf.logger.debug("getState: begin; node = %s" % node)
+
+		state = clusterHelper.getAttr(attr_curNodeState, node=node)
+		ocf.logger.debug("getState: state = %s" % state)
+		ocf.logger.debug("getState: finished")
+		if not state:
+			return AVAILABLE
+		return stringToNodeState(state)
+
+	def setEventIDs(self, eventIDs, node=None):
+		"""
+		Set pending EventIDs for a given node (or self)
+		"""
+		node = self.selfOrOtherNode(node)
+		ocf.logger.debug("setEventIDs: begin; node = %s, eventIDs = %s" % (node, str(eventIDs)))
+
+		if eventIDs:
+			eventIDStr = ",".join(eventIDs)
+		else:
+			eventIDStr = None
+		clusterHelper.setAttr(attr_pendingEventIDs, eventIDStr, node=node)
+
+		ocf.logger.debug("setEventIDs: finished")
+		return
+
+	def getEventIDs(self, node=None):
+		"""
+		Get pending EventIDs for a given node (or self)
+		"""
+		node = self.selfOrOtherNode(node)
+		ocf.logger.debug("getEventIDs: begin; node = %s" % node)
+
+		eventIDStr = clusterHelper.getAttr(attr_pendingEventIDs, node=node)
+		if eventIDStr:
+			eventIDs = eventIDStr.decode().split(",")
+		else:
+			eventIDs = None
+
+		ocf.logger.debug("getEventIDs: finished; eventIDs = %s" % str(eventIDs))
+		return eventIDs
+
+	def updateNodeStateAndEvents(self, state, eventIDs, node=None):
+		"""
+		Set the state and pending EventIDs for a given node (or self)
+		"""
+		ocf.logger.debug("updateNodeStateAndEvents: begin; node = %s, state = %s, eventIDs = %s" % (node, nodeStateToString(state), str(eventIDs)))
+
+		self.setState(state, node=node)
+		self.setEventIDs(eventIDs, node=node)
+
+		ocf.logger.debug("updateNodeStateAndEvents: finished")
+		return state
+
+	def putNodeStandby(self, node=None):
+		"""
+		Put self to standby
+		"""
+		node = self.selfOrOtherNode(node)
+		ocf.logger.debug("putNodeStandby: begin; node = %s" % node)
+
+		clusterHelper._exec("crm_attribute",
+							"-t", "nodes",
+							"-N", node,
+							"-n", "standby",
+							"-v", "on",
+							"--lifetime=forever")
+
+		ocf.logger.debug("putNodeStandby: finished")
+
+	def putNodeOnline(self, node=None):
+		"""
+		Put self back online
+		"""
+		node = self.selfOrOtherNode(node)
+		ocf.logger.debug("putNodeOnline: begin; node = %s" % node)
+
+		clusterHelper._exec("crm_attribute",
+							"-t", "nodes",
+							"-N", node,
+							"-n", "standby",
+							"-v", "off",
+							"--lifetime=forever")
+
+		ocf.logger.debug("putNodeOnline: finished")
+
+	def separateEvents(self, events):
+		"""
+		Split own/other nodes' events
+		"""
+		ocf.logger.debug("separateEvents: begin; events = %s" % str(events))
+
+		localEvents = []
+		remoteEvents = []
+		for e in events:
+			e = attrDict(e)
+			if e.EventType not in self.raOwner.relevantEventTypes:
+				continue
+			if self.azName in e.Resources:
+				localEvents.append(e)
+			else:
+				remoteEvents.append(e)
+		ocf.logger.debug("separateEvents: finished; localEvents = %s, remoteEvents = %s" % (str(localEvents), str(remoteEvents)))
+		return (localEvents, remoteEvents)
+
+	def removeOrphanedEvents(self, azEvents):
+		"""
+		Remove remote events that are already finished
+		"""
+		ocf.logger.debug("removeOrphanedEvents: begin; azEvents = %s" % str(azEvents))
+
+		azEventIDs = set()
+		for e in azEvents:
+			azEventIDs.add(e.EventId)
+		# for all nodes except self ...
+		for n in clusterHelper.getAllNodes():
+			if n == self.hostName:
+				continue
+			curState = self.getState(node=n)
+			# ... that are still in an event or still stopping resources ...
+			if curState in (STOPPING, IN_EVENT):
+				ocf.logger.info("removeOrphanedEvents: node %s has state %s" % (n, curState))
+				clusterEventIDs = self.getEventIDs(node=n)
+				stillActive = False
+				# ... but don't have any more events running according to Azure, ...
+				for p in clusterEventIDs:
+					if p in azEventIDs:
+						ocf.logger.info("removeOrphanedEvents: (at least) event %s on node %s has not yet finished" % (str(p), n))
+						stillActive = True
+						break
+				if not stillActive:
+					# ... put them back online.
+					ocf.logger.info("removeOrphanedEvents: clusterEvents %s on node %s are not in azEvents %s -> bring node back online" % (str(clusterEventIDs), n, str(azEventIDs)))
+					self.putNodeOnline(node=n)
+
+		ocf.logger.debug("removeOrphanedEvents: finished")
+
+	def handleRemoteEvents(self, azEvents):
+		"""
+		Handle a list of events (as provided by Azure Metadata Service) for other nodes
+		"""
+		ocf.logger.debug("handleRemoteEvents: begin; hostName = %s, events = %s" % (self.hostName, str(azEvents)))
+
+		if len(azEvents) == 0:
+			ocf.logger.debug("handleRemoteEvents: no remote events to handle")
+			ocf.logger.debug("handleRemoteEvents: finished")
+			return
+		eventIDsForNode = {}
+
+		# iterate through all current events as per Azure
+		for e in azEvents:
+			ocf.logger.info("handleRemoteEvents: handling remote event %s (%s; nodes = %s)" % (e.EventId, e.EventType, str(e.Resources)))
+			# before we can force an event to start, we need to ensure all nodes involved have stopped their resources
+			if e.EventStatus == "Scheduled":
+				allNodesStopped = True
+				for azName in e.Resources:
+					hostName = clusterHelper.getHostNameFromAzName(azName)
+					state = self.getState(node=hostName)
+					if state == STOPPING:
+						# the only way we can continue is when node state is STOPPING, but all resources have been stopped
+						if not clusterHelper.allResourcesStoppedOnNode(hostName):
+							ocf.logger.info("handleRemoteEvents: (at least) node %s still has resources running -> wait" % hostName)
+							allNodesStopped = False
+							break
+					elif state in (AVAILABLE, IN_EVENT, ON_HOLD):
+						ocf.logger.info("handleRemoteEvents: node %s is still %s -> remote event needs to be picked up locally" % (hostName, nodeStateToString(state)))
+						allNodesStopped = False
+						break
+				if allNodesStopped:
+					ocf.logger.info("handleRemoteEvents: nodes %s are stopped -> add remote event %s to force list" % (str(e.Resources), e.EventId))
+					for n in e.Resources:
+						hostName = clusterHelper.getHostNameFromAzName(n)
+						if hostName in eventIDsForNode:
+							eventIDsForNode[hostName].append(e.EventId)
+						else:
+							eventIDsForNode[hostName] = [e.EventId]
+			elif e.EventStatus == "Started":
+				ocf.logger.info("handleRemoteEvents: remote event already started")
+
+		# force the start of all events whose nodes are ready (i.e. have no more resources running)
+		if len(eventIDsForNode.keys()) > 0:
+			eventIDsToForce = set([item for sublist in eventIDsForNode.values() for item in sublist])
+			ocf.logger.info("handleRemoteEvents: set nodes %s to IN_EVENT; force remote events %s" % (str(eventIDsForNode.keys()), str(eventIDsToForce)))
+			for node, eventId in eventIDsForNode.items():
+				self.updateNodeStateAndEvents(IN_EVENT, eventId, node=node)
+			azHelper.forceEvents(eventIDsToForce)
+
+		ocf.logger.debug("handleRemoteEvents: finished")
+
+	def handleLocalEvents(self, azEvents):
+		"""
+		Handle a list of own events (as provided by Azure Metadata Service)
+		"""
+		ocf.logger.debug("handleLocalEvents: begin; hostName = %s, azEvents = %s" % (self.hostName, str(azEvents)))
+
+		azEventIDs = set()
+		for e in azEvents:
+			azEventIDs.add(e.EventId)
+
+		curState = self.getState()
+		clusterEventIDs = self.getEventIDs()
+		mayUpdateDocVersion = False
+		ocf.logger.info("handleLocalEvents: current state = %s; pending local clusterEvents = %s" % (nodeStateToString(curState), str(clusterEventIDs)))
+
+		# check if there are currently/still events set for the node
+		if clusterEventIDs:
+			# there are pending events set, so our state must be STOPPING or IN_EVENT
+			i = 0; touchedEventIDs = False
+			while i < len(clusterEventIDs):
+				# clean up pending events that are already finished according to AZ
+				if clusterEventIDs[i] not in azEventIDs:
+					ocf.logger.info("handleLocalEvents: remove finished local clusterEvent %s" % (clusterEventIDs[i]))
+					clusterEventIDs.pop(i)
+					touchedEventIDs = True
+				else:
+					i += 1
+			if len(clusterEventIDs) > 0:
+				# there are still pending events (either because we're still stopping, or because the event is still in place)
+				# either way, we need to wait
+				if touchedEventIDs:
+					ocf.logger.info("handleLocalEvents: updated remaining local clusterEvents %s" % str(clusterEventIDs))
+					self.setEventIDs(clusterEventIDs)
+				else:
+					ocf.logger.info("handleLocalEvents: no local clusterEvents were updated")
+			else:
+				# there are no more pending events left after cleanup
+				if clusterHelper.noPendingResourcesOnNode(self.hostName):
+					# and no pending resources on the node -> set it back online
+					ocf.logger.info("handleLocalEvents: all local events finished -> clean up, put node online and AVAILABLE")
+					curState = self.updateNodeStateAndEvents(AVAILABLE, None)
+					self.putNodeOnline()
+					clusterHelper.removeHoldFromNodes()
+					# repeat handleLocalEvents() since we changed status to AVAILABLE
+				else:
+					ocf.logger.info("handleLocalEvents: all local events finished, but some resources have not completed startup yet -> wait")
+		else:
+			# there are no pending events set for us (yet)
+			if curState == AVAILABLE:
+				if len(azEventIDs) > 0:
+					if clusterHelper.otherNodesAvailable(self):
+						ocf.logger.info("handleLocalEvents: can handle local events %s -> set state STOPPING" % (str(azEventIDs)))
+						# this will also set mayUpdateDocVersion = True
+						curState = self.updateNodeStateAndEvents(STOPPING, azEventIDs)
+					else:
+						ocf.logger.info("handleLocalEvents: cannot handle azEvents %s (only node available) -> set state ON_HOLD" % str(azEventIDs))
+						self.setState(ON_HOLD)
+				else:
+					ocf.logger.debug("handleLocalEvents: no local azEvents to handle")
+			if curState == STOPPING:
+				if clusterHelper.noPendingResourcesOnNode(self.hostName):
+					ocf.logger.info("handleLocalEvents: all local resources are started properly -> put node standby")
+					self.putNodeStandby()
+					mayUpdateDocVersion = True
+				else:
+					ocf.logger.info("handleLocalEvents: some local resources are not clean yet -> wait")
+
+		ocf.logger.debug("handleLocalEvents: finished; mayUpdateDocVersion = %s" % str(mayUpdateDocVersion))
+		return mayUpdateDocVersion
+
+##############################################################################
+
+class raAzEvents:
+	"""
+	Main class for resource agent
+	"""
+	def __init__(self, relevantEventTypes):
+		self.node = Node(self)
+		self.relevantEventTypes = relevantEventTypes
+
+	def monitor(self):
+		ocf.logger.debug("monitor: begin")
+
+		pullFailedAttempts = 0
+		while True:
+			# check if another node is pulling at the same time;
+			# this should only be a concern for the first pull, as setting up Scheduled Events may take up to 2 minutes.
+			if clusterHelper.getAttr(attr_globalPullState) == "PULLING":
+				pullFailedAttempts += 1
+				if pullFailedAttempts == global_pullMaxAttempts:
+					ocf.logger.warning("monitor: exceeded maximum number of attempts (%d) to pull events" % global_pullMaxAttempts)
+					ocf.logger.debug("monitor: finished")
+					return ocf.OCF_SUCCESS
+				else:
+					ocf.logger.info("monitor: another node is pulling; retry in %d seconds" % global_pullDelaySecs)
+					time.sleep(global_pullDelaySecs)
+					continue
+
+			# we can pull safely from Azure Metadata Service
+			clusterHelper.setAttr(attr_globalPullState, "PULLING")
+			events = azHelper.pullScheduledEvents()
+			clusterHelper.setAttr(attr_globalPullState, "IDLE")
+
+			# get current document version
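+			# (DocumentIncarnation changes whenever the set of scheduled
+			# events changes, so it doubles as a dedup token here)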
+			curDocVersion  = events.DocumentIncarnation
+			lastDocVersion = self.node.getAttr(attr_lastDocVersion)
+			ocf.logger.debug("monitor: lastDocVersion = %s; curDocVersion = %s" % (lastDocVersion, curDocVersion))
+
+			# split events local/remote
+			(localEvents, remoteEvents) = self.node.separateEvents(events.Events)
+
+			# ensure local events are only executing once
+			if curDocVersion != lastDocVersion:
+				ocf.logger.debug("monitor: curDocVersion has not been handled yet")
+				# handleLocalEvents() returns True if mayUpdateDocVersion is True;
+				# this is only the case if we can ensure there are no pending events
+				if self.node.handleLocalEvents(localEvents):
+					ocf.logger.info("monitor: handleLocalEvents completed successfully -> update curDocVersion")
+					self.node.setAttr(attr_lastDocVersion, curDocVersion)
+				else:
+					ocf.logger.debug("monitor: handleLocalEvents still waiting -> keep curDocVersion")
+			else:
+				ocf.logger.info("monitor: already handled curDocVersion, skip")
+
+			# remove orphaned remote events and then handle the remaining remote events
+			self.node.removeOrphanedEvents(remoteEvents)
+			self.node.handleRemoteEvents(remoteEvents)
+			break
+
+		ocf.logger.debug("monitor: finished")
+		return ocf.OCF_SUCCESS
+
+##############################################################################
+
+def setLoglevel(verbose):
+	# set up writing into syslog
+	loglevel = default_loglevel
+	if verbose:
+		opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1))
+		urllib2.install_opener(opener)
+		loglevel = ocf.logging.DEBUG
+	ocf.log.setLevel(loglevel)
+
+description = (
+	"Microsoft Azure Scheduled Events monitoring agent",
+	"""This resource agent implements a monitor for scheduled
+(maintenance) events for a Microsoft Azure VM.
+
+If any relevant events are found, it moves all Pacemaker resources
+away from the affected node to allow for a graceful shutdown.
+
+	Usage:
+		[OCF_RESKEY_eventTypes=VAL] [OCF_RESKEY_verbose=VAL] azure-events ACTION
+
+		action (required): Supported values: monitor, help, meta-data
+		eventTypes (optional): List of event types to be considered
+				relevant by the resource agent (comma-separated).
+				Supported values: Freeze,Reboot,Redeploy
+				Default = Reboot,Redeploy
+/		verbose (optional): If set to true, displays debug info.
+				Default = false
+
+	Deployment:
+		crm configure primitive rsc_azure-events ocf:heartbeat:azure-events \
+			op monitor interval=10s
+		crm configure clone cln_azure-events rsc_azure-events
+
+For further information on Microsoft Azure Scheduled Events, please
+refer to the following documentation:
+https://docs.microsoft.com/en-us/azure/virtual-machines/linux/scheduled-events
+""")
+
+def monitor_action(eventTypes):
+	relevantEventTypes = set(eventTypes.split(",") if eventTypes else [])
+	ra = raAzEvents(relevantEventTypes)
+	return ra.monitor()
+
+def validate_action(eventTypes):
+	if eventTypes:
+		for event in eventTypes.split(","):
+			if event not in ("Freeze", "Reboot", "Redeploy"):
+				ocf.ocf_exit_reason("Event type not one of Freeze, Reboot, Redeploy: " + eventTypes)
+				return ocf.OCF_ERR_CONFIGURED
+	return ocf.OCF_SUCCESS
+
+def main():
+	agent = ocf.Agent("azure-events", shortdesc=description[0], longdesc=description[1])
+	agent.add_parameter(
+		"eventTypes",
+		shortdesc="List of event types to be considered",
+		longdesc="A comma-separated list of event types that will be handled by this resource agent. (Possible values: Freeze,Reboot,Redeploy)",
+		content_type="string",
+		default="Reboot,Redeploy")
+	agent.add_parameter(
+		"verbose",
+		shortdesc="Enable verbose agent logging",
+		longdesc="Set to true to enable verbose logging",
+		content_type="boolean",
+		default="false")
+	agent.add_action("start", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
+	agent.add_action("stop", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
+	agent.add_action("validate-all", timeout=20, handler=validate_action)
+	agent.add_action("monitor", timeout=240, interval=10, handler=monitor_action)
+	setLoglevel(ocf.is_true(ocf.get_parameter("verbose", "false")))
+	agent.run()
+
+if __name__ == '__main__':
+	main()
diff -uNr a/heartbeat/Makefile.am b/heartbeat/Makefile.am
--- a/heartbeat/Makefile.am	2020-04-08 13:20:19.912631331 +0200
+++ b/heartbeat/Makefile.am	2020-04-14 11:11:26.333806223 +0200
@@ -55,7 +55,7 @@
 osp_SCRIPTS	     =  nova-compute-wait	\
 			NovaEvacuate
 
-ocf_SCRIPTS	     =  AoEtarget		\
+ocf_SCRIPTS	      = AoEtarget		\
 			AudibleAlarm		\
 			ClusterMon		\
 			CTDB			\
@@ -120,10 +120,7 @@
 			fio			\
 			galera			\
 			garbd			\
-			gcp-pd-move		\
 			gcp-vpc-move-ip		\
-			gcp-vpc-move-vip	\
-			gcp-vpc-move-route	\
 			iSCSILogicalUnit	\
 			iSCSITarget		\
 			ids			\
@@ -180,6 +177,22 @@
 			vsftpd			\
 			zabbixserver
 
+if BUILD_AZURE_EVENTS
+ocf_SCRIPTS	     += azure-events
+endif
+
+if BUILD_GCP_PD_MOVE
+ocf_SCRIPTS	     += gcp-pd-move
+endif
+
+if BUILD_GCP_VPC_MOVE_ROUTE
+ocf_SCRIPTS	     += gcp-vpc-move-route
+endif
+
+if BUILD_GCP_VPC_MOVE_VIP
+ocf_SCRIPTS	     += gcp-vpc-move-vip
+endif
+
 ocfcommondir		= $(OCF_LIB_DIR_PREFIX)/heartbeat
 ocfcommon_DATA		= ocf-shellfuncs 	\
 			  ocf-binaries	 	\
@@ -208,3 +221,13 @@
 
 %.check: %
 	OCF_ROOT=$(abs_srcdir) OCF_FUNCTIONS_DIR=$(abs_srcdir) ./$< meta-data | xmllint --path $(abs_srcdir) --noout --relaxng $(abs_srcdir)/metadata.rng -
+
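+# "make spellcheck" extracts each agent's meta-data text with xsltproc and
+# pipes it through aspell; words not in make/spellcheck-ignore are listed
+# under the offending agent's name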
+do_spellcheck = printf '[%s]\n' "$(agent)"; \
+                OCF_ROOT=$(abs_srcdir) OCF_FUNCTIONS_DIR=$(abs_srcdir) \
+                ./$(agent) meta-data 2>/dev/null \
+                | xsltproc $(top_srcdir)/make/extract_text.xsl - \
+                | aspell pipe list -d en_US --ignore-case \
+                  --home-dir=$(top_srcdir)/make -p spellcheck-ignore \
+                | sed -n 's|^&\([^:]*\):.*|\1|p';
+spellcheck:
+	@$(foreach agent,$(ocf_SCRIPTS), $(do_spellcheck))
diff -uNr a/m4/ac_python_module.m4 b/m4/ac_python_module.m4
--- a/m4/ac_python_module.m4	1970-01-01 01:00:00.000000000 +0100
+++ b/m4/ac_python_module.m4	2020-04-14 11:11:26.325806378 +0200
@@ -0,0 +1,30 @@
+dnl @synopsis AC_PYTHON_MODULE(modname[, fatal])
+dnl
+dnl Checks for Python module.
+dnl
+dnl If fatal is non-empty then absence of a module will trigger an
+dnl error.
+dnl
+dnl @category InstalledPackages
+dnl @author Andrew Collier <colliera@nu.ac.za>.
+dnl @version 2004-07-14
+dnl @license AllPermissive
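+dnl
+dnl Example usage (modules checked in configure.ac above):
+dnl   AC_PYTHON_MODULE([pyroute2])             dnl sets HAVE_PYMOD_PYROUTE2
+dnl   AC_PYTHON_MODULE([googleapiclient], [t]) dnl fatal when module missing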
+
+AC_DEFUN([AC_PYTHON_MODULE],[
+	AC_MSG_CHECKING(python module: $1)
+	$PYTHON -c "import $1" 2>/dev/null
+	if test $? -eq 0;
+	then
+		AC_MSG_RESULT(yes)
+		eval AS_TR_CPP(HAVE_PYMOD_$1)=yes
+	else
+		AC_MSG_RESULT(no)
+		eval AS_TR_CPP(HAVE_PYMOD_$1)=no
+		#
+		if test -n "$2"
+		then
+			AC_MSG_ERROR(failed to find required module $1)
+			exit 1
+		fi
+	fi
+])
diff -uNr a/systemd/resource-agents.conf b/systemd/resource-agents.conf
--- a/systemd/resource-agents.conf	2018-06-29 14:05:02.000000000 +0200
+++ b/systemd/resource-agents.conf	1970-01-01 01:00:00.000000000 +0100
@@ -1 +0,0 @@
-d /var/run/resource-agents/ 1755 root root
diff -uNr a/systemd/resource-agents.conf.in b/systemd/resource-agents.conf.in
--- a/systemd/resource-agents.conf.in	1970-01-01 01:00:00.000000000 +0100
+++ b/systemd/resource-agents.conf.in	2020-04-14 11:11:26.325806378 +0200
@@ -0,0 +1 @@
+d @HA_RSCTMPDIR@ 1755 root root