Blame SOURCES/bz2130986-azure-events-az-new-ra.patch

a893af
From 5dcd5153f0318e4766f7f4d3e61dfdb4b352c39c Mon Sep 17 00:00:00 2001
a893af
From: MSSedusch <sedusch@microsoft.com>
a893af
Date: Mon, 30 May 2022 15:08:10 +0200
a893af
Subject: [PATCH 1/2] add new Azure Events AZ resource agent
a893af
a893af
---
a893af
 .gitignore                   |   1 +
a893af
 configure.ac                 |   8 +
a893af
 doc/man/Makefile.am          |   4 +
a893af
 heartbeat/Makefile.am        |   4 +
a893af
 heartbeat/azure-events-az.in | 782 +++++++++++++++++++++++++++++++++++
a893af
 5 files changed, 799 insertions(+)
a893af
 create mode 100644 heartbeat/azure-events-az.in
a893af
a893af
diff --git a/.gitignore b/.gitignore
a893af
index 0c259b5cf..e2b7c039c 100644
a893af
--- a/.gitignore
a893af
+++ b/.gitignore
a893af
@@ -54,6 +54,7 @@ heartbeat/Squid
a893af
 heartbeat/SysInfo
a893af
 heartbeat/aws-vpc-route53
a893af
 heartbeat/azure-events
a893af
+heartbeat/azure-events-az
a893af
 heartbeat/clvm
a893af
 heartbeat/conntrackd
a893af
 heartbeat/dnsupdate
a893af
diff --git a/configure.ac b/configure.ac
a893af
index eeecfad0e..5716a2be2 100644
a893af
--- a/configure.ac
a893af
+++ b/configure.ac
a893af
@@ -523,6 +523,13 @@ if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then
a893af
 fi
a893af
 AM_CONDITIONAL(BUILD_AZURE_EVENTS, test $BUILD_AZURE_EVENTS -eq 1)
a893af
 
a893af
+BUILD_AZURE_EVENTS_AZ=1
a893af
+if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then
a893af
+    BUILD_AZURE_EVENTS_AZ=0
a893af
+    AC_MSG_WARN("Not building azure-events-az")
a893af
+fi
a893af
+AM_CONDITIONAL(BUILD_AZURE_EVENTS_AZ, test $BUILD_AZURE_EVENTS_AZ -eq 1)
a893af
+
a893af
 BUILD_GCP_PD_MOVE=1
a893af
 if test -z "$PYTHON" || test "x${HAVE_PYMOD_GOOGLEAPICLIENT}" != xyes || test $BUILD_OCF_PY -eq 0; then
a893af
     BUILD_GCP_PD_MOVE=0
a893af
@@ -976,6 +983,7 @@ rgmanager/Makefile						\
a893af
 
a893af
 dnl Files we output that need to be executable
a893af
 AC_CONFIG_FILES([heartbeat/azure-events], [chmod +x heartbeat/azure-events])
a893af
+AC_CONFIG_FILES([heartbeat/azure-events-az], [chmod +x heartbeat/azure-events-az])
a893af
 AC_CONFIG_FILES([heartbeat/AoEtarget], [chmod +x heartbeat/AoEtarget])
a893af
 AC_CONFIG_FILES([heartbeat/ManageRAID], [chmod +x heartbeat/ManageRAID])
a893af
 AC_CONFIG_FILES([heartbeat/ManageVE], [chmod +x heartbeat/ManageVE])
a893af
diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
a893af
index cd8fd16bf..658c700ac 100644
a893af
--- a/doc/man/Makefile.am
a893af
+++ b/doc/man/Makefile.am
a893af
@@ -219,6 +219,10 @@ if BUILD_AZURE_EVENTS
a893af
 man_MANS           	+= ocf_heartbeat_azure-events.7
a893af
 endif
a893af
 
a893af
+if BUILD_AZURE_EVENTS_AZ
a893af
+man_MANS           	+= ocf_heartbeat_azure-events-az.7
a893af
+endif
a893af
+
a893af
 if BUILD_GCP_PD_MOVE
a893af
 man_MANS           	+= ocf_heartbeat_gcp-pd-move.7
a893af
 endif
a893af
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
a893af
index 20d41e36a..1133dc13e 100644
a893af
--- a/heartbeat/Makefile.am
a893af
+++ b/heartbeat/Makefile.am
a893af
@@ -188,6 +188,10 @@ if BUILD_AZURE_EVENTS
a893af
 ocf_SCRIPTS	     += azure-events
a893af
 endif
a893af
 
a893af
+if BUILD_AZURE_EVENTS_AZ
a893af
+ocf_SCRIPTS	     += azure-events-az
a893af
+endif
a893af
+
a893af
 if BUILD_GCP_PD_MOVE
a893af
 ocf_SCRIPTS	     += gcp-pd-move
a893af
 endif
a893af
diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in
a893af
new file mode 100644
a893af
index 000000000..616fc8d9e
a893af
--- /dev/null
a893af
+++ b/heartbeat/azure-events-az.in
a893af
@@ -0,0 +1,782 @@
a893af
+#!@PYTHON@ -tt
a893af
+#
a893af
+#	Resource agent for monitoring Azure Scheduled Events
a893af
+#
a893af
+# 	License:	GNU General Public License (GPL)
a893af
+#	(c) 2018 	Tobias Niekamp, Microsoft Corp.
a893af
+#				and Linux-HA contributors
a893af
+
a893af
+import os
a893af
+import sys
a893af
+import time
a893af
+import subprocess
a893af
+import json
a893af
+try:
a893af
+		import urllib2
a893af
+		from urllib2 import URLError
a893af
+except ImportError:
a893af
+		import urllib.request as urllib2
a893af
+		from urllib.error import URLError
a893af
+import socket
a893af
+from collections import defaultdict
a893af
+
a893af
+OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT"))
a893af
+sys.path.append(OCF_FUNCTIONS_DIR)
a893af
+import ocf
a893af
+
a893af
+##############################################################################
a893af
+
a893af
+
a893af
+VERSION = "0.10"
a893af
+USER_AGENT = "Pacemaker-ResourceAgent/%s %s" % (VERSION, ocf.distro())
a893af
+
a893af
+attr_globalPullState = "azure-events-az_globalPullState"
a893af
+attr_lastDocVersion  = "azure-events-az_lastDocVersion"
a893af
+attr_curNodeState = "azure-events-az_curNodeState"
a893af
+attr_pendingEventIDs = "azure-events-az_pendingEventIDs"
a893af
+attr_healthstate = "#health-azure"
a893af
+
a893af
+default_loglevel = ocf.logging.INFO
a893af
+default_relevantEventTypes = set(["Reboot", "Redeploy"])
a893af
+
a893af
+global_pullMaxAttempts = 3
a893af
+global_pullDelaySecs = 1
a893af
+
a893af
+##############################################################################
a893af
+
a893af
+class attrDict(defaultdict):
a893af
+	"""
a893af
+	A wrapper for accessing dict keys like an attribute
a893af
+	"""
a893af
+	def __init__(self, data):
a893af
+		super(attrDict, self).__init__(attrDict)
a893af
+		for d in data.keys():
a893af
+			self.__setattr__(d, data[d])
a893af
+
a893af
+	def __getattr__(self, key):
a893af
+		try:
a893af
+			return self[key]
a893af
+		except KeyError:
a893af
+			raise AttributeError(key)
a893af
+
a893af
+	def __setattr__(self, key, value):
a893af
+		self[key] = value
a893af
+
a893af
+##############################################################################
a893af
+
a893af
+class azHelper:
a893af
+	"""
a893af
+	Helper class for Azure's metadata API (including Scheduled Events)
a893af
+	"""
a893af
+	metadata_host = "http://169.254.169.254/metadata"
a893af
+	instance_api  = "instance"
a893af
+	events_api    = "scheduledevents"
a893af
+	api_version   = "2019-08-01"
a893af
+
a893af
+	@staticmethod
a893af
+	def _sendMetadataRequest(endpoint, postData=None):
a893af
+		"""
a893af
+		Send a request to Azure's Azure Metadata Service API
a893af
+		"""
a893af
+		url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, azHelper.api_version)
a893af
+		data = ""
a893af
+		ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s" % (endpoint, postData))
a893af
+		ocf.logger.debug("_sendMetadataRequest: url = %s" % url)
a893af
+
a893af
+		if postData and type(postData) != bytes:
a893af
+			postData = postData.encode()
a893af
+
a893af
+		req = urllib2.Request(url, postData)
a893af
+		req.add_header("Metadata", "true")
a893af
+		req.add_header("User-Agent", USER_AGENT)
a893af
+		try:
a893af
+			resp = urllib2.urlopen(req)
a893af
+		except URLError as e:
a893af
+			if hasattr(e, 'reason'):
a893af
+				ocf.logger.warning("Failed to reach the server: %s" % e.reason)
a893af
+				clusterHelper.setAttr(attr_globalPullState, "IDLE")
a893af
+			elif hasattr(e, 'code'):
a893af
+				ocf.logger.warning("The server couldn\'t fulfill the request. Error code: %s" % e.code)
a893af
+				clusterHelper.setAttr(attr_globalPullState, "IDLE")
a893af
+		else:
a893af
+			data = resp.read()
a893af
+			ocf.logger.debug("_sendMetadataRequest: response = %s" % data)
a893af
+
a893af
+		if data:
a893af
+			data = json.loads(data)
a893af
+
a893af
+		ocf.logger.debug("_sendMetadataRequest: finished")
a893af
+		return data
a893af
+
a893af
+	@staticmethod
a893af
+	def getInstanceInfo():
a893af
+		"""
a893af
+		Fetch details about the current VM from Azure's Azure Metadata Service API
a893af
+		"""
a893af
+		ocf.logger.debug("getInstanceInfo: begin")
a893af
+
a893af
+		jsondata = azHelper._sendMetadataRequest(azHelper.instance_api)
a893af
+		ocf.logger.debug("getInstanceInfo: json = %s" % jsondata)
a893af
+
a893af
+		if jsondata:
a893af
+			ocf.logger.debug("getInstanceInfo: finished, returning {}".format(jsondata["compute"]))
a893af
+			return attrDict(jsondata["compute"])
a893af
+		else:
a893af
+			ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info")
a893af
+			sys.exit(ocf.OCF_ERR_GENERIC)
a893af
+
a893af
+	@staticmethod
a893af
+	def pullScheduledEvents():
a893af
+		"""
a893af
+		Retrieve all currently scheduled events via Azure Metadata Service API
a893af
+		"""
a893af
+		ocf.logger.debug("pullScheduledEvents: begin")
a893af
+
a893af
+		jsondata = azHelper._sendMetadataRequest(azHelper.events_api)
a893af
+		ocf.logger.debug("pullScheduledEvents: json = %s" % jsondata)
a893af
+
a893af
+		ocf.logger.debug("pullScheduledEvents: finished")
a893af
+		return attrDict(jsondata)
a893af
+
a893af
+	@staticmethod
a893af
+	def forceEvents(eventIDs):
a893af
+		"""
a893af
+		Force a set of events to start immediately
a893af
+		"""
a893af
+		ocf.logger.debug("forceEvents: begin")
a893af
+
a893af
+		events = []
a893af
+		for e in eventIDs:
a893af
+			events.append({
a893af
+				"EventId": e,
a893af
+			})
a893af
+		postData = {
a893af
+			"StartRequests" : events
a893af
+		}
a893af
+		ocf.logger.info("forceEvents: postData = %s" % postData)
a893af
+		resp = azHelper._sendMetadataRequest(azHelper.events_api, postData=json.dumps(postData))
a893af
+
a893af
+		ocf.logger.debug("forceEvents: finished")
a893af
+		return
a893af
+
a893af
+##############################################################################
a893af
+
a893af
+class clusterHelper:
a893af
+	"""
a893af
+	Helper functions for Pacemaker control via crm
a893af
+	"""
a893af
+	@staticmethod
a893af
+	def _getLocation(node):
a893af
+		"""
a893af
+		Helper function to retrieve local/global attributes
a893af
+		"""
a893af
+		if node:
a893af
+			return ["--node", node]
a893af
+		else:
a893af
+			return ["--type", "crm_config"]
a893af
+
a893af
+	@staticmethod
a893af
+	def _exec(command, *args):
a893af
+		"""
a893af
+		Helper function to execute a UNIX command
a893af
+		"""
a893af
+		args = list(args)
a893af
+		ocf.logger.debug("_exec: begin; command = %s, args = %s" % (command, str(args)))
a893af
+
a893af
+		def flatten(*n):
a893af
+			return (str(e) for a in n
a893af
+				for e in (flatten(*a) if isinstance(a, (tuple, list)) else (str(a),)))
a893af
+		command = list(flatten([command] + args))
a893af
+		ocf.logger.debug("_exec: cmd = %s" % " ".join(command))
a893af
+		try:
a893af
+			ret = subprocess.check_output(command)
a893af
+			if type(ret) != str:
a893af
+				ret = ret.decode()
a893af
+			ocf.logger.debug("_exec: return = %s" % ret)
a893af
+			return ret.rstrip()
a893af
+		except Exception as err:
a893af
+			ocf.logger.exception(err)
a893af
+			return None
a893af
+
a893af
+	@staticmethod
a893af
+	def setAttr(key, value, node=None):
a893af
+		"""
a893af
+		Set the value of a specific global/local attribute in the Pacemaker cluster
a893af
+		"""
a893af
+		ocf.logger.debug("setAttr: begin; key = %s, value = %s, node = %s" % (key, value, node))
a893af
+
a893af
+		if value:
a893af
+			ret = clusterHelper._exec("crm_attribute",
a893af
+									  "--name", key,
a893af
+									  "--update", value,
a893af
+									  clusterHelper._getLocation(node))
a893af
+		else:
a893af
+			ret = clusterHelper._exec("crm_attribute",
a893af
+									  "--name", key,
a893af
+									  "--delete",
a893af
+									  clusterHelper._getLocation(node))
a893af
+
a893af
+		ocf.logger.debug("setAttr: finished")
a893af
+		return len(ret) == 0
a893af
+
a893af
+	@staticmethod
a893af
+	def getAttr(key, node=None):
a893af
+		"""
a893af
+		Retrieve a global/local attribute from the Pacemaker cluster
a893af
+		"""
a893af
+		ocf.logger.debug("getAttr: begin; key = %s, node = %s" % (key, node))
a893af
+
a893af
+		val = clusterHelper._exec("crm_attribute",
a893af
+								  "--name", key,
a893af
+								  "--query", "--quiet",
a893af
+								  "--default", "",
a893af
+								  clusterHelper._getLocation(node))
a893af
+		ocf.logger.debug("getAttr: finished")
a893af
+		if not val:
a893af
+			return None
a893af
+		return val if not val.isdigit() else int(val)
a893af
+
a893af
+	@staticmethod
a893af
+	def getAllNodes():
a893af
+		"""
a893af
+		Get a list of hostnames for all nodes in the Pacemaker cluster
a893af
+		"""
a893af
+		ocf.logger.debug("getAllNodes: begin")
a893af
+
a893af
+		nodes = []
a893af
+		nodeList = clusterHelper._exec("crm_node", "--list")
a893af
+		for n in nodeList.split("\n"):
a893af
+			nodes.append(n.split()[1])
a893af
+		ocf.logger.debug("getAllNodes: finished; return %s" % str(nodes))
a893af
+
a893af
+		return nodes
a893af
+
a893af
+	@staticmethod
a893af
+	def getHostNameFromAzName(azName):
a893af
+		"""
a893af
+		Helper function to get the actual host name from an Azure node name
a893af
+		"""
a893af
+		return clusterHelper.getAttr("hostName_%s" % azName)
a893af
+
a893af
+	@staticmethod
a893af
+	def removeHoldFromNodes():
a893af
+		"""
a893af
+		Remove the ON_HOLD state from all nodes in the Pacemaker cluster
a893af
+		"""
a893af
+		ocf.logger.debug("removeHoldFromNodes: begin")
a893af
+
a893af
+		for n in clusterHelper.getAllNodes():
a893af
+			if clusterHelper.getAttr(attr_curNodeState, node=n) == "ON_HOLD":
a893af
+				clusterHelper.setAttr(attr_curNodeState, "AVAILABLE", node=n)
a893af
+				ocf.logger.info("removeHoldFromNodes: removed ON_HOLD from node %s" % n)
a893af
+
a893af
+		ocf.logger.debug("removeHoldFromNodes: finished")
a893af
+		return False
a893af
+
a893af
+	@staticmethod
a893af
+	def otherNodesAvailable(exceptNode):
a893af
+		"""
a893af
+		Check if there are any nodes (except a given node) in the Pacemaker cluster that have state AVAILABLE
a893af
+		"""
a893af
+		ocf.logger.debug("otherNodesAvailable: begin; exceptNode = %s" % exceptNode)
a893af
+
a893af
+		for n in clusterHelper.getAllNodes():
a893af
+			state = clusterHelper.getAttr(attr_curNodeState, node=n)
a893af
+			state = stringToNodeState(state) if state else AVAILABLE
a893af
+			if state == AVAILABLE and n != exceptNode.hostName:
a893af
+				ocf.logger.info("otherNodesAvailable: at least %s is available" % n)
a893af
+				ocf.logger.debug("otherNodesAvailable: finished")
a893af
+				return True
a893af
+		ocf.logger.info("otherNodesAvailable: no other nodes are available")
a893af
+		ocf.logger.debug("otherNodesAvailable: finished")
a893af
+
a893af
+		return False
a893af
+
a893af
+	@staticmethod
a893af
+	def transitionSummary():
a893af
+		"""
a893af
+		Get the current Pacemaker transition summary (used to check if all resources are stopped when putting a node standby)
a893af
+		"""
a893af
+		# <tniek> Is a global crm_simulate "too much"? Or would it be sufficient it there are no planned transitions for a particular node?
a893af
+		# # crm_simulate -Ls
a893af
+		# 	Transition Summary:
a893af
+		# 	 * Promote rsc_SAPHana_HN1_HDB03:0      (Slave -> Master hsr3-db1)
a893af
+		# 	 * Stop    rsc_SAPHana_HN1_HDB03:1      (hsr3-db0)
a893af
+		# 	 * Move    rsc_ip_HN1_HDB03     (Started hsr3-db0 -> hsr3-db1)
a893af
+		# 	 * Start   rsc_nc_HN1_HDB03     (hsr3-db1)
a893af
+		# # Excepted result when there are no pending actions:
a893af
+		# 	Transition Summary:
a893af
+		ocf.logger.debug("transitionSummary: begin")
a893af
+
a893af
+		summary = clusterHelper._exec("crm_simulate", "-Ls")
a893af
+		if not summary:
a893af
+			ocf.logger.warning("transitionSummary: could not load transition summary")
a893af
+			return False
a893af
+		if summary.find("Transition Summary:") < 0:
a893af
+			ocf.logger.warning("transitionSummary: received unexpected transition summary: %s" % summary)
a893af
+			return False
a893af
+		summary = summary.split("Transition Summary:")[1]
a893af
+		ret = summary.split("\n").pop(0)
a893af
+
a893af
+		ocf.logger.debug("transitionSummary: finished; return = %s" % str(ret))
a893af
+		return ret
a893af
+
a893af
+	@staticmethod
a893af
+	def listOperationsOnNode(node):
a893af
+		"""
a893af
+		Get a list of all current operations for a given node (used to check if any resources are pending)
a893af
+		"""
a893af
+		# hsr3-db1:/home/tniek # crm_resource --list-operations -N hsr3-db0
a893af
+		# rsc_azure-events-az    (ocf::heartbeat:azure-events-az):      Started: rsc_azure-events-az_start_0 (node=hsr3-db0, call=91, rc=0, last-rc-change=Fri Jun  8 22:37:46 2018, exec=115ms): complete
a893af
+		# rsc_azure-events-az    (ocf::heartbeat:azure-events-az):      Started: rsc_azure-events-az_monitor_10000 (node=hsr3-db0, call=93, rc=0, last-rc-change=Fri Jun  8 22:37:47 2018, exec=197ms): complete
a893af
+		# rsc_SAPHana_HN1_HDB03   (ocf::suse:SAPHana):    Master: rsc_SAPHana_HN1_HDB03_start_0 (node=hsr3-db0, call=-1, rc=193, last-rc-change=Fri Jun  8 22:37:46 2018, exec=0ms): pending
a893af
+		# rsc_SAPHanaTopology_HN1_HDB03   (ocf::suse:SAPHanaTopology):    Started: rsc_SAPHanaTopology_HN1_HDB03_start_0 (node=hsr3-db0, call=90, rc=0, last-rc-change=Fri Jun  8 22:37:46 2018, exec=3214ms): complete
a893af
+		ocf.logger.debug("listOperationsOnNode: begin; node = %s" % node)
a893af
+
a893af
+		resources = clusterHelper._exec("crm_resource", "--list-operations", "-N", node)
a893af
+		if len(resources) == 0:
a893af
+			ret = []
a893af
+		else:
a893af
+			ret = resources.split("\n")
a893af
+
a893af
+		ocf.logger.debug("listOperationsOnNode: finished; return = %s" % str(ret))
a893af
+		return ret
a893af
+
a893af
+	@staticmethod
a893af
+	def noPendingResourcesOnNode(node):
a893af
+		"""
a893af
+		Check that there are no pending resources on a given node
a893af
+		"""
a893af
+		ocf.logger.debug("noPendingResourcesOnNode: begin; node = %s" % node)
a893af
+
a893af
+		for r in clusterHelper.listOperationsOnNode(node):
a893af
+			ocf.logger.debug("noPendingResourcesOnNode: * %s" % r)
a893af
+			resource = r.split()[-1]
a893af
+			if resource == "pending":
a893af
+				ocf.logger.info("noPendingResourcesOnNode: found resource %s that is still pending" % resource)
a893af
+				ocf.logger.debug("noPendingResourcesOnNode: finished; return = False")
a893af
+				return False
a893af
+		ocf.logger.info("noPendingResourcesOnNode: no pending resources on node %s" % node)
a893af
+		ocf.logger.debug("noPendingResourcesOnNode: finished; return = True")
a893af
+
a893af
+		return True
a893af
+
a893af
+	@staticmethod
a893af
+	def allResourcesStoppedOnNode(node):
a893af
+		"""
a893af
+		Check that all resources on a given node are stopped
a893af
+		"""
a893af
+		ocf.logger.debug("allResourcesStoppedOnNode: begin; node = %s" % node)
a893af
+
a893af
+		if clusterHelper.noPendingResourcesOnNode(node):
a893af
+			if len(clusterHelper.transitionSummary()) == 0:
a893af
+				ocf.logger.info("allResourcesStoppedOnNode: no pending resources on node %s and empty transition summary" % node)
a893af
+				ocf.logger.debug("allResourcesStoppedOnNode: finished; return = True")
a893af
+				return True
a893af
+			ocf.logger.info("allResourcesStoppedOnNode: transition summary is not empty")
a893af
+			ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False")
a893af
+			return False
a893af
+
a893af
+		ocf.logger.info("allResourcesStoppedOnNode: still pending resources on node %s" % node)
a893af
+		ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False")
a893af
+		return False
a893af
+
a893af
+##############################################################################
a893af
+
a893af
+AVAILABLE = 0	# Node is online and ready to handle events
a893af
+STOPPING = 1	# Standby has been triggered, but some resources are still running
a893af
+IN_EVENT = 2	# All resources are stopped, and event has been initiated via Azure Metadata Service
a893af
+ON_HOLD = 3		# Node has a pending event that cannot be started there are no other nodes available
a893af
+
a893af
+def stringToNodeState(name):
a893af
+	if type(name) == int: return name
a893af
+	if name == "STOPPING": return STOPPING
a893af
+	if name == "IN_EVENT": return IN_EVENT
a893af
+	if name == "ON_HOLD": return ON_HOLD
a893af
+	return AVAILABLE
a893af
+
a893af
+def nodeStateToString(state):
a893af
+	if state == STOPPING: return "STOPPING"
a893af
+	if state == IN_EVENT: return "IN_EVENT"
a893af
+	if state == ON_HOLD: return "ON_HOLD"
a893af
+	return "AVAILABLE"
a893af
+
a893af
+##############################################################################
a893af
+
a893af
+class Node:
a893af
+	"""
a893af
+	Core class implementing logic for a cluster node
a893af
+	"""
a893af
+	def __init__(self, ra):
a893af
+		self.raOwner  = ra
a893af
+		self.azInfo   = azHelper.getInstanceInfo()
a893af
+		self.azName   = self.azInfo.name
a893af
+		self.hostName = socket.gethostname()
a893af
+		self.setAttr("azName", self.azName)
a893af
+		clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName)
a893af
+
a893af
+	def getAttr(self, key):
a893af
+		"""
a893af
+		Get a local attribute
a893af
+		"""
a893af
+		return clusterHelper.getAttr(key, node=self.hostName)
a893af
+
a893af
+	def setAttr(self, key, value):
a893af
+		"""
a893af
+		Set a local attribute
a893af
+		"""
a893af
+		return clusterHelper.setAttr(key, value, node=self.hostName)
a893af
+
a893af
+	def selfOrOtherNode(self, node):
a893af
+		"""
a893af
+		Helper function to distinguish self/other node
a893af
+		"""
a893af
+		return node if node else self.hostName
a893af
+
a893af
+	def setState(self, state, node=None):
a893af
+		"""
a893af
+		Set the state for a given node (or self)
a893af
+		"""
a893af
+		node = self.selfOrOtherNode(node)
a893af
+		ocf.logger.debug("setState: begin; node = %s, state = %s" % (node, nodeStateToString(state)))
a893af
+
a893af
+		clusterHelper.setAttr(attr_curNodeState, nodeStateToString(state), node=node)
a893af
+
a893af
+		ocf.logger.debug("setState: finished")
a893af
+
a893af
+	def getState(self, node=None):
a893af
+		"""
a893af
+		Get the state for a given node (or self)
a893af
+		"""
a893af
+		node = self.selfOrOtherNode(node)
a893af
+		ocf.logger.debug("getState: begin; node = %s" % node)
a893af
+
a893af
+		state = clusterHelper.getAttr(attr_curNodeState, node=node)
a893af
+		ocf.logger.debug("getState: state = %s" % state)
a893af
+		ocf.logger.debug("getState: finished")
a893af
+		if not state:
a893af
+			return AVAILABLE
a893af
+		return stringToNodeState(state)
a893af
+
a893af
+	def setEventIDs(self, eventIDs, node=None):
a893af
+		"""
a893af
+		Set pending EventIDs for a given node (or self)
a893af
+		"""
a893af
+		node = self.selfOrOtherNode(node)
a893af
+		ocf.logger.debug("setEventIDs: begin; node = %s, eventIDs = %s" % (node, str(eventIDs)))
a893af
+
a893af
+		if eventIDs:
a893af
+			eventIDStr = ",".join(eventIDs)
a893af
+		else:
a893af
+			eventIDStr = None
a893af
+		clusterHelper.setAttr(attr_pendingEventIDs, eventIDStr, node=node)
a893af
+
a893af
+		ocf.logger.debug("setEventIDs: finished")
a893af
+		return
a893af
+
a893af
+	def getEventIDs(self, node=None):
a893af
+		"""
a893af
+		Get pending EventIDs for a given node (or self)
a893af
+		"""
a893af
+		node = self.selfOrOtherNode(node)
a893af
+		ocf.logger.debug("getEventIDs: begin; node = %s" % node)
a893af
+
a893af
+		eventIDStr = clusterHelper.getAttr(attr_pendingEventIDs, node=node)
a893af
+		if eventIDStr:
a893af
+			eventIDs = eventIDStr.split(",")
a893af
+		else:
a893af
+			eventIDs = None
a893af
+
a893af
+		ocf.logger.debug("getEventIDs: finished; eventIDs = %s" % str(eventIDs))
a893af
+		return eventIDs
a893af
+
a893af
+	def updateNodeStateAndEvents(self, state, eventIDs, node=None):
a893af
+		"""
a893af
+		Set the state and pending EventIDs for a given node (or self)
a893af
+		"""
a893af
+		ocf.logger.debug("updateNodeStateAndEvents: begin; node = %s, state = %s, eventIDs = %s" % (node, nodeStateToString(state), str(eventIDs)))
a893af
+
a893af
+		self.setState(state, node=node)
a893af
+		self.setEventIDs(eventIDs, node=node)
a893af
+
a893af
+		ocf.logger.debug("updateNodeStateAndEvents: finished")
a893af
+		return state
a893af
+
a893af
+	def putNodeStandby(self, node=None):
a893af
+		"""
a893af
+		Put self to standby
a893af
+		"""
a893af
+		node = self.selfOrOtherNode(node)
a893af
+		ocf.logger.debug("putNodeStandby: begin; node = %s" % node)
a893af
+
a893af
+		clusterHelper._exec("crm_attribute",
a893af
+							"--node", node,
a893af
+							"--name", attr_healthstate,
a893af
+							"--update", "-1000000",
a893af
+							"--lifetime=forever")
a893af
+
a893af
+		ocf.logger.debug("putNodeStandby: finished")
a893af
+
a893af
+	def isNodeInStandby(self, node=None):
a893af
+		"""
a893af
+		check if node is in standby
a893af
+		"""
a893af
+		node = self.selfOrOtherNode(node)
a893af
+		ocf.logger.debug("isNodeInStandby: begin; node = %s" % node)
a893af
+		isInStandy = False
a893af
+
a893af
+		healthAttributeStr = clusterHelper.getAttr(attr_healthstate, node)
a893af
+		if healthAttributeStr is not None:
a893af
+			try:
a893af
+				healthAttribute = int(healthAttributeStr)
a893af
+				isInStandy = healthAttribute < 0
a893af
+			except ValueError:
a893af
+				# Handle the exception
a893af
+				ocf.logger.warn("Health attribute %s on node %s cannot be converted to an integer value" % (healthAttributeStr, node))
a893af
+		
a893af
+		ocf.logger.debug("isNodeInStandby: finished - result %s" % isInStandy)
a893af
+		return isInStandy
a893af
+
a893af
+	def putNodeOnline(self, node=None):
a893af
+		"""
a893af
+		Put self back online
a893af
+		"""
a893af
+		node = self.selfOrOtherNode(node)
a893af
+		ocf.logger.debug("putNodeOnline: begin; node = %s" % node)
a893af
+
a893af
+		clusterHelper._exec("crm_attribute",
a893af
+							"--node", node,
a893af
+							"--name", "#health-azure",
a893af
+							"--update", "0",
a893af
+							"--lifetime=forever")
a893af
+
a893af
+		ocf.logger.debug("putNodeOnline: finished")
a893af
+
a893af
+	def separateEvents(self, events):
a893af
+		"""
a893af
+		Split own/other nodes' events
a893af
+		"""
a893af
+		ocf.logger.debug("separateEvents: begin; events = %s" % str(events))
a893af
+
a893af
+		localEvents = []
a893af
+		remoteEvents = []
a893af
+		for e in events:
a893af
+			e = attrDict(e)
a893af
+			if e.EventType not in self.raOwner.relevantEventTypes:
a893af
+				continue
a893af
+			if self.azName in e.Resources:
a893af
+				localEvents.append(e)
a893af
+			else:
a893af
+				remoteEvents.append(e)
a893af
+		ocf.logger.debug("separateEvents: finished; localEvents = %s, remoteEvents = %s" % (str(localEvents), str(remoteEvents)))
a893af
+		return (localEvents, remoteEvents)
a893af
+
a893af
+##############################################################################
a893af
+
a893af
+class raAzEvents:
a893af
+	"""
a893af
+	Main class for resource agent
a893af
+	"""
a893af
+	def __init__(self, relevantEventTypes):
a893af
+		self.node = Node(self)
a893af
+		self.relevantEventTypes = relevantEventTypes
a893af
+
a893af
+	def monitor(self):
a893af
+		ocf.logger.debug("monitor: begin")
a893af
+		
a893af
+		events = azHelper.pullScheduledEvents()
a893af
+
a893af
+		# get current document version
a893af
+		curDocVersion  = events.DocumentIncarnation
a893af
+		lastDocVersion = self.node.getAttr(attr_lastDocVersion)
a893af
+		ocf.logger.debug("monitor: lastDocVersion = %s; curDocVersion = %s" % (lastDocVersion, curDocVersion))
a893af
+
a893af
+		# split events local/remote
a893af
+		(localEvents, remoteEvents) = self.node.separateEvents(events.Events)
a893af
+
a893af
+		# ensure local events are only executing once
a893af
+		if curDocVersion == lastDocVersion:
a893af
+			ocf.logger.info("monitor: already handled curDocVersion, skip")
a893af
+			return ocf.OCF_SUCCESS
a893af
+
a893af
+		localAzEventIDs = set()
a893af
+		for e in localEvents:
a893af
+			localAzEventIDs.add(e.EventId)
a893af
+
a893af
+		curState = self.node.getState()
a893af
+		clusterEventIDs = self.node.getEventIDs()
a893af
+
a893af
+		ocf.logger.debug("monitor: curDocVersion has not been handled yet")
a893af
+		
a893af
+		if clusterEventIDs:
a893af
+			# there are pending events set, so our state must be STOPPING or IN_EVENT
a893af
+			i = 0; touchedEventIDs = False
a893af
+			while i < len(clusterEventIDs):
a893af
+				# clean up pending events that are already finished according to AZ
a893af
+				if clusterEventIDs[i] not in localAzEventIDs:
a893af
+					ocf.logger.info("monitor: remove finished local clusterEvent %s" % (clusterEventIDs[i]))
a893af
+					clusterEventIDs.pop(i)
a893af
+					touchedEventIDs = True
a893af
+				else:
a893af
+					i += 1
a893af
+			if len(clusterEventIDs) > 0:
a893af
+				# there are still pending events (either because we're still stopping, or because the event is still in place)
a893af
+				# either way, we need to wait
a893af
+				if touchedEventIDs:
a893af
+					ocf.logger.info("monitor: added new local clusterEvent %s" % str(clusterEventIDs))
a893af
+					self.node.setEventIDs(clusterEventIDs)
a893af
+				else:
a893af
+					ocf.logger.info("monitor: no local clusterEvents were updated")
a893af
+			else:
a893af
+				# there are no more pending events left after cleanup
a893af
+				if clusterHelper.noPendingResourcesOnNode(self.node.hostName):
a893af
+					# and no pending resources on the node -> set it back online
a893af
+					ocf.logger.info("monitor: all local events finished -> clean up, put node online and AVAILABLE")
a893af
+					curState = self.node.updateNodeStateAndEvents(AVAILABLE, None)
a893af
+					self.node.putNodeOnline()
a893af
+					clusterHelper.removeHoldFromNodes()
a893af
+					# If Azure Scheduled Events are not used for 24 hours (e.g. because the cluster was asleep), it will be disabled for a VM.
a893af
+					# When the cluster wakes up and starts using it again, the DocumentIncarnation is reset.
a893af
+					# We need to remove it during cleanup, otherwise azure-events-az will not process the event after wakeup
a893af
+					self.node.setAttr(attr_lastDocVersion, None)
a893af
+				else:
a893af
+					ocf.logger.info("monitor: all local events finished, but some resources have not completed startup yet -> wait")
a893af
+		else:
a893af
+			if curState == AVAILABLE:
a893af
+				if len(localAzEventIDs) > 0:
a893af
+					if clusterHelper.otherNodesAvailable(self.node):
a893af
+						ocf.logger.info("monitor: can handle local events %s -> set state STOPPING" % (str(localAzEventIDs)))
a893af
+						curState = self.node.updateNodeStateAndEvents(STOPPING, localAzEventIDs)
a893af
+					else:
a893af
+						ocf.logger.info("monitor: cannot handle azEvents %s (only node available) -> set state ON_HOLD" % str(localAzEventIDs))
a893af
+						self.node.setState(ON_HOLD)
a893af
+				else:
a893af
+					ocf.logger.debug("monitor: no local azEvents to handle")
a893af
+
a893af
+		if curState == STOPPING:
a893af
+			eventIDsForNode = {}
a893af
+			if clusterHelper.noPendingResourcesOnNode(self.node.hostName):
a893af
+				if not self.node.isNodeInStandby():
a893af
+					ocf.logger.info("monitor: all local resources are started properly -> put node standby and exit")
a893af
+					self.node.putNodeStandby()
a893af
+					return ocf.OCF_SUCCESS
a893af
+
a893af
+				for e in localEvents:
a893af
+					ocf.logger.info("monitor: handling remote event %s (%s; nodes = %s)" % (e.EventId, e.EventType, str(e.Resources)))
a893af
+					# before we can force an event to start, we need to ensure all nodes involved have stopped their resources
a893af
+					if e.EventStatus == "Scheduled":
a893af
+						allNodesStopped = True
a893af
+						for azName in e.Resources:
a893af
+							hostName = clusterHelper.getHostNameFromAzName(azName)
a893af
+							state = self.node.getState(node=hostName)
a893af
+							if state == STOPPING:
a893af
+								# the only way we can continue is when node state is STOPPING, but all resources have been stopped
a893af
+								if not clusterHelper.allResourcesStoppedOnNode(hostName):
a893af
+									ocf.logger.info("monitor: (at least) node %s has still resources running -> wait" % hostName)
a893af
+									allNodesStopped = False
a893af
+									break
a893af
+							elif state in (AVAILABLE, IN_EVENT, ON_HOLD):
a893af
+								ocf.logger.info("monitor: node %s is still %s -> remote event needs to be picked up locally" % (hostName, nodeStateToString(state)))
a893af
+								allNodesStopped = False
a893af
+								break
a893af
+						if allNodesStopped:
a893af
+							ocf.logger.info("monitor: nodes %s are stopped -> add remote event %s to force list" % (str(e.Resources), e.EventId))
a893af
+							for n in e.Resources:
a893af
+								hostName = clusterHelper.getHostNameFromAzName(n)
a893af
+								if hostName in eventIDsForNode:
a893af
+									eventIDsForNode[hostName].append(e.EventId)
a893af
+								else:
a893af
+									eventIDsForNode[hostName] = [e.EventId]
a893af
+					elif e.EventStatus == "Started":
a893af
+						ocf.logger.info("monitor: remote event already started")
a893af
+
a893af
+				# force the start of all events whose nodes are ready (i.e. have no more resources running)
a893af
+				if len(eventIDsForNode.keys()) > 0:
a893af
+					eventIDsToForce = set([item for sublist in eventIDsForNode.values() for item in sublist])
a893af
+					ocf.logger.info("monitor: set nodes %s to IN_EVENT; force remote events %s" % (str(eventIDsForNode.keys()), str(eventIDsToForce)))
a893af
+					for node, eventId in eventIDsForNode.items():
a893af
+						self.node.updateNodeStateAndEvents(IN_EVENT, eventId, node=node)
a893af
+					azHelper.forceEvents(eventIDsToForce)
a893af
+					self.node.setAttr(attr_lastDocVersion, curDocVersion)
a893af
+			else:
a893af
+				ocf.logger.info("monitor: some local resources are not clean yet -> wait")
a893af
+
a893af
+		ocf.logger.debug("monitor: finished")
a893af
+		return ocf.OCF_SUCCESS
a893af
+
a893af
+##############################################################################
a893af
+
a893af
+def setLoglevel(verbose):
a893af
+	# set up writing into syslog
a893af
+	loglevel = default_loglevel
a893af
+	if verbose:
a893af
+		opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1))
a893af
+		urllib2.install_opener(opener)
a893af
+		loglevel = ocf.logging.DEBUG
a893af
+	ocf.log.setLevel(loglevel)
a893af
+
a893af
+description = (
a893af
+	"Microsoft Azure Scheduled Events monitoring agent",
a893af
+	"""This resource agent implements a monitor for scheduled
a893af
+(maintenance) events for a Microsoft Azure VM.
a893af
+
a893af
+If any relevant events are found, it moves all Pacemaker resources
a893af
+away from the affected node to allow for a graceful shutdown.
a893af
+
a893af
+	Usage:
a893af
+		[OCF_RESKEY_eventTypes=VAL] [OCF_RESKEY_verbose=VAL] azure-events-az ACTION
a893af
+
a893af
+		action (required): Supported values: monitor, help, meta-data
a893af
+		eventTypes (optional): List of event types to be considered
a893af
+				relevant by the resource agent (comma-separated).
a893af
+				Supported values: Freeze,Reboot,Redeploy
a893af
+				Default = Reboot,Redeploy
a893af
+/		verbose (optional): If set to true, displays debug info.
a893af
+				Default = false
a893af
+
a893af
+	Deployment:
a893af
+		crm configure primitive rsc_azure-events-az ocf:heartbeat:azure-events-az \
a893af
+			op monitor interval=10s
a893af
+		crm configure clone cln_azure-events-az rsc_azure-events-az
a893af
+
a893af
+For further information on Microsoft Azure Scheduled Events, please
a893af
+refer to the following documentation:
a893af
+https://docs.microsoft.com/en-us/azure/virtual-machines/linux/scheduled-events
a893af
+""")
a893af
+
a893af
+def monitor_action(eventTypes):
a893af
+	relevantEventTypes = set(eventTypes.split(",") if eventTypes else [])
a893af
+	ra = raAzEvents(relevantEventTypes)
a893af
+	return ra.monitor()
a893af
+
a893af
+def validate_action(eventTypes):
a893af
+	if eventTypes:
a893af
+		for event in eventTypes.split(","):
a893af
+			if event not in ("Freeze", "Reboot", "Redeploy"):
a893af
+				ocf.ocf_exit_reason("Event type not one of Freeze, Reboot, Redeploy: " + eventTypes)
a893af
+				return ocf.OCF_ERR_CONFIGURED
a893af
+	return ocf.OCF_SUCCESS
a893af
+
a893af
+def main():
a893af
+	agent = ocf.Agent("azure-events-az", shortdesc=description[0], longdesc=description[1])
a893af
+	agent.add_parameter(
a893af
+		"eventTypes",
a893af
+		shortdesc="List of resources to be considered",
a893af
+		longdesc="A comma-separated list of event types that will be handled by this resource agent. (Possible values: Freeze,Reboot,Redeploy)",
a893af
+		content_type="string",
a893af
+		default="Reboot,Redeploy")
a893af
+	agent.add_parameter(
a893af
+		"verbose",
a893af
+		shortdesc="Enable verbose agent logging",
a893af
+		longdesc="Set to true to enable verbose logging",
a893af
+		content_type="boolean",
a893af
+		default="false")
a893af
+	agent.add_action("start", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
a893af
+	agent.add_action("stop", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
a893af
+	agent.add_action("validate-all", timeout=20, handler=validate_action)
a893af
+	agent.add_action("monitor", timeout=240, interval=10, handler=monitor_action)
a893af
+	setLoglevel(ocf.is_true(ocf.get_parameter("verbose", "false")))
a893af
+	agent.run()
a893af
+
a893af
+if __name__ == '__main__':
a893af
+	main()
a893af
\ No newline at end of file
a893af
a893af
From a95337d882c7cc69d604b050159ad50b679f18be Mon Sep 17 00:00:00 2001
a893af
From: MSSedusch <sedusch@microsoft.com>
a893af
Date: Thu, 2 Jun 2022 14:10:33 +0200
a893af
Subject: [PATCH 2/2] Remove developer documentation
a893af
a893af
---
a893af
 heartbeat/azure-events-az.in | 11 -----------
a893af
 1 file changed, 11 deletions(-)
a893af
a893af
diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in
a893af
index 616fc8d9e..59d095306 100644
a893af
--- a/heartbeat/azure-events-az.in
a893af
+++ b/heartbeat/azure-events-az.in
a893af
@@ -723,17 +723,6 @@ description = (
a893af
 If any relevant events are found, it moves all Pacemaker resources
a893af
 away from the affected node to allow for a graceful shutdown.
a893af
 
a893af
-	Usage:
a893af
-		[OCF_RESKEY_eventTypes=VAL] [OCF_RESKEY_verbose=VAL] azure-events-az ACTION
a893af
-
a893af
-		action (required): Supported values: monitor, help, meta-data
a893af
-		eventTypes (optional): List of event types to be considered
a893af
-				relevant by the resource agent (comma-separated).
a893af
-				Supported values: Freeze,Reboot,Redeploy
a893af
-				Default = Reboot,Redeploy
a893af
-/		verbose (optional): If set to true, displays debug info.
a893af
-				Default = false
a893af
-
a893af
 	Deployment:
a893af
 		crm configure primitive rsc_azure-events-az ocf:heartbeat:azure-events-az \
a893af
 			op monitor interval=10s