diff --git a/SOURCES/bz1905820-LVM-activate-fix-return-codes.patch b/SOURCES/bz1905820-LVM-activate-fix-return-codes.patch new file mode 100644 index 0000000..4597e3f --- /dev/null +++ b/SOURCES/bz1905820-LVM-activate-fix-return-codes.patch @@ -0,0 +1,195 @@ +From 640c2b57f0f3e7256d587ddd5960341cb38b1982 Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Sun, 13 Dec 2020 14:58:34 -0800 +Subject: [PATCH] LVM-activate: Fix return codes + +OCF_ERR_ARGS should be used when the configuration isn't valid for the +**local** node, and so the resource should not attempt to start again +locally until the issue is corrected. + +OCF_ERR_CONFIGURED should be used when the configuration isn't valid on +**any** node, and so the resource should not attempt to start again +anywhere until the issue is corrected. + +One remaining gray area: Should lvmlockd/lvmetad/clvmd improperly +running (or improperly not running) be an OCF_ERR_GENERIC or +OCF_ERR_ARGS? The fact that it's a state issue rather than a config +issue suggests OCF_ERR_GENERIC. The fact that it won't be fixed without +user intervention suggests OCF_ERR_ARGS. The approach here is to use +GENERIC for all of these. One can make the case that "improperly +running" should use ARGS, since a process must be manually stopped to +fix the issue, and that "improperly not running" should use GENERIC, +since there's a small chance the process died and will be recovered in +some way. + +More info about return code meanings: + - https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Administration/html/agents.html#how-are-ocf-return-codes-interpreted + +Resolves: RHBZ#1905820 + +Signed-off-by: Reid Wahl +--- + heartbeat/LVM-activate | 47 +++++++++++++++++++++--------------------- + 1 file changed, 23 insertions(+), 24 deletions(-) + +diff --git a/heartbeat/LVM-activate b/heartbeat/LVM-activate +index c86606637..e951a08e9 100755 +--- a/heartbeat/LVM-activate ++++ b/heartbeat/LVM-activate +@@ -333,8 +333,7 @@ config_verify() + real=$(lvmconfig "$name" | cut -d'=' -f2) + if [ "$real" != "$expect" ]; then + ocf_exit_reason "config item $name: expect=$expect but real=$real" +- exit $OCF_ERR_CONFIGURED +- ++ exit $OCF_ERR_ARGS + fi + + return $OCF_SUCCESS +@@ -366,12 +365,12 @@ lvmlockd_check() + fi + + ocf_exit_reason "lvmlockd daemon is not running!" +- exit $OCF_ERR_CONFIGURED ++ exit $OCF_ERR_GENERIC + fi + + if pgrep clvmd >/dev/null 2>&1 ; then + ocf_exit_reason "clvmd daemon is running unexpectedly." +- exit $OCF_ERR_CONFIGURED ++ exit $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +@@ -402,17 +401,17 @@ clvmd_check() + # Good: clvmd is running, and lvmlockd is not running + if ! pgrep clvmd >/dev/null 2>&1 ; then + ocf_exit_reason "clvmd daemon is not running!" +- exit $OCF_ERR_CONFIGURED ++ exit $OCF_ERR_GENERIC + fi + + if pgrep lvmetad >/dev/null 2>&1 ; then + ocf_exit_reason "Please stop lvmetad daemon when clvmd is running." +- exit $OCF_ERR_CONFIGURED ++ exit $OCF_ERR_GENERIC + fi + + if pgrep lvmlockd >/dev/null 2>&1 ; then + ocf_exit_reason "lvmlockd daemon is running unexpectedly." +- exit $OCF_ERR_CONFIGURED ++ exit $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +@@ -424,12 +423,12 @@ systemid_check() + source=$(lvmconfig 'global/system_id_source' 2>/dev/null | cut -d"=" -f2) + if [ "$source" = "" ] || [ "$source" = "none" ]; then + ocf_exit_reason "system_id_source in lvm.conf is not set correctly!" +- exit $OCF_ERR_CONFIGURED ++ exit $OCF_ERR_ARGS + fi + + if [ -z ${SYSTEM_ID} ]; then + ocf_exit_reason "local/system_id is not set!" +- exit $OCF_ERR_CONFIGURED ++ exit $OCF_ERR_ARGS + fi + + return $OCF_SUCCESS +@@ -441,18 +440,18 @@ tagging_check() + # The volume_list must be initialized to something in order to + # guarantee our tag will be filtered on startup + if ! lvm dumpconfig activation/volume_list; then +- ocf_log err "LVM: Improper setup detected" ++ ocf_log err "LVM: Improper setup detected" + ocf_exit_reason "The volume_list filter must be initialized in lvm.conf for exclusive activation without clvmd" +- exit $OCF_ERR_CONFIGURED ++ exit $OCF_ERR_ARGS + fi + + # Our tag must _NOT_ be in the volume_list. This agent + # overrides the volume_list during activation using the + # special tag reserved for cluster activation + if lvm dumpconfig activation/volume_list | grep -e "\"@${OUR_TAG}\"" -e "\"${VG}\""; then +- ocf_log err "LVM: Improper setup detected" ++ ocf_log err "LVM: Improper setup detected" + ocf_exit_reason "The volume_list in lvm.conf must not contain the cluster tag, \"${OUR_TAG}\", or volume group, ${VG}" +- exit $OCF_ERR_CONFIGURED ++ exit $OCF_ERR_ARGS + fi + + return $OCF_SUCCESS +@@ -463,13 +462,13 @@ read_parameters() + if [ -z "$VG" ] + then + ocf_exit_reason "You must identify the volume group name!" +- exit $OCF_ERR_ARGS ++ exit $OCF_ERR_CONFIGURED + fi + + if [ "$LV_activation_mode" != "shared" ] && [ "$LV_activation_mode" != "exclusive" ] + then + ocf_exit_reason "Invalid value for activation_mode: $LV_activation_mode" +- exit $OCF_ERR_ARGS ++ exit $OCF_ERR_CONFIGURED + fi + + # Convert VG_access_mode from string to index +@@ -519,8 +518,10 @@ lvm_validate() { + exit $OCF_NOT_RUNNING + fi + ++ # Could be a transient error (e.g., iSCSI connection ++ # issue) so use OCF_ERR_GENERIC + ocf_exit_reason "Volume group[${VG}] doesn't exist, or not visible on this node!" +- exit $OCF_ERR_CONFIGURED ++ exit $OCF_ERR_GENERIC + fi + + # Inconsistency might be due to missing physical volumes, which doesn't +@@ -549,7 +550,7 @@ lvm_validate() { + mode=$? + if [ $VG_access_mode_num -ne 4 ] && [ $mode -ne $VG_access_mode_num ]; then + ocf_exit_reason "The specified vg_access_mode doesn't match the lock_type on VG metadata!" +- exit $OCF_ERR_ARGS ++ exit $OCF_ERR_CONFIGURED + fi + + # Nothing to do if the VG has no logical volume +@@ -561,11 +562,11 @@ lvm_validate() { + + # Check if the given $LV is in the $VG + if [ -n "$LV" ]; then +- OUT=$(lvs --foreign --noheadings ${VG}/${LV} 2>&1) ++ output=$(lvs --foreign --noheadings ${VG}/${LV} 2>&1) + if [ $? -ne 0 ]; then +- ocf_log err "lvs: ${OUT}" ++ ocf_log err "lvs: ${output}" + ocf_exit_reason "LV ($LV) is not in the given VG ($VG)." +- exit $OCF_ERR_ARGS ++ exit $OCF_ERR_CONFIGURED + fi + fi + +@@ -580,7 +581,6 @@ lvm_validate() { + 3) + systemid_check + ;; +- + 4) + tagging_check + ;; +@@ -808,10 +808,9 @@ lvm_status() { + dd if=${dm_name} of=/dev/null bs=1 count=1 >/dev/null \ + 2>&1 + if [ $? -ne 0 ]; then +- return $OCF_NOT_RUNNING +- else +- return $OCF_SUCCESS ++ return $OCF_ERR_GENERIC + fi ++ return $OCF_SUCCESS + ;; + *) + ocf_exit_reason "unsupported monitor level $OCF_CHECK_LEVEL" diff --git a/SOURCES/bz1977012-azure-events-az-new-ra.patch b/SOURCES/bz1977012-azure-events-az-new-ra.patch new file mode 100644 index 0000000..88c7781 --- /dev/null +++ b/SOURCES/bz1977012-azure-events-az-new-ra.patch @@ -0,0 +1,903 @@ +From 5dcd5153f0318e4766f7f4d3e61dfdb4b352c39c Mon Sep 17 00:00:00 2001 +From: MSSedusch +Date: Mon, 30 May 2022 15:08:10 +0200 +Subject: [PATCH 1/2] add new Azure Events AZ resource agent + +--- + .gitignore | 1 + + configure.ac | 8 + + doc/man/Makefile.am | 4 + + heartbeat/Makefile.am | 4 + + heartbeat/azure-events-az.in | 782 +++++++++++++++++++++++++++++++++++ + 5 files changed, 799 insertions(+) + create mode 100644 heartbeat/azure-events-az.in + +diff --git a/.gitignore b/.gitignore +index 0c259b5cf..e2b7c039c 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -54,6 +54,7 @@ heartbeat/Squid + heartbeat/SysInfo + heartbeat/aws-vpc-route53 + heartbeat/azure-events ++heartbeat/azure-events-az + heartbeat/clvm + heartbeat/conntrackd + heartbeat/dnsupdate +diff --git a/configure.ac b/configure.ac +index eeecfad0e..5716a2be2 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -523,6 +523,13 @@ if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then + fi + AM_CONDITIONAL(BUILD_AZURE_EVENTS, test $BUILD_AZURE_EVENTS -eq 1) + ++BUILD_AZURE_EVENTS_AZ=1 ++if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then ++ BUILD_AZURE_EVENTS_AZ=0 ++ AC_MSG_WARN("Not building azure-events-az") ++fi ++AM_CONDITIONAL(BUILD_AZURE_EVENTS_AZ, test $BUILD_AZURE_EVENTS_AZ -eq 1) ++ + BUILD_GCP_PD_MOVE=1 + if test -z "$PYTHON" || test "x${HAVE_PYMOD_GOOGLEAPICLIENT}" != xyes || test $BUILD_OCF_PY -eq 0; then + BUILD_GCP_PD_MOVE=0 +@@ -976,6 +983,7 @@ rgmanager/Makefile \ + + dnl Files we output that need to be executable + AC_CONFIG_FILES([heartbeat/azure-events], [chmod +x heartbeat/azure-events]) ++AC_CONFIG_FILES([heartbeat/azure-events-az], [chmod +x heartbeat/azure-events-az]) + AC_CONFIG_FILES([heartbeat/AoEtarget], [chmod +x heartbeat/AoEtarget]) + AC_CONFIG_FILES([heartbeat/ManageRAID], [chmod +x heartbeat/ManageRAID]) + AC_CONFIG_FILES([heartbeat/ManageVE], [chmod +x heartbeat/ManageVE]) +diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am +index cd8fd16bf..658c700ac 100644 +--- a/doc/man/Makefile.am ++++ b/doc/man/Makefile.am +@@ -219,6 +219,10 @@ if BUILD_AZURE_EVENTS + man_MANS += ocf_heartbeat_azure-events.7 + endif + ++if BUILD_AZURE_EVENTS_AZ ++man_MANS += ocf_heartbeat_azure-events-az.7 ++endif ++ + if BUILD_GCP_PD_MOVE + man_MANS += ocf_heartbeat_gcp-pd-move.7 + endif +diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am +index 20d41e36a..1133dc13e 100644 +--- a/heartbeat/Makefile.am ++++ b/heartbeat/Makefile.am +@@ -188,6 +188,10 @@ if BUILD_AZURE_EVENTS + ocf_SCRIPTS += azure-events + endif + ++if BUILD_AZURE_EVENTS_AZ ++ocf_SCRIPTS += azure-events-az ++endif ++ + if BUILD_GCP_PD_MOVE + ocf_SCRIPTS += gcp-pd-move + endif +diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in +new file mode 100644 +index 000000000..616fc8d9e +--- /dev/null ++++ b/heartbeat/azure-events-az.in +@@ -0,0 +1,782 @@ ++#!@PYTHON@ -tt ++# ++# Resource agent for monitoring Azure Scheduled Events ++# ++# License: GNU General Public License (GPL) ++# (c) 2018 Tobias Niekamp, Microsoft Corp. ++# and Linux-HA contributors ++ ++import os ++import sys ++import time ++import subprocess ++import json ++try: ++ import urllib2 ++ from urllib2 import URLError ++except ImportError: ++ import urllib.request as urllib2 ++ from urllib.error import URLError ++import socket ++from collections import defaultdict ++ ++OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT")) ++sys.path.append(OCF_FUNCTIONS_DIR) ++import ocf ++ ++############################################################################## ++ ++ ++VERSION = "0.10" ++USER_AGENT = "Pacemaker-ResourceAgent/%s %s" % (VERSION, ocf.distro()) ++ ++attr_globalPullState = "azure-events-az_globalPullState" ++attr_lastDocVersion = "azure-events-az_lastDocVersion" ++attr_curNodeState = "azure-events-az_curNodeState" ++attr_pendingEventIDs = "azure-events-az_pendingEventIDs" ++attr_healthstate = "#health-azure" ++ ++default_loglevel = ocf.logging.INFO ++default_relevantEventTypes = set(["Reboot", "Redeploy"]) ++ ++global_pullMaxAttempts = 3 ++global_pullDelaySecs = 1 ++ ++############################################################################## ++ ++class attrDict(defaultdict): ++ """ ++ A wrapper for accessing dict keys like an attribute ++ """ ++ def __init__(self, data): ++ super(attrDict, self).__init__(attrDict) ++ for d in data.keys(): ++ self.__setattr__(d, data[d]) ++ ++ def __getattr__(self, key): ++ try: ++ return self[key] ++ except KeyError: ++ raise AttributeError(key) ++ ++ def __setattr__(self, key, value): ++ self[key] = value ++ ++############################################################################## ++ ++class azHelper: ++ """ ++ Helper class for Azure's metadata API (including Scheduled Events) ++ """ ++ metadata_host = "http://169.254.169.254/metadata" ++ instance_api = "instance" ++ events_api = "scheduledevents" ++ api_version = "2019-08-01" ++ ++ @staticmethod ++ def _sendMetadataRequest(endpoint, postData=None): ++ """ ++ Send a request to Azure's Azure Metadata Service API ++ """ ++ url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, azHelper.api_version) ++ data = "" ++ ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s" % (endpoint, postData)) ++ ocf.logger.debug("_sendMetadataRequest: url = %s" % url) ++ ++ if postData and type(postData) != bytes: ++ postData = postData.encode() ++ ++ req = urllib2.Request(url, postData) ++ req.add_header("Metadata", "true") ++ req.add_header("User-Agent", USER_AGENT) ++ try: ++ resp = urllib2.urlopen(req) ++ except URLError as e: ++ if hasattr(e, 'reason'): ++ ocf.logger.warning("Failed to reach the server: %s" % e.reason) ++ clusterHelper.setAttr(attr_globalPullState, "IDLE") ++ elif hasattr(e, 'code'): ++ ocf.logger.warning("The server couldn\'t fulfill the request. Error code: %s" % e.code) ++ clusterHelper.setAttr(attr_globalPullState, "IDLE") ++ else: ++ data = resp.read() ++ ocf.logger.debug("_sendMetadataRequest: response = %s" % data) ++ ++ if data: ++ data = json.loads(data) ++ ++ ocf.logger.debug("_sendMetadataRequest: finished") ++ return data ++ ++ @staticmethod ++ def getInstanceInfo(): ++ """ ++ Fetch details about the current VM from Azure's Azure Metadata Service API ++ """ ++ ocf.logger.debug("getInstanceInfo: begin") ++ ++ jsondata = azHelper._sendMetadataRequest(azHelper.instance_api) ++ ocf.logger.debug("getInstanceInfo: json = %s" % jsondata) ++ ++ if jsondata: ++ ocf.logger.debug("getInstanceInfo: finished, returning {}".format(jsondata["compute"])) ++ return attrDict(jsondata["compute"]) ++ else: ++ ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info") ++ sys.exit(ocf.OCF_ERR_GENERIC) ++ ++ @staticmethod ++ def pullScheduledEvents(): ++ """ ++ Retrieve all currently scheduled events via Azure Metadata Service API ++ """ ++ ocf.logger.debug("pullScheduledEvents: begin") ++ ++ jsondata = azHelper._sendMetadataRequest(azHelper.events_api) ++ ocf.logger.debug("pullScheduledEvents: json = %s" % jsondata) ++ ++ ocf.logger.debug("pullScheduledEvents: finished") ++ return attrDict(jsondata) ++ ++ @staticmethod ++ def forceEvents(eventIDs): ++ """ ++ Force a set of events to start immediately ++ """ ++ ocf.logger.debug("forceEvents: begin") ++ ++ events = [] ++ for e in eventIDs: ++ events.append({ ++ "EventId": e, ++ }) ++ postData = { ++ "StartRequests" : events ++ } ++ ocf.logger.info("forceEvents: postData = %s" % postData) ++ resp = azHelper._sendMetadataRequest(azHelper.events_api, postData=json.dumps(postData)) ++ ++ ocf.logger.debug("forceEvents: finished") ++ return ++ ++############################################################################## ++ ++class clusterHelper: ++ """ ++ Helper functions for Pacemaker control via crm ++ """ ++ @staticmethod ++ def _getLocation(node): ++ """ ++ Helper function to retrieve local/global attributes ++ """ ++ if node: ++ return ["--node", node] ++ else: ++ return ["--type", "crm_config"] ++ ++ @staticmethod ++ def _exec(command, *args): ++ """ ++ Helper function to execute a UNIX command ++ """ ++ args = list(args) ++ ocf.logger.debug("_exec: begin; command = %s, args = %s" % (command, str(args))) ++ ++ def flatten(*n): ++ return (str(e) for a in n ++ for e in (flatten(*a) if isinstance(a, (tuple, list)) else (str(a),))) ++ command = list(flatten([command] + args)) ++ ocf.logger.debug("_exec: cmd = %s" % " ".join(command)) ++ try: ++ ret = subprocess.check_output(command) ++ if type(ret) != str: ++ ret = ret.decode() ++ ocf.logger.debug("_exec: return = %s" % ret) ++ return ret.rstrip() ++ except Exception as err: ++ ocf.logger.exception(err) ++ return None ++ ++ @staticmethod ++ def setAttr(key, value, node=None): ++ """ ++ Set the value of a specific global/local attribute in the Pacemaker cluster ++ """ ++ ocf.logger.debug("setAttr: begin; key = %s, value = %s, node = %s" % (key, value, node)) ++ ++ if value: ++ ret = clusterHelper._exec("crm_attribute", ++ "--name", key, ++ "--update", value, ++ clusterHelper._getLocation(node)) ++ else: ++ ret = clusterHelper._exec("crm_attribute", ++ "--name", key, ++ "--delete", ++ clusterHelper._getLocation(node)) ++ ++ ocf.logger.debug("setAttr: finished") ++ return len(ret) == 0 ++ ++ @staticmethod ++ def getAttr(key, node=None): ++ """ ++ Retrieve a global/local attribute from the Pacemaker cluster ++ """ ++ ocf.logger.debug("getAttr: begin; key = %s, node = %s" % (key, node)) ++ ++ val = clusterHelper._exec("crm_attribute", ++ "--name", key, ++ "--query", "--quiet", ++ "--default", "", ++ clusterHelper._getLocation(node)) ++ ocf.logger.debug("getAttr: finished") ++ if not val: ++ return None ++ return val if not val.isdigit() else int(val) ++ ++ @staticmethod ++ def getAllNodes(): ++ """ ++ Get a list of hostnames for all nodes in the Pacemaker cluster ++ """ ++ ocf.logger.debug("getAllNodes: begin") ++ ++ nodes = [] ++ nodeList = clusterHelper._exec("crm_node", "--list") ++ for n in nodeList.split("\n"): ++ nodes.append(n.split()[1]) ++ ocf.logger.debug("getAllNodes: finished; return %s" % str(nodes)) ++ ++ return nodes ++ ++ @staticmethod ++ def getHostNameFromAzName(azName): ++ """ ++ Helper function to get the actual host name from an Azure node name ++ """ ++ return clusterHelper.getAttr("hostName_%s" % azName) ++ ++ @staticmethod ++ def removeHoldFromNodes(): ++ """ ++ Remove the ON_HOLD state from all nodes in the Pacemaker cluster ++ """ ++ ocf.logger.debug("removeHoldFromNodes: begin") ++ ++ for n in clusterHelper.getAllNodes(): ++ if clusterHelper.getAttr(attr_curNodeState, node=n) == "ON_HOLD": ++ clusterHelper.setAttr(attr_curNodeState, "AVAILABLE", node=n) ++ ocf.logger.info("removeHoldFromNodes: removed ON_HOLD from node %s" % n) ++ ++ ocf.logger.debug("removeHoldFromNodes: finished") ++ return False ++ ++ @staticmethod ++ def otherNodesAvailable(exceptNode): ++ """ ++ Check if there are any nodes (except a given node) in the Pacemaker cluster that have state AVAILABLE ++ """ ++ ocf.logger.debug("otherNodesAvailable: begin; exceptNode = %s" % exceptNode) ++ ++ for n in clusterHelper.getAllNodes(): ++ state = clusterHelper.getAttr(attr_curNodeState, node=n) ++ state = stringToNodeState(state) if state else AVAILABLE ++ if state == AVAILABLE and n != exceptNode.hostName: ++ ocf.logger.info("otherNodesAvailable: at least %s is available" % n) ++ ocf.logger.debug("otherNodesAvailable: finished") ++ return True ++ ocf.logger.info("otherNodesAvailable: no other nodes are available") ++ ocf.logger.debug("otherNodesAvailable: finished") ++ ++ return False ++ ++ @staticmethod ++ def transitionSummary(): ++ """ ++ Get the current Pacemaker transition summary (used to check if all resources are stopped when putting a node standby) ++ """ ++ # Is a global crm_simulate "too much"? Or would it be sufficient it there are no planned transitions for a particular node? ++ # # crm_simulate -Ls ++ # Transition Summary: ++ # * Promote rsc_SAPHana_HN1_HDB03:0 (Slave -> Master hsr3-db1) ++ # * Stop rsc_SAPHana_HN1_HDB03:1 (hsr3-db0) ++ # * Move rsc_ip_HN1_HDB03 (Started hsr3-db0 -> hsr3-db1) ++ # * Start rsc_nc_HN1_HDB03 (hsr3-db1) ++ # # Excepted result when there are no pending actions: ++ # Transition Summary: ++ ocf.logger.debug("transitionSummary: begin") ++ ++ summary = clusterHelper._exec("crm_simulate", "-Ls") ++ if not summary: ++ ocf.logger.warning("transitionSummary: could not load transition summary") ++ return False ++ if summary.find("Transition Summary:") < 0: ++ ocf.logger.warning("transitionSummary: received unexpected transition summary: %s" % summary) ++ return False ++ summary = summary.split("Transition Summary:")[1] ++ ret = summary.split("\n").pop(0) ++ ++ ocf.logger.debug("transitionSummary: finished; return = %s" % str(ret)) ++ return ret ++ ++ @staticmethod ++ def listOperationsOnNode(node): ++ """ ++ Get a list of all current operations for a given node (used to check if any resources are pending) ++ """ ++ # hsr3-db1:/home/tniek # crm_resource --list-operations -N hsr3-db0 ++ # rsc_azure-events-az (ocf::heartbeat:azure-events-az): Started: rsc_azure-events-az_start_0 (node=hsr3-db0, call=91, rc=0, last-rc-change=Fri Jun 8 22:37:46 2018, exec=115ms): complete ++ # rsc_azure-events-az (ocf::heartbeat:azure-events-az): Started: rsc_azure-events-az_monitor_10000 (node=hsr3-db0, call=93, rc=0, last-rc-change=Fri Jun 8 22:37:47 2018, exec=197ms): complete ++ # rsc_SAPHana_HN1_HDB03 (ocf::suse:SAPHana): Master: rsc_SAPHana_HN1_HDB03_start_0 (node=hsr3-db0, call=-1, rc=193, last-rc-change=Fri Jun 8 22:37:46 2018, exec=0ms): pending ++ # rsc_SAPHanaTopology_HN1_HDB03 (ocf::suse:SAPHanaTopology): Started: rsc_SAPHanaTopology_HN1_HDB03_start_0 (node=hsr3-db0, call=90, rc=0, last-rc-change=Fri Jun 8 22:37:46 2018, exec=3214ms): complete ++ ocf.logger.debug("listOperationsOnNode: begin; node = %s" % node) ++ ++ resources = clusterHelper._exec("crm_resource", "--list-operations", "-N", node) ++ if len(resources) == 0: ++ ret = [] ++ else: ++ ret = resources.split("\n") ++ ++ ocf.logger.debug("listOperationsOnNode: finished; return = %s" % str(ret)) ++ return ret ++ ++ @staticmethod ++ def noPendingResourcesOnNode(node): ++ """ ++ Check that there are no pending resources on a given node ++ """ ++ ocf.logger.debug("noPendingResourcesOnNode: begin; node = %s" % node) ++ ++ for r in clusterHelper.listOperationsOnNode(node): ++ ocf.logger.debug("noPendingResourcesOnNode: * %s" % r) ++ resource = r.split()[-1] ++ if resource == "pending": ++ ocf.logger.info("noPendingResourcesOnNode: found resource %s that is still pending" % resource) ++ ocf.logger.debug("noPendingResourcesOnNode: finished; return = False") ++ return False ++ ocf.logger.info("noPendingResourcesOnNode: no pending resources on node %s" % node) ++ ocf.logger.debug("noPendingResourcesOnNode: finished; return = True") ++ ++ return True ++ ++ @staticmethod ++ def allResourcesStoppedOnNode(node): ++ """ ++ Check that all resources on a given node are stopped ++ """ ++ ocf.logger.debug("allResourcesStoppedOnNode: begin; node = %s" % node) ++ ++ if clusterHelper.noPendingResourcesOnNode(node): ++ if len(clusterHelper.transitionSummary()) == 0: ++ ocf.logger.info("allResourcesStoppedOnNode: no pending resources on node %s and empty transition summary" % node) ++ ocf.logger.debug("allResourcesStoppedOnNode: finished; return = True") ++ return True ++ ocf.logger.info("allResourcesStoppedOnNode: transition summary is not empty") ++ ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False") ++ return False ++ ++ ocf.logger.info("allResourcesStoppedOnNode: still pending resources on node %s" % node) ++ ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False") ++ return False ++ ++############################################################################## ++ ++AVAILABLE = 0 # Node is online and ready to handle events ++STOPPING = 1 # Standby has been triggered, but some resources are still running ++IN_EVENT = 2 # All resources are stopped, and event has been initiated via Azure Metadata Service ++ON_HOLD = 3 # Node has a pending event that cannot be started there are no other nodes available ++ ++def stringToNodeState(name): ++ if type(name) == int: return name ++ if name == "STOPPING": return STOPPING ++ if name == "IN_EVENT": return IN_EVENT ++ if name == "ON_HOLD": return ON_HOLD ++ return AVAILABLE ++ ++def nodeStateToString(state): ++ if state == STOPPING: return "STOPPING" ++ if state == IN_EVENT: return "IN_EVENT" ++ if state == ON_HOLD: return "ON_HOLD" ++ return "AVAILABLE" ++ ++############################################################################## ++ ++class Node: ++ """ ++ Core class implementing logic for a cluster node ++ """ ++ def __init__(self, ra): ++ self.raOwner = ra ++ self.azInfo = azHelper.getInstanceInfo() ++ self.azName = self.azInfo.name ++ self.hostName = socket.gethostname() ++ self.setAttr("azName", self.azName) ++ clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName) ++ ++ def getAttr(self, key): ++ """ ++ Get a local attribute ++ """ ++ return clusterHelper.getAttr(key, node=self.hostName) ++ ++ def setAttr(self, key, value): ++ """ ++ Set a local attribute ++ """ ++ return clusterHelper.setAttr(key, value, node=self.hostName) ++ ++ def selfOrOtherNode(self, node): ++ """ ++ Helper function to distinguish self/other node ++ """ ++ return node if node else self.hostName ++ ++ def setState(self, state, node=None): ++ """ ++ Set the state for a given node (or self) ++ """ ++ node = self.selfOrOtherNode(node) ++ ocf.logger.debug("setState: begin; node = %s, state = %s" % (node, nodeStateToString(state))) ++ ++ clusterHelper.setAttr(attr_curNodeState, nodeStateToString(state), node=node) ++ ++ ocf.logger.debug("setState: finished") ++ ++ def getState(self, node=None): ++ """ ++ Get the state for a given node (or self) ++ """ ++ node = self.selfOrOtherNode(node) ++ ocf.logger.debug("getState: begin; node = %s" % node) ++ ++ state = clusterHelper.getAttr(attr_curNodeState, node=node) ++ ocf.logger.debug("getState: state = %s" % state) ++ ocf.logger.debug("getState: finished") ++ if not state: ++ return AVAILABLE ++ return stringToNodeState(state) ++ ++ def setEventIDs(self, eventIDs, node=None): ++ """ ++ Set pending EventIDs for a given node (or self) ++ """ ++ node = self.selfOrOtherNode(node) ++ ocf.logger.debug("setEventIDs: begin; node = %s, eventIDs = %s" % (node, str(eventIDs))) ++ ++ if eventIDs: ++ eventIDStr = ",".join(eventIDs) ++ else: ++ eventIDStr = None ++ clusterHelper.setAttr(attr_pendingEventIDs, eventIDStr, node=node) ++ ++ ocf.logger.debug("setEventIDs: finished") ++ return ++ ++ def getEventIDs(self, node=None): ++ """ ++ Get pending EventIDs for a given node (or self) ++ """ ++ node = self.selfOrOtherNode(node) ++ ocf.logger.debug("getEventIDs: begin; node = %s" % node) ++ ++ eventIDStr = clusterHelper.getAttr(attr_pendingEventIDs, node=node) ++ if eventIDStr: ++ eventIDs = eventIDStr.split(",") ++ else: ++ eventIDs = None ++ ++ ocf.logger.debug("getEventIDs: finished; eventIDs = %s" % str(eventIDs)) ++ return eventIDs ++ ++ def updateNodeStateAndEvents(self, state, eventIDs, node=None): ++ """ ++ Set the state and pending EventIDs for a given node (or self) ++ """ ++ ocf.logger.debug("updateNodeStateAndEvents: begin; node = %s, state = %s, eventIDs = %s" % (node, nodeStateToString(state), str(eventIDs))) ++ ++ self.setState(state, node=node) ++ self.setEventIDs(eventIDs, node=node) ++ ++ ocf.logger.debug("updateNodeStateAndEvents: finished") ++ return state ++ ++ def putNodeStandby(self, node=None): ++ """ ++ Put self to standby ++ """ ++ node = self.selfOrOtherNode(node) ++ ocf.logger.debug("putNodeStandby: begin; node = %s" % node) ++ ++ clusterHelper._exec("crm_attribute", ++ "--node", node, ++ "--name", attr_healthstate, ++ "--update", "-1000000", ++ "--lifetime=forever") ++ ++ ocf.logger.debug("putNodeStandby: finished") ++ ++ def isNodeInStandby(self, node=None): ++ """ ++ check if node is in standby ++ """ ++ node = self.selfOrOtherNode(node) ++ ocf.logger.debug("isNodeInStandby: begin; node = %s" % node) ++ isInStandy = False ++ ++ healthAttributeStr = clusterHelper.getAttr(attr_healthstate, node) ++ if healthAttributeStr is not None: ++ try: ++ healthAttribute = int(healthAttributeStr) ++ isInStandy = healthAttribute < 0 ++ except ValueError: ++ # Handle the exception ++ ocf.logger.warn("Health attribute %s on node %s cannot be converted to an integer value" % (healthAttributeStr, node)) ++ ++ ocf.logger.debug("isNodeInStandby: finished - result %s" % isInStandy) ++ return isInStandy ++ ++ def putNodeOnline(self, node=None): ++ """ ++ Put self back online ++ """ ++ node = self.selfOrOtherNode(node) ++ ocf.logger.debug("putNodeOnline: begin; node = %s" % node) ++ ++ clusterHelper._exec("crm_attribute", ++ "--node", node, ++ "--name", "#health-azure", ++ "--update", "0", ++ "--lifetime=forever") ++ ++ ocf.logger.debug("putNodeOnline: finished") ++ ++ def separateEvents(self, events): ++ """ ++ Split own/other nodes' events ++ """ ++ ocf.logger.debug("separateEvents: begin; events = %s" % str(events)) ++ ++ localEvents = [] ++ remoteEvents = [] ++ for e in events: ++ e = attrDict(e) ++ if e.EventType not in self.raOwner.relevantEventTypes: ++ continue ++ if self.azName in e.Resources: ++ localEvents.append(e) ++ else: ++ remoteEvents.append(e) ++ ocf.logger.debug("separateEvents: finished; localEvents = %s, remoteEvents = %s" % (str(localEvents), str(remoteEvents))) ++ return (localEvents, remoteEvents) ++ ++############################################################################## ++ ++class raAzEvents: ++ """ ++ Main class for resource agent ++ """ ++ def __init__(self, relevantEventTypes): ++ self.node = Node(self) ++ self.relevantEventTypes = relevantEventTypes ++ ++ def monitor(self): ++ ocf.logger.debug("monitor: begin") ++ ++ events = azHelper.pullScheduledEvents() ++ ++ # get current document version ++ curDocVersion = events.DocumentIncarnation ++ lastDocVersion = self.node.getAttr(attr_lastDocVersion) ++ ocf.logger.debug("monitor: lastDocVersion = %s; curDocVersion = %s" % (lastDocVersion, curDocVersion)) ++ ++ # split events local/remote ++ (localEvents, remoteEvents) = self.node.separateEvents(events.Events) ++ ++ # ensure local events are only executing once ++ if curDocVersion == lastDocVersion: ++ ocf.logger.info("monitor: already handled curDocVersion, skip") ++ return ocf.OCF_SUCCESS ++ ++ localAzEventIDs = set() ++ for e in localEvents: ++ localAzEventIDs.add(e.EventId) ++ ++ curState = self.node.getState() ++ clusterEventIDs = self.node.getEventIDs() ++ ++ ocf.logger.debug("monitor: curDocVersion has not been handled yet") ++ ++ if clusterEventIDs: ++ # there are pending events set, so our state must be STOPPING or IN_EVENT ++ i = 0; touchedEventIDs = False ++ while i < len(clusterEventIDs): ++ # clean up pending events that are already finished according to AZ ++ if clusterEventIDs[i] not in localAzEventIDs: ++ ocf.logger.info("monitor: remove finished local clusterEvent %s" % (clusterEventIDs[i])) ++ clusterEventIDs.pop(i) ++ touchedEventIDs = True ++ else: ++ i += 1 ++ if len(clusterEventIDs) > 0: ++ # there are still pending events (either because we're still stopping, or because the event is still in place) ++ # either way, we need to wait ++ if touchedEventIDs: ++ ocf.logger.info("monitor: added new local clusterEvent %s" % str(clusterEventIDs)) ++ self.node.setEventIDs(clusterEventIDs) ++ else: ++ ocf.logger.info("monitor: no local clusterEvents were updated") ++ else: ++ # there are no more pending events left after cleanup ++ if clusterHelper.noPendingResourcesOnNode(self.node.hostName): ++ # and no pending resources on the node -> set it back online ++ ocf.logger.info("monitor: all local events finished -> clean up, put node online and AVAILABLE") ++ curState = self.node.updateNodeStateAndEvents(AVAILABLE, None) ++ self.node.putNodeOnline() ++ clusterHelper.removeHoldFromNodes() ++ # If Azure Scheduled Events are not used for 24 hours (e.g. because the cluster was asleep), it will be disabled for a VM. ++ # When the cluster wakes up and starts using it again, the DocumentIncarnation is reset. ++ # We need to remove it during cleanup, otherwise azure-events-az will not process the event after wakeup ++ self.node.setAttr(attr_lastDocVersion, None) ++ else: ++ ocf.logger.info("monitor: all local events finished, but some resources have not completed startup yet -> wait") ++ else: ++ if curState == AVAILABLE: ++ if len(localAzEventIDs) > 0: ++ if clusterHelper.otherNodesAvailable(self.node): ++ ocf.logger.info("monitor: can handle local events %s -> set state STOPPING" % (str(localAzEventIDs))) ++ curState = self.node.updateNodeStateAndEvents(STOPPING, localAzEventIDs) ++ else: ++ ocf.logger.info("monitor: cannot handle azEvents %s (only node available) -> set state ON_HOLD" % str(localAzEventIDs)) ++ self.node.setState(ON_HOLD) ++ else: ++ ocf.logger.debug("monitor: no local azEvents to handle") ++ ++ if curState == STOPPING: ++ eventIDsForNode = {} ++ if clusterHelper.noPendingResourcesOnNode(self.node.hostName): ++ if not self.node.isNodeInStandby(): ++ ocf.logger.info("monitor: all local resources are started properly -> put node standby and exit") ++ self.node.putNodeStandby() ++ return ocf.OCF_SUCCESS ++ ++ for e in localEvents: ++ ocf.logger.info("monitor: handling remote event %s (%s; nodes = %s)" % (e.EventId, e.EventType, str(e.Resources))) ++ # before we can force an event to start, we need to ensure all nodes involved have stopped their resources ++ if e.EventStatus == "Scheduled": ++ allNodesStopped = True ++ for azName in e.Resources: ++ hostName = clusterHelper.getHostNameFromAzName(azName) ++ state = self.node.getState(node=hostName) ++ if state == STOPPING: ++ # the only way we can continue is when node state is STOPPING, but all resources have been stopped ++ if not clusterHelper.allResourcesStoppedOnNode(hostName): ++ ocf.logger.info("monitor: (at least) node %s has still resources running -> wait" % hostName) ++ allNodesStopped = False ++ break ++ elif state in (AVAILABLE, IN_EVENT, ON_HOLD): ++ ocf.logger.info("monitor: node %s is still %s -> remote event needs to be picked up locally" % (hostName, nodeStateToString(state))) ++ allNodesStopped = False ++ break ++ if allNodesStopped: ++ ocf.logger.info("monitor: nodes %s are stopped -> add remote event %s to force list" % (str(e.Resources), e.EventId)) ++ for n in e.Resources: ++ hostName = clusterHelper.getHostNameFromAzName(n) ++ if hostName in eventIDsForNode: ++ eventIDsForNode[hostName].append(e.EventId) ++ else: ++ eventIDsForNode[hostName] = [e.EventId] ++ elif e.EventStatus == "Started": ++ ocf.logger.info("monitor: remote event already started") ++ ++ # force the start of all events whose nodes are ready (i.e. have no more resources running) ++ if len(eventIDsForNode.keys()) > 0: ++ eventIDsToForce = set([item for sublist in eventIDsForNode.values() for item in sublist]) ++ ocf.logger.info("monitor: set nodes %s to IN_EVENT; force remote events %s" % (str(eventIDsForNode.keys()), str(eventIDsToForce))) ++ for node, eventId in eventIDsForNode.items(): ++ self.node.updateNodeStateAndEvents(IN_EVENT, eventId, node=node) ++ azHelper.forceEvents(eventIDsToForce) ++ self.node.setAttr(attr_lastDocVersion, curDocVersion) ++ else: ++ ocf.logger.info("monitor: some local resources are not clean yet -> wait") ++ ++ ocf.logger.debug("monitor: finished") ++ return ocf.OCF_SUCCESS ++ ++############################################################################## ++ ++def setLoglevel(verbose): ++ # set up writing into syslog ++ loglevel = default_loglevel ++ if verbose: ++ opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1)) ++ urllib2.install_opener(opener) ++ loglevel = ocf.logging.DEBUG ++ ocf.log.setLevel(loglevel) ++ ++description = ( ++ "Microsoft Azure Scheduled Events monitoring agent", ++ """This resource agent implements a monitor for scheduled ++(maintenance) events for a Microsoft Azure VM. ++ ++If any relevant events are found, it moves all Pacemaker resources ++away from the affected node to allow for a graceful shutdown. ++ ++ Usage: ++ [OCF_RESKEY_eventTypes=VAL] [OCF_RESKEY_verbose=VAL] azure-events-az ACTION ++ ++ action (required): Supported values: monitor, help, meta-data ++ eventTypes (optional): List of event types to be considered ++ relevant by the resource agent (comma-separated). ++ Supported values: Freeze,Reboot,Redeploy ++ Default = Reboot,Redeploy ++/ verbose (optional): If set to true, displays debug info. ++ Default = false ++ ++ Deployment: ++ crm configure primitive rsc_azure-events-az ocf:heartbeat:azure-events-az \ ++ op monitor interval=10s ++ crm configure clone cln_azure-events-az rsc_azure-events-az ++ ++For further information on Microsoft Azure Scheduled Events, please ++refer to the following documentation: ++https://docs.microsoft.com/en-us/azure/virtual-machines/linux/scheduled-events ++""") ++ ++def monitor_action(eventTypes): ++ relevantEventTypes = set(eventTypes.split(",") if eventTypes else []) ++ ra = raAzEvents(relevantEventTypes) ++ return ra.monitor() ++ ++def validate_action(eventTypes): ++ if eventTypes: ++ for event in eventTypes.split(","): ++ if event not in ("Freeze", "Reboot", "Redeploy"): ++ ocf.ocf_exit_reason("Event type not one of Freeze, Reboot, Redeploy: " + eventTypes) ++ return ocf.OCF_ERR_CONFIGURED ++ return ocf.OCF_SUCCESS ++ ++def main(): ++ agent = ocf.Agent("azure-events-az", shortdesc=description[0], longdesc=description[1]) ++ agent.add_parameter( ++ "eventTypes", ++ shortdesc="List of resources to be considered", ++ longdesc="A comma-separated list of event types that will be handled by this resource agent. (Possible values: Freeze,Reboot,Redeploy)", ++ content_type="string", ++ default="Reboot,Redeploy") ++ agent.add_parameter( ++ "verbose", ++ shortdesc="Enable verbose agent logging", ++ longdesc="Set to true to enable verbose logging", ++ content_type="boolean", ++ default="false") ++ agent.add_action("start", timeout=10, handler=lambda: ocf.OCF_SUCCESS) ++ agent.add_action("stop", timeout=10, handler=lambda: ocf.OCF_SUCCESS) ++ agent.add_action("validate-all", timeout=20, handler=validate_action) ++ agent.add_action("monitor", timeout=240, interval=10, handler=monitor_action) ++ setLoglevel(ocf.is_true(ocf.get_parameter("verbose", "false"))) ++ agent.run() ++ ++if __name__ == '__main__': ++ main() +\ No newline at end of file + +From a95337d882c7cc69d604b050159ad50b679f18be Mon Sep 17 00:00:00 2001 +From: MSSedusch +Date: Thu, 2 Jun 2022 14:10:33 +0200 +Subject: [PATCH 2/2] Remove developer documentation + +--- + heartbeat/azure-events-az.in | 11 ----------- + 1 file changed, 11 deletions(-) + +diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in +index 616fc8d9e..59d095306 100644 +--- a/heartbeat/azure-events-az.in ++++ b/heartbeat/azure-events-az.in +@@ -723,17 +723,6 @@ description = ( + If any relevant events are found, it moves all Pacemaker resources + away from the affected node to allow for a graceful shutdown. + +- Usage: +- [OCF_RESKEY_eventTypes=VAL] [OCF_RESKEY_verbose=VAL] azure-events-az ACTION +- +- action (required): Supported values: monitor, help, meta-data +- eventTypes (optional): List of event types to be considered +- relevant by the resource agent (comma-separated). +- Supported values: Freeze,Reboot,Redeploy +- Default = Reboot,Redeploy +-/ verbose (optional): If set to true, displays debug info. +- Default = false +- + Deployment: + crm configure primitive rsc_azure-events-az ocf:heartbeat:azure-events-az \ + op monitor interval=10s diff --git a/SOURCES/bz2049319-Filesystem-add-support-for-Amazon-EFS.patch b/SOURCES/bz2049319-Filesystem-add-support-for-Amazon-EFS.patch new file mode 100644 index 0000000..05e7bf1 --- /dev/null +++ b/SOURCES/bz2049319-Filesystem-add-support-for-Amazon-EFS.patch @@ -0,0 +1,175 @@ +From cab190c737fdf58268aa5c009f6089b754862b22 Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Tue, 1 Feb 2022 16:32:50 -0800 +Subject: [PATCH 1/3] Filesystem: Fix OpenBSD check in fstype_supported() + +fstype_supported() is supposed to skip the /proc/filesystems check if +the OS is OpenBSD. Instead, it skips the check if the OS is **not** +OpenBSD. That means the function has been a no-op for all other distros. + +Signed-off-by: Reid Wahl +--- + heartbeat/Filesystem | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 010c1dcfc..8b4792152 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -440,7 +440,7 @@ fstype_supported() + local support="$FSTYPE" + local rc + +- if [ "X${HOSTOS}" != "XOpenBSD" ];then ++ if [ "X${HOSTOS}" = "XOpenBSD" ];then + # skip checking /proc/filesystems for obsd + return $OCF_SUCCESS + fi + +From 5d38b87daa9cfffa89a193df131d6ebd87cd05aa Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Tue, 1 Feb 2022 18:26:32 -0800 +Subject: [PATCH 2/3] Filesystem: Improve fstype_supported logs for fuse + +Make it more clear when we have to use a different name to check for +support of a particular filesystem. Currently only used for fuse-type +filesystems. + +Signed-off-by: Reid Wahl +--- + heartbeat/Filesystem | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 8b4792152..4d84846c1 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -455,6 +455,10 @@ fstype_supported() + fuse.*|glusterfs|rozofs) support="fuse";; + esac + ++ if [ "$support" != "$FSTYPE" ]; then ++ ocf_log info "Checking support for $FSTYPE as \"$support\"" ++ fi ++ + grep -w "$support"'$' /proc/filesystems >/dev/null + if [ $? -eq 0 ]; then + # found the fs type +@@ -465,7 +469,7 @@ fstype_supported() + # check the if the filesystem support exists again. + $MODPROBE $support >/dev/null + if [ $? -ne 0 ]; then +- ocf_exit_reason "Couldn't find filesystem $FSTYPE in /proc/filesystems and failed to load kernel module" ++ ocf_exit_reason "Couldn't find filesystem $support in /proc/filesystems and failed to load kernel module" + return $OCF_ERR_INSTALLED + fi + +@@ -478,11 +482,11 @@ fstype_supported() + # yes. found the filesystem after doing the modprobe + return $OCF_SUCCESS + fi +- ocf_log debug "Unable to find support for $FSTYPE in /proc/filesystems after modprobe, trying again" ++ ocf_log debug "Unable to find support for $support in /proc/filesystems after modprobe, trying again" + sleep 1 + done + +- ocf_exit_reason "Couldn't find filesystem $FSTYPE in /proc/filesystems" ++ ocf_exit_reason "Couldn't find filesystem $support in /proc/filesystems" + return $OCF_ERR_INSTALLED + } + +@@ -837,6 +841,9 @@ Filesystem_monitor() + # VALIDATE_ALL: Are the instance parameters valid? + # FIXME!! The only part that's useful is the return code. + # This code always returns $OCF_SUCCESS (!) ++# FIXME!! Needs some tuning to match fstype_supported() (e.g., for ++# fuse). Can we just call fstype_supported() with a flag like ++# "no_modprobe" instead? + # + Filesystem_validate_all() + { + +From e2174244067b02d798e0f12437f0f499c80f91fe Mon Sep 17 00:00:00 2001 +From: Reid Wahl +Date: Tue, 1 Feb 2022 18:55:47 -0800 +Subject: [PATCH 3/3] Filesystem: Add support for Amazon EFS mount helper + +mount.efs, the mount helper for Amazon Elastic File System (EFS) +provided by amazon-efs-utils [1], is a wrapper for mount.nfs4. It offers +a number of AWS-specific mount options and some security improvements +like encryption of data in transit. + +This commit adds support by treating an fstype=efs like fstype=nfs4 for +the most part. + +Resolves: RHBZ#2049319 + +[1] https://docs.aws.amazon.com/efs/latest/ug/efs-mount-helper.html + +Signed-off-by: Reid Wahl +--- + heartbeat/Filesystem | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 4d84846c1..1a90d6a42 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -341,7 +341,7 @@ determine_blockdevice() { + # Get the current real device name, if possible. + # (specified devname could be -L or -U...) + case "$FSTYPE" in +- nfs4|nfs|smbfs|cifs|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|none|lustre) ++ nfs4|nfs|efs|smbfs|cifs|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|none|lustre) + : ;; + *) + match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}" +@@ -423,7 +423,7 @@ is_fsck_needed() { + no) false;; + ""|auto) + case "$FSTYPE" in +- ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs) ++ ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|efs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs) + false;; + *) + true;; +@@ -450,9 +450,11 @@ fstype_supported() + return $OCF_SUCCESS + fi + +- # support fuse-filesystems (e.g. GlusterFS) ++ # support fuse-filesystems (e.g. GlusterFS) and Amazon Elastic File ++ # System (EFS) + case "$FSTYPE" in + fuse.*|glusterfs|rozofs) support="fuse";; ++ efs) support="nfs4";; + esac + + if [ "$support" != "$FSTYPE" ]; then +@@ -701,7 +703,7 @@ Filesystem_stop() + + # For networked filesystems, there's merit in trying -f: + case "$FSTYPE" in +- nfs4|nfs|cifs|smbfs) umount_force="-f" ;; ++ nfs4|nfs|efs|cifs|smbfs) umount_force="-f" ;; + esac + + # Umount all sub-filesystems mounted under $MOUNTPOINT/ too. +@@ -892,7 +894,7 @@ set_blockdevice_var() { + + # these are definitely not block devices + case "$FSTYPE" in +- nfs4|nfs|smbfs|cifs|none|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|lustre) return;; ++ nfs4|nfs|efs|smbfs|cifs|none|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|lustre) return;; + esac + + if $(is_option "loop"); then +@@ -1013,7 +1015,7 @@ is_option "ro" && + CLUSTERSAFE=2 + + case "$FSTYPE" in +-nfs4|nfs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs|cvfs|lustre) ++nfs4|nfs|efs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs|cvfs|lustre) + CLUSTERSAFE=1 # this is kind of safe too + ;; + # add here CLUSTERSAFE=0 for all filesystems which are not diff --git a/SOURCES/bz2109159-storage_mon-1-exit-after-help.patch b/SOURCES/bz2109159-storage_mon-1-exit-after-help.patch new file mode 100644 index 0000000..a8fa868 --- /dev/null +++ b/SOURCES/bz2109159-storage_mon-1-exit-after-help.patch @@ -0,0 +1,79 @@ +From b3eadb8523b599af800a7c772606aa0e90cf142f Mon Sep 17 00:00:00 2001 +From: Fujii Masao +Date: Tue, 19 Jul 2022 17:03:02 +0900 +Subject: [PATCH 1/2] Make storage_mon -h exit just after printing help + messages. + +Previously, when -h or an invalid option was specified, storage_mon +printed the help messages, proceeded processing and then could +throw an error. This was not the behavior that, e.g., users who want +to specify -h option to see the help messages are expecting. To fix +this issue, this commit changes storage_mon so that it exits just +after printing the help messages when -h or an invalid option is +specified. +--- + tools/storage_mon.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index 7b65bb419..1303371f7 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -28,7 +28,7 @@ static void usage(char *name, FILE *f) + fprintf(f, " --timeout max time to wait for a device test to come back. in seconds (default %d)\n", DEFAULT_TIMEOUT); + fprintf(f, " --inject-errors-percent Generate EIO errors %% of the time (for testing only)\n"); + fprintf(f, " --verbose emit extra output to stdout\n"); +- fprintf(f, " --help print this messages\n"); ++ fprintf(f, " --help print this messages, then exit\n"); + } + + /* Check one device */ +@@ -178,9 +178,11 @@ int main(int argc, char *argv[]) + break; + case 'h': + usage(argv[0], stdout); ++ exit(0); + break; + default: + usage(argv[0], stderr); ++ exit(-1); + break; + } + + +From e62795f02d25a772a239e0a4f9eb9d6470c134ee Mon Sep 17 00:00:00 2001 +From: Fujii Masao +Date: Tue, 19 Jul 2022 17:56:32 +0900 +Subject: [PATCH 2/2] Fix typo in help message. + +--- + tools/storage_mon.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index 1303371f7..3c82d5ee8 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -28,7 +28,7 @@ static void usage(char *name, FILE *f) + fprintf(f, " --timeout max time to wait for a device test to come back. in seconds (default %d)\n", DEFAULT_TIMEOUT); + fprintf(f, " --inject-errors-percent Generate EIO errors %% of the time (for testing only)\n"); + fprintf(f, " --verbose emit extra output to stdout\n"); +- fprintf(f, " --help print this messages, then exit\n"); ++ fprintf(f, " --help print this message\n"); + } + + /* Check one device */ +@@ -178,11 +178,11 @@ int main(int argc, char *argv[]) + break; + case 'h': + usage(argv[0], stdout); +- exit(0); ++ return 0; + break; + default: + usage(argv[0], stderr); +- exit(-1); ++ return -1; + break; + } + diff --git a/SOURCES/bz2109159-storage_mon-2-fix-specified-scores-count.patch b/SOURCES/bz2109159-storage_mon-2-fix-specified-scores-count.patch new file mode 100644 index 0000000..8bbe33e --- /dev/null +++ b/SOURCES/bz2109159-storage_mon-2-fix-specified-scores-count.patch @@ -0,0 +1,36 @@ +From a68957e8f1e8169438acf5a4321f47ed7d8ceec1 Mon Sep 17 00:00:00 2001 +From: Fujii Masao +Date: Tue, 19 Jul 2022 20:28:38 +0900 +Subject: [PATCH] storage_mon: Fix bug in checking of number of specified + scores. + +Previously specifying the maximum allowed number (MAX_DEVICES, currently 25) +of devices and scores as arguments could cause storage_mon to fail unexpectedly +with the error message "too many scores, max is 25". This issue happened +because storage_mon checked whether the number of specified scores +exceeded the upper limit by using the local variable "device_count" indicating +the number of specified devices (not scores). So after the maximum number +of devices arguments were interpreted, the appearance of next score argument +caused the error even when the number of interpreted scores arguments had +not exceeded the maximum. + +This patch fixes storage_mon so that it uses the local variable "score_count" +indicating the number of specified scores, to check whether arguments for +scores are specified more than the upper limit. +--- + tools/storage_mon.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index 3c82d5ee8..c749076c2 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -154,7 +154,7 @@ int main(int argc, char *argv[]) + } + break; + case 's': +- if (device_count < MAX_DEVICES) { ++ if (score_count < MAX_DEVICES) { + int score = atoi(optarg); + if (score < 1 || score > 10) { + fprintf(stderr, "Score must be between 1 and 10 inclusive\n"); diff --git a/SOURCES/bz2109159-storage_mon-3-fix-child-process-exit.patch b/SOURCES/bz2109159-storage_mon-3-fix-child-process-exit.patch new file mode 100644 index 0000000..d02d584 --- /dev/null +++ b/SOURCES/bz2109159-storage_mon-3-fix-child-process-exit.patch @@ -0,0 +1,43 @@ +From c6ea93fcb499c84c3d8e9aad2ced65065a3f6d51 Mon Sep 17 00:00:00 2001 +From: Fujii Masao +Date: Tue, 19 Jul 2022 22:34:08 +0900 +Subject: [PATCH] Fix bug in handling of child process exit. + +When storage_mon detects that a child process exits with zero, +it resets the test_forks[] entry for the child process to 0, to avoid +waitpid() for the process again in the loop. But, previously, +storage_mon didn't do that when it detected that a child process +exited with non-zero. Which caused waitpid() to be called again +for the process already gone and to report an error like +"waitpid on XXX failed: No child processes" unexpectedly. +In this case, basically storage_mon should wait until all the child +processes exit and return the final score, instead. + +This patch fixes this issue by making storage_mon reset test_works[] +entry even when a child process exits with non-zero. +--- + tools/storage_mon.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index 3c82d5ee8..83a48ca36 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -232,13 +232,13 @@ int main(int argc, char *argv[]) + + if (w == test_forks[i]) { + if (WIFEXITED(wstatus)) { +- if (WEXITSTATUS(wstatus) == 0) { +- finished_count++; +- test_forks[i] = 0; +- } else { ++ if (WEXITSTATUS(wstatus) != 0) { + syslog(LOG_ERR, "Error reading from device %s", devices[i]); + final_score += scores[i]; + } ++ ++ finished_count++; ++ test_forks[i] = 0; + } + } + } diff --git a/SOURCES/bz2109159-storage_mon-4-fix-possible-false-negatives.patch b/SOURCES/bz2109159-storage_mon-4-fix-possible-false-negatives.patch new file mode 100644 index 0000000..8448bc6 --- /dev/null +++ b/SOURCES/bz2109159-storage_mon-4-fix-possible-false-negatives.patch @@ -0,0 +1,417 @@ +From 0bb52cf9985bda47e13940761b3d8e2eaddf377c Mon Sep 17 00:00:00 2001 +From: Kazunori INOUE +Date: Wed, 10 Aug 2022 17:35:54 +0900 +Subject: [PATCH 1/4] storage_mon: Use the O_DIRECT flag in open() to eliminate + cache effects + +--- + tools/Makefile.am | 1 + + tools/storage_mon.c | 82 +++++++++++++++++++++++++++++++++------------ + 2 files changed, 61 insertions(+), 22 deletions(-) + +diff --git a/tools/Makefile.am b/tools/Makefile.am +index 1309223b4..08323fee3 100644 +--- a/tools/Makefile.am ++++ b/tools/Makefile.am +@@ -74,6 +74,7 @@ sfex_stat_LDADD = $(GLIBLIB) -lplumb -lplumbgpl + findif_SOURCES = findif.c + + storage_mon_SOURCES = storage_mon.c ++storage_mon_CFLAGS = -D_GNU_SOURCE + + if BUILD_TICKLE + halib_PROGRAMS += tickle_tcp +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index 930ead41c..ba87492fc 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -31,23 +31,27 @@ static void usage(char *name, FILE *f) + fprintf(f, " --help print this message\n"); + } + +-/* Check one device */ +-static void *test_device(const char *device, int verbose, int inject_error_percent) ++static int open_device(const char *device, int verbose) + { +- uint64_t devsize; + int device_fd; + int res; ++ uint64_t devsize; + off_t seek_spot; +- char buffer[512]; + +- if (verbose) { +- printf("Testing device %s\n", device); ++#if defined(__linux__) || defined(__FreeBSD__) ++ device_fd = open(device, O_RDONLY|O_DIRECT); ++ if (device_fd >= 0) { ++ return device_fd; ++ } else if (errno != EINVAL) { ++ fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno)); ++ return -1; + } ++#endif + + device_fd = open(device, O_RDONLY); + if (device_fd < 0) { + fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno)); +- exit(-1); ++ return -1; + } + #ifdef __FreeBSD__ + res = ioctl(device_fd, DIOCGMEDIASIZE, &devsize); +@@ -57,11 +61,12 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + if (res != 0) { + fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno)); + close(device_fd); +- exit(-1); ++ return -1; + } + if (verbose) { + fprintf(stderr, "%s: size=%zu\n", device, devsize); + } ++ + /* Don't fret about real randomness */ + srand(time(NULL) + getpid()); + /* Pick a random place on the device - sector aligned */ +@@ -70,35 +75,64 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + if (res < 0) { + fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno)); + close(device_fd); +- exit(-1); ++ return -1; + } +- + if (verbose) { + printf("%s: reading from pos %ld\n", device, seek_spot); + } ++ return device_fd; ++} ++ ++/* Check one device */ ++static void *test_device(const char *device, int verbose, int inject_error_percent) ++{ ++ int device_fd; ++ int sec_size = 0; ++ int res; ++ void *buffer; ++ ++ if (verbose) { ++ printf("Testing device %s\n", device); ++ } ++ ++ device_fd = open_device(device, verbose); ++ if (device_fd < 0) { ++ exit(-1); ++ } ++ ++ ioctl(device_fd, BLKSSZGET, &sec_size); ++ if (sec_size == 0) { ++ fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno)); ++ goto error; ++ } + +- res = read(device_fd, buffer, sizeof(buffer)); ++ if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) { ++ fprintf(stderr, "Failed to allocate aligned memory: %s\n", strerror(errno)); ++ goto error; ++ } ++ ++ res = read(device_fd, buffer, sec_size); ++ free(buffer); + if (res < 0) { + fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno)); +- close(device_fd); +- exit(-1); ++ goto error; + } +- if (res < (int)sizeof(buffer)) { +- fprintf(stderr, "Failed to read %ld bytes from %s, got %d\n", sizeof(buffer), device, res); +- close(device_fd); +- exit(-1); ++ if (res < sec_size) { ++ fprintf(stderr, "Failed to read %d bytes from %s, got %d\n", sec_size, device, res); ++ goto error; + } + + /* Fake an error */ +- if (inject_error_percent && ((rand() % 100) < inject_error_percent)) { +- fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n"); +- close(device_fd); +- exit(-1); ++ if (inject_error_percent) { ++ srand(time(NULL) + getpid()); ++ if ((rand() % 100) < inject_error_percent) { ++ fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n"); ++ goto error; ++ } + } + res = close(device_fd); + if (res != 0) { + fprintf(stderr, "Failed to close %s: %s\n", device, strerror(errno)); +- close(device_fd); + exit(-1); + } + +@@ -106,6 +140,10 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + printf("%s: done\n", device); + } + exit(0); ++ ++error: ++ close(device_fd); ++ exit(-1); + } + + int main(int argc, char *argv[]) + +From ce4e632f29ed6b86b82a959eac5844655baed153 Mon Sep 17 00:00:00 2001 +From: Kazunori INOUE +Date: Mon, 15 Aug 2022 19:17:21 +0900 +Subject: [PATCH 2/4] storage_mon: fix build-related issues + +--- + tools/storage_mon.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index ba87492fc..e34d1975a 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -38,7 +38,6 @@ static int open_device(const char *device, int verbose) + uint64_t devsize; + off_t seek_spot; + +-#if defined(__linux__) || defined(__FreeBSD__) + device_fd = open(device, O_RDONLY|O_DIRECT); + if (device_fd >= 0) { + return device_fd; +@@ -46,7 +45,6 @@ static int open_device(const char *device, int verbose) + fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno)); + return -1; + } +-#endif + + device_fd = open(device, O_RDONLY); + if (device_fd < 0) { +@@ -100,7 +98,11 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + exit(-1); + } + ++#ifdef __FreeBSD__ ++ ioctl(device_fd, DIOCGSECTORSIZE, &sec_size); ++#else + ioctl(device_fd, BLKSSZGET, &sec_size); ++#endif + if (sec_size == 0) { + fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno)); + goto error; + +From 7a0aaa0dfdebeab3fae9fe9ddc412c3d1f610273 Mon Sep 17 00:00:00 2001 +From: Kazunori INOUE +Date: Wed, 24 Aug 2022 17:36:23 +0900 +Subject: [PATCH 3/4] storage_mon: do random lseek even with O_DIRECT, etc + +--- + tools/storage_mon.c | 118 ++++++++++++++++++++++---------------------- + 1 file changed, 58 insertions(+), 60 deletions(-) + +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index e34d1975a..0bdb48649 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -31,38 +31,43 @@ static void usage(char *name, FILE *f) + fprintf(f, " --help print this message\n"); + } + +-static int open_device(const char *device, int verbose) ++/* Check one device */ ++static void *test_device(const char *device, int verbose, int inject_error_percent) + { ++ uint64_t devsize; ++ int flags = O_RDONLY | O_DIRECT; + int device_fd; + int res; +- uint64_t devsize; + off_t seek_spot; + +- device_fd = open(device, O_RDONLY|O_DIRECT); +- if (device_fd >= 0) { +- return device_fd; +- } else if (errno != EINVAL) { +- fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno)); +- return -1; ++ if (verbose) { ++ printf("Testing device %s\n", device); + } + +- device_fd = open(device, O_RDONLY); ++ device_fd = open(device, flags); + if (device_fd < 0) { +- fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno)); +- return -1; ++ if (errno != EINVAL) { ++ fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno)); ++ exit(-1); ++ } ++ flags &= ~O_DIRECT; ++ device_fd = open(device, flags); ++ if (device_fd < 0) { ++ fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno)); ++ exit(-1); ++ } + } + #ifdef __FreeBSD__ + res = ioctl(device_fd, DIOCGMEDIASIZE, &devsize); + #else + res = ioctl(device_fd, BLKGETSIZE64, &devsize); + #endif +- if (res != 0) { ++ if (res < 0) { + fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno)); +- close(device_fd); +- return -1; ++ goto error; + } + if (verbose) { +- fprintf(stderr, "%s: size=%zu\n", device, devsize); ++ printf("%s: opened %s O_DIRECT, size=%zu\n", device, (flags & O_DIRECT)?"with":"without", devsize); + } + + /* Don't fret about real randomness */ +@@ -72,65 +77,58 @@ static int open_device(const char *device, int verbose) + res = lseek(device_fd, seek_spot, SEEK_SET); + if (res < 0) { + fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno)); +- close(device_fd); +- return -1; ++ goto error; + } + if (verbose) { + printf("%s: reading from pos %ld\n", device, seek_spot); + } +- return device_fd; +-} +- +-/* Check one device */ +-static void *test_device(const char *device, int verbose, int inject_error_percent) +-{ +- int device_fd; +- int sec_size = 0; +- int res; +- void *buffer; +- +- if (verbose) { +- printf("Testing device %s\n", device); +- } + +- device_fd = open_device(device, verbose); +- if (device_fd < 0) { +- exit(-1); +- } ++ if (flags & O_DIRECT) { ++ int sec_size = 0; ++ void *buffer; + + #ifdef __FreeBSD__ +- ioctl(device_fd, DIOCGSECTORSIZE, &sec_size); ++ res = ioctl(device_fd, DIOCGSECTORSIZE, &sec_size); + #else +- ioctl(device_fd, BLKSSZGET, &sec_size); ++ res = ioctl(device_fd, BLKSSZGET, &sec_size); + #endif +- if (sec_size == 0) { +- fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno)); +- goto error; +- } ++ if (res < 0) { ++ fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno)); ++ goto error; ++ } + +- if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) { +- fprintf(stderr, "Failed to allocate aligned memory: %s\n", strerror(errno)); +- goto error; +- } ++ if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) { ++ fprintf(stderr, "Failed to allocate aligned memory: %s\n", strerror(errno)); ++ goto error; ++ } ++ res = read(device_fd, buffer, sec_size); ++ free(buffer); ++ if (res < 0) { ++ fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno)); ++ goto error; ++ } ++ if (res < sec_size) { ++ fprintf(stderr, "Failed to read %d bytes from %s, got %d\n", sec_size, device, res); ++ goto error; ++ } ++ } else { ++ char buffer[512]; + +- res = read(device_fd, buffer, sec_size); +- free(buffer); +- if (res < 0) { +- fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno)); +- goto error; +- } +- if (res < sec_size) { +- fprintf(stderr, "Failed to read %d bytes from %s, got %d\n", sec_size, device, res); +- goto error; ++ res = read(device_fd, buffer, sizeof(buffer)); ++ if (res < 0) { ++ fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno)); ++ goto error; ++ } ++ if (res < (int)sizeof(buffer)) { ++ fprintf(stderr, "Failed to read %ld bytes from %s, got %d\n", sizeof(buffer), device, res); ++ goto error; ++ } + } + + /* Fake an error */ +- if (inject_error_percent) { +- srand(time(NULL) + getpid()); +- if ((rand() % 100) < inject_error_percent) { +- fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n"); +- goto error; +- } ++ if (inject_error_percent && ((rand() % 100) < inject_error_percent)) { ++ fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n"); ++ goto error; + } + res = close(device_fd); + if (res != 0) { + +From db97e055a17526cec056c595844a9d8851e3ee19 Mon Sep 17 00:00:00 2001 +From: Kazunori INOUE +Date: Thu, 25 Aug 2022 16:03:46 +0900 +Subject: [PATCH 4/4] storage_mon: improve error messages when ioctl() fails + +--- + tools/storage_mon.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index 0bdb48649..f829c5081 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -63,7 +63,7 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + res = ioctl(device_fd, BLKGETSIZE64, &devsize); + #endif + if (res < 0) { +- fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno)); ++ fprintf(stderr, "Failed to get device size for %s: %s\n", device, strerror(errno)); + goto error; + } + if (verbose) { +@@ -93,7 +93,7 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + res = ioctl(device_fd, BLKSSZGET, &sec_size); + #endif + if (res < 0) { +- fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno)); ++ fprintf(stderr, "Failed to get block device sector size for %s: %s\n", device, strerror(errno)); + goto error; + } + diff --git a/SOURCES/bz2127117-nfsserver-nfsv4_only-parameter.patch b/SOURCES/bz2127117-nfsserver-nfsv4_only-parameter.patch new file mode 100644 index 0000000..9bcbb41 --- /dev/null +++ b/SOURCES/bz2127117-nfsserver-nfsv4_only-parameter.patch @@ -0,0 +1,298 @@ +From 764757380af19d3a21d40f3c9624e4135ff074e1 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 2 Nov 2022 10:26:31 +0100 +Subject: [PATCH] nfsserver: add nfsv4_only parameter to make it run without + rpc-statd/rpcbind services + +--- + heartbeat/nfsserver | 200 +++++++++++++++++++++++++------------------- + 1 file changed, 114 insertions(+), 86 deletions(-) + +diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver +index 9bbd603e5..cb2d43ab1 100755 +--- a/heartbeat/nfsserver ++++ b/heartbeat/nfsserver +@@ -79,6 +79,16 @@ Init script for nfsserver + + + ++ ++ ++Run in NFSv4 only mode (rpc-statd and rpcbind services masked). ++ ++ ++NFSv4 only mode. ++ ++ ++ ++ + + + Do not send reboot notifications to NFSv3 clients during server startup. +@@ -332,7 +342,7 @@ v3locking_exec() + if [ $EXEC_MODE -eq 2 ]; then + nfs_exec $cmd nfs-lock.service + elif [ $EXEC_MODE -eq 3 ]; then +- nfs_exec $cmd rpc-statd.service ++ nfs_exec $cmd rpc-statd.service + else + case $cmd in + start) locking_start;; +@@ -348,20 +358,22 @@ nfsserver_systemd_monitor() + local rc + local fn + +- ocf_log debug "Status: rpcbind" +- rpcinfo > /dev/null 2>&1 +- rc=$? +- if [ "$rc" -ne "0" ]; then +- ocf_exit_reason "rpcbind is not running" +- return $OCF_NOT_RUNNING +- fi ++ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then ++ ocf_log debug "Status: rpcbind" ++ rpcinfo > /dev/null 2>&1 ++ rc=$? ++ if [ "$rc" -ne "0" ]; then ++ ocf_exit_reason "rpcbind is not running" ++ return $OCF_NOT_RUNNING ++ fi + +- ocf_log debug "Status: nfs-mountd" +- ps axww | grep -q "[r]pc.mountd" +- rc=$? +- if [ "$rc" -ne "0" ]; then +- ocf_exit_reason "nfs-mountd is not running" +- return $OCF_NOT_RUNNING ++ ocf_log debug "Status: nfs-mountd" ++ ps axww | grep -q "[r]pc.mountd" ++ rc=$? ++ if [ "$rc" -ne "0" ]; then ++ ocf_exit_reason "nfs-mountd is not running" ++ return $OCF_NOT_RUNNING ++ fi + fi + + ocf_log debug "Status: nfs-idmapd" +@@ -375,12 +387,14 @@ nfsserver_systemd_monitor() + return $OCF_NOT_RUNNING + fi + +- ocf_log debug "Status: rpc-statd" +- rpcinfo -t localhost 100024 > /dev/null 2>&1 +- rc=$? +- if [ "$rc" -ne "0" ]; then +- ocf_exit_reason "rpc-statd is not running" +- return $OCF_NOT_RUNNING ++ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then ++ ocf_log debug "Status: rpc-statd" ++ rpcinfo -t localhost 100024 > /dev/null 2>&1 ++ rc=$? ++ if [ "$rc" -ne "0" ]; then ++ ocf_exit_reason "rpc-statd is not running" ++ return $OCF_NOT_RUNNING ++ fi + fi + + nfs_exec is-active nfs-server +@@ -424,7 +438,7 @@ nfsserver_monitor () + if [ $rc -eq 0 ]; then + # don't report success if nfs servers are up + # without locking daemons. +- v3locking_exec "status" ++ ocf_is_true "$OCF_RESKEY_nfsv4_only" || v3locking_exec "status" + rc=$? + if [ $rc -ne 0 ]; then + ocf_exit_reason "NFS server is up, but the locking daemons are down" +@@ -786,48 +800,54 @@ nfsserver_start () + + # systemd + case $EXEC_MODE in +- [23]) nfs_exec start rpcbind +- local i=1 +- while : ; do +- ocf_log info "Start: rpcbind i: $i" +- rpcinfo > /dev/null 2>&1 +- rc=$? +- if [ "$rc" -eq "0" ]; then +- break; +- fi +- sleep 1 +- i=$((i + 1)) +- done ++ [23]) if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then ++ nfs_exec start rpcbind ++ local i=1 ++ while : ; do ++ ocf_log info "Start: rpcbind i: $i" ++ rpcinfo > /dev/null 2>&1 ++ rc=$? ++ if [ "$rc" -eq "0" ]; then ++ break ++ fi ++ sleep 1 ++ i=$((i + 1)) ++ done ++ fi + ;; + esac + +- # check to see if we need to start rpc.statd +- v3locking_exec "status" +- if [ $? -ne $OCF_SUCCESS ]; then +- v3locking_exec "start" +- rc=$? +- if [ $rc -ne 0 ]; then +- ocf_exit_reason "Failed to start NFS server locking daemons" +- return $rc ++ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then ++ # check to see if we need to start rpc.statd ++ v3locking_exec "status" ++ if [ $? -ne $OCF_SUCCESS ]; then ++ v3locking_exec "start" ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ ocf_exit_reason "Failed to start NFS server locking daemons" ++ return $rc ++ fi ++ else ++ ocf_log info "rpc.statd already up" + fi +- else +- ocf_log info "rpc.statd already up" + fi + + # systemd + case $EXEC_MODE in +- [23]) nfs_exec start nfs-mountd +- local i=1 +- while : ; do +- ocf_log info "Start: nfs-mountd i: $i" +- ps axww | grep -q "[r]pc.mountd" +- rc=$? +- if [ "$rc" -eq "0" ]; then +- break; +- fi +- sleep 1 +- i=$((i + 1)) +- done ++ [23]) if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then ++ nfs_exec start nfs-mountd ++ local i=1 ++ while : ; do ++ ocf_log info "Start: nfs-mountd i: $i" ++ ps axww | grep -q "[r]pc.mountd" ++ rc=$? ++ if [ "$rc" -eq "0" ]; then ++ break ++ fi ++ sleep 1 ++ i=$((i + 1)) ++ done ++ fi + + nfs_exec start nfs-idmapd + local i=1 +@@ -839,24 +859,26 @@ nfsserver_start () + ocf_log debug "$(cat $fn)" + rm -f $fn + if [ "$rc" -eq "0" ]; then +- break; ++ break + fi + sleep 1 + i=$((i + 1)) + done + +- nfs_exec start rpc-statd +- local i=1 +- while : ; do +- ocf_log info "Start: rpc-statd i: $i" +- rpcinfo -t localhost 100024 > /dev/null 2>&1 +- rc=$? +- if [ "$rc" -eq "0" ]; then +- break; +- fi +- sleep 1 +- i=$((i + 1)) +- done ++ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then ++ nfs_exec start rpc-statd ++ local i=1 ++ while : ; do ++ ocf_log info "Start: rpc-statd i: $i" ++ rpcinfo -t localhost 100024 > /dev/null 2>&1 ++ rc=$? ++ if [ "$rc" -eq "0" ]; then ++ break ++ fi ++ sleep 1 ++ i=$((i + 1)) ++ done ++ fi + esac + + +@@ -914,13 +936,15 @@ nfsserver_stop () + sleep 1 + done + +- nfs_exec stop rpc-statd > /dev/null 2>&1 +- ocf_log info "Stop: rpc-statd" +- rpcinfo -t localhost 100024 > /dev/null 2>&1 +- rc=$? +- if [ "$rc" -eq "0" ]; then +- ocf_exit_reason "Failed to stop rpc-statd" +- return $OCF_ERR_GENERIC ++ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then ++ nfs_exec stop rpc-statd > /dev/null 2>&1 ++ ocf_log info "Stop: rpc-statd" ++ rpcinfo -t localhost 100024 > /dev/null 2>&1 ++ rc=$? ++ if [ "$rc" -eq "0" ]; then ++ ocf_exit_reason "Failed to stop rpc-statd" ++ return $OCF_ERR_GENERIC ++ fi + fi + + nfs_exec stop nfs-idmapd > /dev/null 2>&1 +@@ -935,13 +959,15 @@ nfsserver_stop () + return $OCF_ERR_GENERIC + fi + +- nfs_exec stop nfs-mountd > /dev/null 2>&1 +- ocf_log info "Stop: nfs-mountd" +- ps axww | grep -q "[r]pc.mountd" +- rc=$? +- if [ "$rc" -eq "0" ]; then +- ocf_exit_reason "Failed to stop nfs-mountd" +- return $OCF_ERR_GENERIC ++ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then ++ nfs_exec stop nfs-mountd > /dev/null 2>&1 ++ ocf_log info "Stop: nfs-mountd" ++ ps axww | grep -q "[r]pc.mountd" ++ rc=$? ++ if [ "$rc" -eq "0" ]; then ++ ocf_exit_reason "Failed to stop nfs-mountd" ++ return $OCF_ERR_GENERIC ++ fi + fi + + if systemctl --no-legend list-unit-files "nfsdcld*" | grep -q nfsdcld; then +@@ -960,10 +986,12 @@ nfsserver_stop () + esac + + +- v3locking_exec "stop" +- if [ $? -ne 0 ]; then +- ocf_exit_reason "Failed to stop NFS locking daemons" +- rc=$OCF_ERR_GENERIC ++ if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then ++ v3locking_exec "stop" ++ if [ $? -ne 0 ]; then ++ ocf_exit_reason "Failed to stop NFS locking daemons" ++ rc=$OCF_ERR_GENERIC ++ fi + fi + + # systemd diff --git a/SOURCES/bz2130986-azure-events-az-new-ra.patch b/SOURCES/bz2130986-azure-events-az-new-ra.patch deleted file mode 100644 index 88c7781..0000000 --- a/SOURCES/bz2130986-azure-events-az-new-ra.patch +++ /dev/null @@ -1,903 +0,0 @@ -From 5dcd5153f0318e4766f7f4d3e61dfdb4b352c39c Mon Sep 17 00:00:00 2001 -From: MSSedusch -Date: Mon, 30 May 2022 15:08:10 +0200 -Subject: [PATCH 1/2] add new Azure Events AZ resource agent - ---- - .gitignore | 1 + - configure.ac | 8 + - doc/man/Makefile.am | 4 + - heartbeat/Makefile.am | 4 + - heartbeat/azure-events-az.in | 782 +++++++++++++++++++++++++++++++++++ - 5 files changed, 799 insertions(+) - create mode 100644 heartbeat/azure-events-az.in - -diff --git a/.gitignore b/.gitignore -index 0c259b5cf..e2b7c039c 100644 ---- a/.gitignore -+++ b/.gitignore -@@ -54,6 +54,7 @@ heartbeat/Squid - heartbeat/SysInfo - heartbeat/aws-vpc-route53 - heartbeat/azure-events -+heartbeat/azure-events-az - heartbeat/clvm - heartbeat/conntrackd - heartbeat/dnsupdate -diff --git a/configure.ac b/configure.ac -index eeecfad0e..5716a2be2 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -523,6 +523,13 @@ if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then - fi - AM_CONDITIONAL(BUILD_AZURE_EVENTS, test $BUILD_AZURE_EVENTS -eq 1) - -+BUILD_AZURE_EVENTS_AZ=1 -+if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then -+ BUILD_AZURE_EVENTS_AZ=0 -+ AC_MSG_WARN("Not building azure-events-az") -+fi -+AM_CONDITIONAL(BUILD_AZURE_EVENTS_AZ, test $BUILD_AZURE_EVENTS_AZ -eq 1) -+ - BUILD_GCP_PD_MOVE=1 - if test -z "$PYTHON" || test "x${HAVE_PYMOD_GOOGLEAPICLIENT}" != xyes || test $BUILD_OCF_PY -eq 0; then - BUILD_GCP_PD_MOVE=0 -@@ -976,6 +983,7 @@ rgmanager/Makefile \ - - dnl Files we output that need to be executable - AC_CONFIG_FILES([heartbeat/azure-events], [chmod +x heartbeat/azure-events]) -+AC_CONFIG_FILES([heartbeat/azure-events-az], [chmod +x heartbeat/azure-events-az]) - AC_CONFIG_FILES([heartbeat/AoEtarget], [chmod +x heartbeat/AoEtarget]) - AC_CONFIG_FILES([heartbeat/ManageRAID], [chmod +x heartbeat/ManageRAID]) - AC_CONFIG_FILES([heartbeat/ManageVE], [chmod +x heartbeat/ManageVE]) -diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am -index cd8fd16bf..658c700ac 100644 ---- a/doc/man/Makefile.am -+++ b/doc/man/Makefile.am -@@ -219,6 +219,10 @@ if BUILD_AZURE_EVENTS - man_MANS += ocf_heartbeat_azure-events.7 - endif - -+if BUILD_AZURE_EVENTS_AZ -+man_MANS += ocf_heartbeat_azure-events-az.7 -+endif -+ - if BUILD_GCP_PD_MOVE - man_MANS += ocf_heartbeat_gcp-pd-move.7 - endif -diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am -index 20d41e36a..1133dc13e 100644 ---- a/heartbeat/Makefile.am -+++ b/heartbeat/Makefile.am -@@ -188,6 +188,10 @@ if BUILD_AZURE_EVENTS - ocf_SCRIPTS += azure-events - endif - -+if BUILD_AZURE_EVENTS_AZ -+ocf_SCRIPTS += azure-events-az -+endif -+ - if BUILD_GCP_PD_MOVE - ocf_SCRIPTS += gcp-pd-move - endif -diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in -new file mode 100644 -index 000000000..616fc8d9e ---- /dev/null -+++ b/heartbeat/azure-events-az.in -@@ -0,0 +1,782 @@ -+#!@PYTHON@ -tt -+# -+# Resource agent for monitoring Azure Scheduled Events -+# -+# License: GNU General Public License (GPL) -+# (c) 2018 Tobias Niekamp, Microsoft Corp. -+# and Linux-HA contributors -+ -+import os -+import sys -+import time -+import subprocess -+import json -+try: -+ import urllib2 -+ from urllib2 import URLError -+except ImportError: -+ import urllib.request as urllib2 -+ from urllib.error import URLError -+import socket -+from collections import defaultdict -+ -+OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT")) -+sys.path.append(OCF_FUNCTIONS_DIR) -+import ocf -+ -+############################################################################## -+ -+ -+VERSION = "0.10" -+USER_AGENT = "Pacemaker-ResourceAgent/%s %s" % (VERSION, ocf.distro()) -+ -+attr_globalPullState = "azure-events-az_globalPullState" -+attr_lastDocVersion = "azure-events-az_lastDocVersion" -+attr_curNodeState = "azure-events-az_curNodeState" -+attr_pendingEventIDs = "azure-events-az_pendingEventIDs" -+attr_healthstate = "#health-azure" -+ -+default_loglevel = ocf.logging.INFO -+default_relevantEventTypes = set(["Reboot", "Redeploy"]) -+ -+global_pullMaxAttempts = 3 -+global_pullDelaySecs = 1 -+ -+############################################################################## -+ -+class attrDict(defaultdict): -+ """ -+ A wrapper for accessing dict keys like an attribute -+ """ -+ def __init__(self, data): -+ super(attrDict, self).__init__(attrDict) -+ for d in data.keys(): -+ self.__setattr__(d, data[d]) -+ -+ def __getattr__(self, key): -+ try: -+ return self[key] -+ except KeyError: -+ raise AttributeError(key) -+ -+ def __setattr__(self, key, value): -+ self[key] = value -+ -+############################################################################## -+ -+class azHelper: -+ """ -+ Helper class for Azure's metadata API (including Scheduled Events) -+ """ -+ metadata_host = "http://169.254.169.254/metadata" -+ instance_api = "instance" -+ events_api = "scheduledevents" -+ api_version = "2019-08-01" -+ -+ @staticmethod -+ def _sendMetadataRequest(endpoint, postData=None): -+ """ -+ Send a request to Azure's Azure Metadata Service API -+ """ -+ url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, azHelper.api_version) -+ data = "" -+ ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s" % (endpoint, postData)) -+ ocf.logger.debug("_sendMetadataRequest: url = %s" % url) -+ -+ if postData and type(postData) != bytes: -+ postData = postData.encode() -+ -+ req = urllib2.Request(url, postData) -+ req.add_header("Metadata", "true") -+ req.add_header("User-Agent", USER_AGENT) -+ try: -+ resp = urllib2.urlopen(req) -+ except URLError as e: -+ if hasattr(e, 'reason'): -+ ocf.logger.warning("Failed to reach the server: %s" % e.reason) -+ clusterHelper.setAttr(attr_globalPullState, "IDLE") -+ elif hasattr(e, 'code'): -+ ocf.logger.warning("The server couldn\'t fulfill the request. Error code: %s" % e.code) -+ clusterHelper.setAttr(attr_globalPullState, "IDLE") -+ else: -+ data = resp.read() -+ ocf.logger.debug("_sendMetadataRequest: response = %s" % data) -+ -+ if data: -+ data = json.loads(data) -+ -+ ocf.logger.debug("_sendMetadataRequest: finished") -+ return data -+ -+ @staticmethod -+ def getInstanceInfo(): -+ """ -+ Fetch details about the current VM from Azure's Azure Metadata Service API -+ """ -+ ocf.logger.debug("getInstanceInfo: begin") -+ -+ jsondata = azHelper._sendMetadataRequest(azHelper.instance_api) -+ ocf.logger.debug("getInstanceInfo: json = %s" % jsondata) -+ -+ if jsondata: -+ ocf.logger.debug("getInstanceInfo: finished, returning {}".format(jsondata["compute"])) -+ return attrDict(jsondata["compute"]) -+ else: -+ ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info") -+ sys.exit(ocf.OCF_ERR_GENERIC) -+ -+ @staticmethod -+ def pullScheduledEvents(): -+ """ -+ Retrieve all currently scheduled events via Azure Metadata Service API -+ """ -+ ocf.logger.debug("pullScheduledEvents: begin") -+ -+ jsondata = azHelper._sendMetadataRequest(azHelper.events_api) -+ ocf.logger.debug("pullScheduledEvents: json = %s" % jsondata) -+ -+ ocf.logger.debug("pullScheduledEvents: finished") -+ return attrDict(jsondata) -+ -+ @staticmethod -+ def forceEvents(eventIDs): -+ """ -+ Force a set of events to start immediately -+ """ -+ ocf.logger.debug("forceEvents: begin") -+ -+ events = [] -+ for e in eventIDs: -+ events.append({ -+ "EventId": e, -+ }) -+ postData = { -+ "StartRequests" : events -+ } -+ ocf.logger.info("forceEvents: postData = %s" % postData) -+ resp = azHelper._sendMetadataRequest(azHelper.events_api, postData=json.dumps(postData)) -+ -+ ocf.logger.debug("forceEvents: finished") -+ return -+ -+############################################################################## -+ -+class clusterHelper: -+ """ -+ Helper functions for Pacemaker control via crm -+ """ -+ @staticmethod -+ def _getLocation(node): -+ """ -+ Helper function to retrieve local/global attributes -+ """ -+ if node: -+ return ["--node", node] -+ else: -+ return ["--type", "crm_config"] -+ -+ @staticmethod -+ def _exec(command, *args): -+ """ -+ Helper function to execute a UNIX command -+ """ -+ args = list(args) -+ ocf.logger.debug("_exec: begin; command = %s, args = %s" % (command, str(args))) -+ -+ def flatten(*n): -+ return (str(e) for a in n -+ for e in (flatten(*a) if isinstance(a, (tuple, list)) else (str(a),))) -+ command = list(flatten([command] + args)) -+ ocf.logger.debug("_exec: cmd = %s" % " ".join(command)) -+ try: -+ ret = subprocess.check_output(command) -+ if type(ret) != str: -+ ret = ret.decode() -+ ocf.logger.debug("_exec: return = %s" % ret) -+ return ret.rstrip() -+ except Exception as err: -+ ocf.logger.exception(err) -+ return None -+ -+ @staticmethod -+ def setAttr(key, value, node=None): -+ """ -+ Set the value of a specific global/local attribute in the Pacemaker cluster -+ """ -+ ocf.logger.debug("setAttr: begin; key = %s, value = %s, node = %s" % (key, value, node)) -+ -+ if value: -+ ret = clusterHelper._exec("crm_attribute", -+ "--name", key, -+ "--update", value, -+ clusterHelper._getLocation(node)) -+ else: -+ ret = clusterHelper._exec("crm_attribute", -+ "--name", key, -+ "--delete", -+ clusterHelper._getLocation(node)) -+ -+ ocf.logger.debug("setAttr: finished") -+ return len(ret) == 0 -+ -+ @staticmethod -+ def getAttr(key, node=None): -+ """ -+ Retrieve a global/local attribute from the Pacemaker cluster -+ """ -+ ocf.logger.debug("getAttr: begin; key = %s, node = %s" % (key, node)) -+ -+ val = clusterHelper._exec("crm_attribute", -+ "--name", key, -+ "--query", "--quiet", -+ "--default", "", -+ clusterHelper._getLocation(node)) -+ ocf.logger.debug("getAttr: finished") -+ if not val: -+ return None -+ return val if not val.isdigit() else int(val) -+ -+ @staticmethod -+ def getAllNodes(): -+ """ -+ Get a list of hostnames for all nodes in the Pacemaker cluster -+ """ -+ ocf.logger.debug("getAllNodes: begin") -+ -+ nodes = [] -+ nodeList = clusterHelper._exec("crm_node", "--list") -+ for n in nodeList.split("\n"): -+ nodes.append(n.split()[1]) -+ ocf.logger.debug("getAllNodes: finished; return %s" % str(nodes)) -+ -+ return nodes -+ -+ @staticmethod -+ def getHostNameFromAzName(azName): -+ """ -+ Helper function to get the actual host name from an Azure node name -+ """ -+ return clusterHelper.getAttr("hostName_%s" % azName) -+ -+ @staticmethod -+ def removeHoldFromNodes(): -+ """ -+ Remove the ON_HOLD state from all nodes in the Pacemaker cluster -+ """ -+ ocf.logger.debug("removeHoldFromNodes: begin") -+ -+ for n in clusterHelper.getAllNodes(): -+ if clusterHelper.getAttr(attr_curNodeState, node=n) == "ON_HOLD": -+ clusterHelper.setAttr(attr_curNodeState, "AVAILABLE", node=n) -+ ocf.logger.info("removeHoldFromNodes: removed ON_HOLD from node %s" % n) -+ -+ ocf.logger.debug("removeHoldFromNodes: finished") -+ return False -+ -+ @staticmethod -+ def otherNodesAvailable(exceptNode): -+ """ -+ Check if there are any nodes (except a given node) in the Pacemaker cluster that have state AVAILABLE -+ """ -+ ocf.logger.debug("otherNodesAvailable: begin; exceptNode = %s" % exceptNode) -+ -+ for n in clusterHelper.getAllNodes(): -+ state = clusterHelper.getAttr(attr_curNodeState, node=n) -+ state = stringToNodeState(state) if state else AVAILABLE -+ if state == AVAILABLE and n != exceptNode.hostName: -+ ocf.logger.info("otherNodesAvailable: at least %s is available" % n) -+ ocf.logger.debug("otherNodesAvailable: finished") -+ return True -+ ocf.logger.info("otherNodesAvailable: no other nodes are available") -+ ocf.logger.debug("otherNodesAvailable: finished") -+ -+ return False -+ -+ @staticmethod -+ def transitionSummary(): -+ """ -+ Get the current Pacemaker transition summary (used to check if all resources are stopped when putting a node standby) -+ """ -+ # Is a global crm_simulate "too much"? Or would it be sufficient it there are no planned transitions for a particular node? -+ # # crm_simulate -Ls -+ # Transition Summary: -+ # * Promote rsc_SAPHana_HN1_HDB03:0 (Slave -> Master hsr3-db1) -+ # * Stop rsc_SAPHana_HN1_HDB03:1 (hsr3-db0) -+ # * Move rsc_ip_HN1_HDB03 (Started hsr3-db0 -> hsr3-db1) -+ # * Start rsc_nc_HN1_HDB03 (hsr3-db1) -+ # # Excepted result when there are no pending actions: -+ # Transition Summary: -+ ocf.logger.debug("transitionSummary: begin") -+ -+ summary = clusterHelper._exec("crm_simulate", "-Ls") -+ if not summary: -+ ocf.logger.warning("transitionSummary: could not load transition summary") -+ return False -+ if summary.find("Transition Summary:") < 0: -+ ocf.logger.warning("transitionSummary: received unexpected transition summary: %s" % summary) -+ return False -+ summary = summary.split("Transition Summary:")[1] -+ ret = summary.split("\n").pop(0) -+ -+ ocf.logger.debug("transitionSummary: finished; return = %s" % str(ret)) -+ return ret -+ -+ @staticmethod -+ def listOperationsOnNode(node): -+ """ -+ Get a list of all current operations for a given node (used to check if any resources are pending) -+ """ -+ # hsr3-db1:/home/tniek # crm_resource --list-operations -N hsr3-db0 -+ # rsc_azure-events-az (ocf::heartbeat:azure-events-az): Started: rsc_azure-events-az_start_0 (node=hsr3-db0, call=91, rc=0, last-rc-change=Fri Jun 8 22:37:46 2018, exec=115ms): complete -+ # rsc_azure-events-az (ocf::heartbeat:azure-events-az): Started: rsc_azure-events-az_monitor_10000 (node=hsr3-db0, call=93, rc=0, last-rc-change=Fri Jun 8 22:37:47 2018, exec=197ms): complete -+ # rsc_SAPHana_HN1_HDB03 (ocf::suse:SAPHana): Master: rsc_SAPHana_HN1_HDB03_start_0 (node=hsr3-db0, call=-1, rc=193, last-rc-change=Fri Jun 8 22:37:46 2018, exec=0ms): pending -+ # rsc_SAPHanaTopology_HN1_HDB03 (ocf::suse:SAPHanaTopology): Started: rsc_SAPHanaTopology_HN1_HDB03_start_0 (node=hsr3-db0, call=90, rc=0, last-rc-change=Fri Jun 8 22:37:46 2018, exec=3214ms): complete -+ ocf.logger.debug("listOperationsOnNode: begin; node = %s" % node) -+ -+ resources = clusterHelper._exec("crm_resource", "--list-operations", "-N", node) -+ if len(resources) == 0: -+ ret = [] -+ else: -+ ret = resources.split("\n") -+ -+ ocf.logger.debug("listOperationsOnNode: finished; return = %s" % str(ret)) -+ return ret -+ -+ @staticmethod -+ def noPendingResourcesOnNode(node): -+ """ -+ Check that there are no pending resources on a given node -+ """ -+ ocf.logger.debug("noPendingResourcesOnNode: begin; node = %s" % node) -+ -+ for r in clusterHelper.listOperationsOnNode(node): -+ ocf.logger.debug("noPendingResourcesOnNode: * %s" % r) -+ resource = r.split()[-1] -+ if resource == "pending": -+ ocf.logger.info("noPendingResourcesOnNode: found resource %s that is still pending" % resource) -+ ocf.logger.debug("noPendingResourcesOnNode: finished; return = False") -+ return False -+ ocf.logger.info("noPendingResourcesOnNode: no pending resources on node %s" % node) -+ ocf.logger.debug("noPendingResourcesOnNode: finished; return = True") -+ -+ return True -+ -+ @staticmethod -+ def allResourcesStoppedOnNode(node): -+ """ -+ Check that all resources on a given node are stopped -+ """ -+ ocf.logger.debug("allResourcesStoppedOnNode: begin; node = %s" % node) -+ -+ if clusterHelper.noPendingResourcesOnNode(node): -+ if len(clusterHelper.transitionSummary()) == 0: -+ ocf.logger.info("allResourcesStoppedOnNode: no pending resources on node %s and empty transition summary" % node) -+ ocf.logger.debug("allResourcesStoppedOnNode: finished; return = True") -+ return True -+ ocf.logger.info("allResourcesStoppedOnNode: transition summary is not empty") -+ ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False") -+ return False -+ -+ ocf.logger.info("allResourcesStoppedOnNode: still pending resources on node %s" % node) -+ ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False") -+ return False -+ -+############################################################################## -+ -+AVAILABLE = 0 # Node is online and ready to handle events -+STOPPING = 1 # Standby has been triggered, but some resources are still running -+IN_EVENT = 2 # All resources are stopped, and event has been initiated via Azure Metadata Service -+ON_HOLD = 3 # Node has a pending event that cannot be started there are no other nodes available -+ -+def stringToNodeState(name): -+ if type(name) == int: return name -+ if name == "STOPPING": return STOPPING -+ if name == "IN_EVENT": return IN_EVENT -+ if name == "ON_HOLD": return ON_HOLD -+ return AVAILABLE -+ -+def nodeStateToString(state): -+ if state == STOPPING: return "STOPPING" -+ if state == IN_EVENT: return "IN_EVENT" -+ if state == ON_HOLD: return "ON_HOLD" -+ return "AVAILABLE" -+ -+############################################################################## -+ -+class Node: -+ """ -+ Core class implementing logic for a cluster node -+ """ -+ def __init__(self, ra): -+ self.raOwner = ra -+ self.azInfo = azHelper.getInstanceInfo() -+ self.azName = self.azInfo.name -+ self.hostName = socket.gethostname() -+ self.setAttr("azName", self.azName) -+ clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName) -+ -+ def getAttr(self, key): -+ """ -+ Get a local attribute -+ """ -+ return clusterHelper.getAttr(key, node=self.hostName) -+ -+ def setAttr(self, key, value): -+ """ -+ Set a local attribute -+ """ -+ return clusterHelper.setAttr(key, value, node=self.hostName) -+ -+ def selfOrOtherNode(self, node): -+ """ -+ Helper function to distinguish self/other node -+ """ -+ return node if node else self.hostName -+ -+ def setState(self, state, node=None): -+ """ -+ Set the state for a given node (or self) -+ """ -+ node = self.selfOrOtherNode(node) -+ ocf.logger.debug("setState: begin; node = %s, state = %s" % (node, nodeStateToString(state))) -+ -+ clusterHelper.setAttr(attr_curNodeState, nodeStateToString(state), node=node) -+ -+ ocf.logger.debug("setState: finished") -+ -+ def getState(self, node=None): -+ """ -+ Get the state for a given node (or self) -+ """ -+ node = self.selfOrOtherNode(node) -+ ocf.logger.debug("getState: begin; node = %s" % node) -+ -+ state = clusterHelper.getAttr(attr_curNodeState, node=node) -+ ocf.logger.debug("getState: state = %s" % state) -+ ocf.logger.debug("getState: finished") -+ if not state: -+ return AVAILABLE -+ return stringToNodeState(state) -+ -+ def setEventIDs(self, eventIDs, node=None): -+ """ -+ Set pending EventIDs for a given node (or self) -+ """ -+ node = self.selfOrOtherNode(node) -+ ocf.logger.debug("setEventIDs: begin; node = %s, eventIDs = %s" % (node, str(eventIDs))) -+ -+ if eventIDs: -+ eventIDStr = ",".join(eventIDs) -+ else: -+ eventIDStr = None -+ clusterHelper.setAttr(attr_pendingEventIDs, eventIDStr, node=node) -+ -+ ocf.logger.debug("setEventIDs: finished") -+ return -+ -+ def getEventIDs(self, node=None): -+ """ -+ Get pending EventIDs for a given node (or self) -+ """ -+ node = self.selfOrOtherNode(node) -+ ocf.logger.debug("getEventIDs: begin; node = %s" % node) -+ -+ eventIDStr = clusterHelper.getAttr(attr_pendingEventIDs, node=node) -+ if eventIDStr: -+ eventIDs = eventIDStr.split(",") -+ else: -+ eventIDs = None -+ -+ ocf.logger.debug("getEventIDs: finished; eventIDs = %s" % str(eventIDs)) -+ return eventIDs -+ -+ def updateNodeStateAndEvents(self, state, eventIDs, node=None): -+ """ -+ Set the state and pending EventIDs for a given node (or self) -+ """ -+ ocf.logger.debug("updateNodeStateAndEvents: begin; node = %s, state = %s, eventIDs = %s" % (node, nodeStateToString(state), str(eventIDs))) -+ -+ self.setState(state, node=node) -+ self.setEventIDs(eventIDs, node=node) -+ -+ ocf.logger.debug("updateNodeStateAndEvents: finished") -+ return state -+ -+ def putNodeStandby(self, node=None): -+ """ -+ Put self to standby -+ """ -+ node = self.selfOrOtherNode(node) -+ ocf.logger.debug("putNodeStandby: begin; node = %s" % node) -+ -+ clusterHelper._exec("crm_attribute", -+ "--node", node, -+ "--name", attr_healthstate, -+ "--update", "-1000000", -+ "--lifetime=forever") -+ -+ ocf.logger.debug("putNodeStandby: finished") -+ -+ def isNodeInStandby(self, node=None): -+ """ -+ check if node is in standby -+ """ -+ node = self.selfOrOtherNode(node) -+ ocf.logger.debug("isNodeInStandby: begin; node = %s" % node) -+ isInStandy = False -+ -+ healthAttributeStr = clusterHelper.getAttr(attr_healthstate, node) -+ if healthAttributeStr is not None: -+ try: -+ healthAttribute = int(healthAttributeStr) -+ isInStandy = healthAttribute < 0 -+ except ValueError: -+ # Handle the exception -+ ocf.logger.warn("Health attribute %s on node %s cannot be converted to an integer value" % (healthAttributeStr, node)) -+ -+ ocf.logger.debug("isNodeInStandby: finished - result %s" % isInStandy) -+ return isInStandy -+ -+ def putNodeOnline(self, node=None): -+ """ -+ Put self back online -+ """ -+ node = self.selfOrOtherNode(node) -+ ocf.logger.debug("putNodeOnline: begin; node = %s" % node) -+ -+ clusterHelper._exec("crm_attribute", -+ "--node", node, -+ "--name", "#health-azure", -+ "--update", "0", -+ "--lifetime=forever") -+ -+ ocf.logger.debug("putNodeOnline: finished") -+ -+ def separateEvents(self, events): -+ """ -+ Split own/other nodes' events -+ """ -+ ocf.logger.debug("separateEvents: begin; events = %s" % str(events)) -+ -+ localEvents = [] -+ remoteEvents = [] -+ for e in events: -+ e = attrDict(e) -+ if e.EventType not in self.raOwner.relevantEventTypes: -+ continue -+ if self.azName in e.Resources: -+ localEvents.append(e) -+ else: -+ remoteEvents.append(e) -+ ocf.logger.debug("separateEvents: finished; localEvents = %s, remoteEvents = %s" % (str(localEvents), str(remoteEvents))) -+ return (localEvents, remoteEvents) -+ -+############################################################################## -+ -+class raAzEvents: -+ """ -+ Main class for resource agent -+ """ -+ def __init__(self, relevantEventTypes): -+ self.node = Node(self) -+ self.relevantEventTypes = relevantEventTypes -+ -+ def monitor(self): -+ ocf.logger.debug("monitor: begin") -+ -+ events = azHelper.pullScheduledEvents() -+ -+ # get current document version -+ curDocVersion = events.DocumentIncarnation -+ lastDocVersion = self.node.getAttr(attr_lastDocVersion) -+ ocf.logger.debug("monitor: lastDocVersion = %s; curDocVersion = %s" % (lastDocVersion, curDocVersion)) -+ -+ # split events local/remote -+ (localEvents, remoteEvents) = self.node.separateEvents(events.Events) -+ -+ # ensure local events are only executing once -+ if curDocVersion == lastDocVersion: -+ ocf.logger.info("monitor: already handled curDocVersion, skip") -+ return ocf.OCF_SUCCESS -+ -+ localAzEventIDs = set() -+ for e in localEvents: -+ localAzEventIDs.add(e.EventId) -+ -+ curState = self.node.getState() -+ clusterEventIDs = self.node.getEventIDs() -+ -+ ocf.logger.debug("monitor: curDocVersion has not been handled yet") -+ -+ if clusterEventIDs: -+ # there are pending events set, so our state must be STOPPING or IN_EVENT -+ i = 0; touchedEventIDs = False -+ while i < len(clusterEventIDs): -+ # clean up pending events that are already finished according to AZ -+ if clusterEventIDs[i] not in localAzEventIDs: -+ ocf.logger.info("monitor: remove finished local clusterEvent %s" % (clusterEventIDs[i])) -+ clusterEventIDs.pop(i) -+ touchedEventIDs = True -+ else: -+ i += 1 -+ if len(clusterEventIDs) > 0: -+ # there are still pending events (either because we're still stopping, or because the event is still in place) -+ # either way, we need to wait -+ if touchedEventIDs: -+ ocf.logger.info("monitor: added new local clusterEvent %s" % str(clusterEventIDs)) -+ self.node.setEventIDs(clusterEventIDs) -+ else: -+ ocf.logger.info("monitor: no local clusterEvents were updated") -+ else: -+ # there are no more pending events left after cleanup -+ if clusterHelper.noPendingResourcesOnNode(self.node.hostName): -+ # and no pending resources on the node -> set it back online -+ ocf.logger.info("monitor: all local events finished -> clean up, put node online and AVAILABLE") -+ curState = self.node.updateNodeStateAndEvents(AVAILABLE, None) -+ self.node.putNodeOnline() -+ clusterHelper.removeHoldFromNodes() -+ # If Azure Scheduled Events are not used for 24 hours (e.g. because the cluster was asleep), it will be disabled for a VM. -+ # When the cluster wakes up and starts using it again, the DocumentIncarnation is reset. -+ # We need to remove it during cleanup, otherwise azure-events-az will not process the event after wakeup -+ self.node.setAttr(attr_lastDocVersion, None) -+ else: -+ ocf.logger.info("monitor: all local events finished, but some resources have not completed startup yet -> wait") -+ else: -+ if curState == AVAILABLE: -+ if len(localAzEventIDs) > 0: -+ if clusterHelper.otherNodesAvailable(self.node): -+ ocf.logger.info("monitor: can handle local events %s -> set state STOPPING" % (str(localAzEventIDs))) -+ curState = self.node.updateNodeStateAndEvents(STOPPING, localAzEventIDs) -+ else: -+ ocf.logger.info("monitor: cannot handle azEvents %s (only node available) -> set state ON_HOLD" % str(localAzEventIDs)) -+ self.node.setState(ON_HOLD) -+ else: -+ ocf.logger.debug("monitor: no local azEvents to handle") -+ -+ if curState == STOPPING: -+ eventIDsForNode = {} -+ if clusterHelper.noPendingResourcesOnNode(self.node.hostName): -+ if not self.node.isNodeInStandby(): -+ ocf.logger.info("monitor: all local resources are started properly -> put node standby and exit") -+ self.node.putNodeStandby() -+ return ocf.OCF_SUCCESS -+ -+ for e in localEvents: -+ ocf.logger.info("monitor: handling remote event %s (%s; nodes = %s)" % (e.EventId, e.EventType, str(e.Resources))) -+ # before we can force an event to start, we need to ensure all nodes involved have stopped their resources -+ if e.EventStatus == "Scheduled": -+ allNodesStopped = True -+ for azName in e.Resources: -+ hostName = clusterHelper.getHostNameFromAzName(azName) -+ state = self.node.getState(node=hostName) -+ if state == STOPPING: -+ # the only way we can continue is when node state is STOPPING, but all resources have been stopped -+ if not clusterHelper.allResourcesStoppedOnNode(hostName): -+ ocf.logger.info("monitor: (at least) node %s has still resources running -> wait" % hostName) -+ allNodesStopped = False -+ break -+ elif state in (AVAILABLE, IN_EVENT, ON_HOLD): -+ ocf.logger.info("monitor: node %s is still %s -> remote event needs to be picked up locally" % (hostName, nodeStateToString(state))) -+ allNodesStopped = False -+ break -+ if allNodesStopped: -+ ocf.logger.info("monitor: nodes %s are stopped -> add remote event %s to force list" % (str(e.Resources), e.EventId)) -+ for n in e.Resources: -+ hostName = clusterHelper.getHostNameFromAzName(n) -+ if hostName in eventIDsForNode: -+ eventIDsForNode[hostName].append(e.EventId) -+ else: -+ eventIDsForNode[hostName] = [e.EventId] -+ elif e.EventStatus == "Started": -+ ocf.logger.info("monitor: remote event already started") -+ -+ # force the start of all events whose nodes are ready (i.e. have no more resources running) -+ if len(eventIDsForNode.keys()) > 0: -+ eventIDsToForce = set([item for sublist in eventIDsForNode.values() for item in sublist]) -+ ocf.logger.info("monitor: set nodes %s to IN_EVENT; force remote events %s" % (str(eventIDsForNode.keys()), str(eventIDsToForce))) -+ for node, eventId in eventIDsForNode.items(): -+ self.node.updateNodeStateAndEvents(IN_EVENT, eventId, node=node) -+ azHelper.forceEvents(eventIDsToForce) -+ self.node.setAttr(attr_lastDocVersion, curDocVersion) -+ else: -+ ocf.logger.info("monitor: some local resources are not clean yet -> wait") -+ -+ ocf.logger.debug("monitor: finished") -+ return ocf.OCF_SUCCESS -+ -+############################################################################## -+ -+def setLoglevel(verbose): -+ # set up writing into syslog -+ loglevel = default_loglevel -+ if verbose: -+ opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1)) -+ urllib2.install_opener(opener) -+ loglevel = ocf.logging.DEBUG -+ ocf.log.setLevel(loglevel) -+ -+description = ( -+ "Microsoft Azure Scheduled Events monitoring agent", -+ """This resource agent implements a monitor for scheduled -+(maintenance) events for a Microsoft Azure VM. -+ -+If any relevant events are found, it moves all Pacemaker resources -+away from the affected node to allow for a graceful shutdown. -+ -+ Usage: -+ [OCF_RESKEY_eventTypes=VAL] [OCF_RESKEY_verbose=VAL] azure-events-az ACTION -+ -+ action (required): Supported values: monitor, help, meta-data -+ eventTypes (optional): List of event types to be considered -+ relevant by the resource agent (comma-separated). -+ Supported values: Freeze,Reboot,Redeploy -+ Default = Reboot,Redeploy -+/ verbose (optional): If set to true, displays debug info. -+ Default = false -+ -+ Deployment: -+ crm configure primitive rsc_azure-events-az ocf:heartbeat:azure-events-az \ -+ op monitor interval=10s -+ crm configure clone cln_azure-events-az rsc_azure-events-az -+ -+For further information on Microsoft Azure Scheduled Events, please -+refer to the following documentation: -+https://docs.microsoft.com/en-us/azure/virtual-machines/linux/scheduled-events -+""") -+ -+def monitor_action(eventTypes): -+ relevantEventTypes = set(eventTypes.split(",") if eventTypes else []) -+ ra = raAzEvents(relevantEventTypes) -+ return ra.monitor() -+ -+def validate_action(eventTypes): -+ if eventTypes: -+ for event in eventTypes.split(","): -+ if event not in ("Freeze", "Reboot", "Redeploy"): -+ ocf.ocf_exit_reason("Event type not one of Freeze, Reboot, Redeploy: " + eventTypes) -+ return ocf.OCF_ERR_CONFIGURED -+ return ocf.OCF_SUCCESS -+ -+def main(): -+ agent = ocf.Agent("azure-events-az", shortdesc=description[0], longdesc=description[1]) -+ agent.add_parameter( -+ "eventTypes", -+ shortdesc="List of resources to be considered", -+ longdesc="A comma-separated list of event types that will be handled by this resource agent. (Possible values: Freeze,Reboot,Redeploy)", -+ content_type="string", -+ default="Reboot,Redeploy") -+ agent.add_parameter( -+ "verbose", -+ shortdesc="Enable verbose agent logging", -+ longdesc="Set to true to enable verbose logging", -+ content_type="boolean", -+ default="false") -+ agent.add_action("start", timeout=10, handler=lambda: ocf.OCF_SUCCESS) -+ agent.add_action("stop", timeout=10, handler=lambda: ocf.OCF_SUCCESS) -+ agent.add_action("validate-all", timeout=20, handler=validate_action) -+ agent.add_action("monitor", timeout=240, interval=10, handler=monitor_action) -+ setLoglevel(ocf.is_true(ocf.get_parameter("verbose", "false"))) -+ agent.run() -+ -+if __name__ == '__main__': -+ main() -\ No newline at end of file - -From a95337d882c7cc69d604b050159ad50b679f18be Mon Sep 17 00:00:00 2001 -From: MSSedusch -Date: Thu, 2 Jun 2022 14:10:33 +0200 -Subject: [PATCH 2/2] Remove developer documentation - ---- - heartbeat/azure-events-az.in | 11 ----------- - 1 file changed, 11 deletions(-) - -diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in -index 616fc8d9e..59d095306 100644 ---- a/heartbeat/azure-events-az.in -+++ b/heartbeat/azure-events-az.in -@@ -723,17 +723,6 @@ description = ( - If any relevant events are found, it moves all Pacemaker resources - away from the affected node to allow for a graceful shutdown. - -- Usage: -- [OCF_RESKEY_eventTypes=VAL] [OCF_RESKEY_verbose=VAL] azure-events-az ACTION -- -- action (required): Supported values: monitor, help, meta-data -- eventTypes (optional): List of event types to be considered -- relevant by the resource agent (comma-separated). -- Supported values: Freeze,Reboot,Redeploy -- Default = Reboot,Redeploy --/ verbose (optional): If set to true, displays debug info. -- Default = false -- - Deployment: - crm configure primitive rsc_azure-events-az ocf:heartbeat:azure-events-az \ - op monitor interval=10s diff --git a/SOURCES/bz2133682-IPsrcaddr-proto-metric-scope-default-route-fixes.patch b/SOURCES/bz2133682-IPsrcaddr-proto-metric-scope-default-route-fixes.patch new file mode 100644 index 0000000..8722395 --- /dev/null +++ b/SOURCES/bz2133682-IPsrcaddr-proto-metric-scope-default-route-fixes.patch @@ -0,0 +1,147 @@ +From 237d55120a7c8d761f839c96651e722b3bb3bc88 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 12 Oct 2022 13:57:30 +0200 +Subject: [PATCH 1/4] IPsrcaddr: fix PROTO regex + +--- + heartbeat/IPsrcaddr | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr +index 7dbf65ff5..24406d296 100755 +--- a/heartbeat/IPsrcaddr ++++ b/heartbeat/IPsrcaddr +@@ -188,7 +188,7 @@ IPADDR="\($OCTET\.\)\{3\}$OCTET" + SRCCLAUSE="src$WS$WS*\($IPADDR\)" + MATCHROUTE="\(.*${WS}\)\($SRCCLAUSE\)\($WS.*\|$\)" + METRICCLAUSE=".*\(metric$WS[^ ]\+\)" +-PROTOCLAUSE=".*\(proto$WS[^ ]\+\)" ++PROTOCLAUSE=".*\(proto$WS[^ ]\+\).*" + FINDIF=findif + + # findif needs that to be set + +From c70ba457851a401cb201cb87d23bdbc5f4fcd2b3 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 12 Oct 2022 14:00:30 +0200 +Subject: [PATCH 2/4] IPsrcaddr: detect metric for main table only, and allow + specifying metric if necessary + +--- + heartbeat/IPsrcaddr | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr +index 24406d296..4745eb8a7 100755 +--- a/heartbeat/IPsrcaddr ++++ b/heartbeat/IPsrcaddr +@@ -59,12 +59,14 @@ OCF_RESKEY_ipaddress_default="" + OCF_RESKEY_cidr_netmask_default="" + OCF_RESKEY_destination_default="0.0.0.0/0" + OCF_RESKEY_proto_default="" ++OCF_RESKEY_metric_default="" + OCF_RESKEY_table_default="" + + : ${OCF_RESKEY_ipaddress=${OCF_RESKEY_ipaddress_default}} + : ${OCF_RESKEY_cidr_netmask=${OCF_RESKEY_cidr_netmask_default}} + : ${OCF_RESKEY_destination=${OCF_RESKEY_destination_default}} + : ${OCF_RESKEY_proto=${OCF_RESKEY_proto_default}} ++: ${OCF_RESKEY_metric=${OCF_RESKEY_metric_default}} + : ${OCF_RESKEY_table=${OCF_RESKEY_table_default}} + ####################################################################### + +@@ -143,6 +145,14 @@ Proto to match when finding network. E.g. "kernel". + + + ++ ++ ++Metric. Only needed if incorrect metric value is used. ++ ++Metric ++ ++ ++ + + + Table to modify. E.g. "local". +@@ -548,8 +558,14 @@ rc=$? + + INTERFACE=`echo $findif_out | awk '{print $1}'` + LISTROUTE=`$IP2UTIL route list dev $INTERFACE scope link $PROTO match $ipaddress` +-METRIC=`echo $LISTROUTE | sed -n "s/$METRICCLAUSE/\1/p"` + [ -z "$PROTO" ] && PROTO=`echo $LISTROUTE | sed -n "s/$PROTOCLAUSE/\1/p"` ++if [ -n "$OCF_RESKEY_metric" ]; then ++ METRIC="metric $OCF_RESKEY_metric" ++elif [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ]; then ++ METRIC=`echo $LISTROUTE | sed -n "s/$METRICCLAUSE/\1/p"` ++else ++ METRIC="" ++fi + if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then + NETWORK=`echo $LISTROUTE | grep -m 1 -o '^[^ ]*'` + + +From c514f12f7a19440f475938f2a4659e5e9667fa25 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 12 Oct 2022 14:01:26 +0200 +Subject: [PATCH 3/4] IPsrcaddr: use scope host when using non-main tables + +--- + heartbeat/IPsrcaddr | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr +index 4745eb8a7..926246008 100755 +--- a/heartbeat/IPsrcaddr ++++ b/heartbeat/IPsrcaddr +@@ -279,8 +279,14 @@ srca_stop() { + + [ $rc = 2 ] && errorexit "The address you specified to stop does not match the preferred source address" + ++ if [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ]; then ++ SCOPE="link" ++ else ++ SCOPE="host" ++ fi ++ + PRIMARY_IP="$($IP2UTIL -4 -o addr show dev $INTERFACE primary | awk '{split($4,a,"/");print a[1]}')" +- OPTS="proto kernel scope link src $PRIMARY_IP" ++ OPTS="proto kernel scope $SCOPE src $PRIMARY_IP" + + $IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC || \ + errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC' failed" + +From 1f387ac8017b3eee23b41eadafd58ce21a29eb21 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 13 Oct 2022 13:11:28 +0200 +Subject: [PATCH 4/4] IPsrcaddr: fix monitor/status for default route not being + equal to src IP before start, and change route src correctly in stop-action + +--- + heartbeat/IPsrcaddr | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr +index 926246008..1bd41a930 100755 +--- a/heartbeat/IPsrcaddr ++++ b/heartbeat/IPsrcaddr +@@ -229,6 +229,7 @@ srca_read() { + + [ -z "$SRCIP" ] && return 1 + [ $SRCIP = $1 ] && return 0 ++ [ "$__OCF_ACTION" = "monitor" ] || [ "$__OCF_ACTION" = "status" ] && [ "${ROUTE%% *}" = "default" ] && return 1 + return 2 + } + +@@ -292,8 +293,8 @@ srca_stop() { + errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC' failed" + + if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then +- $CMDCHANGE $ROUTE_WO_SRC || \ +- errorexit "command '$CMDCHANGE $ROUTE_WO_SRC' failed" ++ $CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP || \ ++ errorexit "command '$CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP' failed" + fi + + return $? diff --git a/SOURCES/bz2134536-IPsrcaddr-proto-metric-scope-default-route-fixes.patch b/SOURCES/bz2134536-IPsrcaddr-proto-metric-scope-default-route-fixes.patch deleted file mode 100644 index 8722395..0000000 --- a/SOURCES/bz2134536-IPsrcaddr-proto-metric-scope-default-route-fixes.patch +++ /dev/null @@ -1,147 +0,0 @@ -From 237d55120a7c8d761f839c96651e722b3bb3bc88 Mon Sep 17 00:00:00 2001 -From: Oyvind Albrigtsen -Date: Wed, 12 Oct 2022 13:57:30 +0200 -Subject: [PATCH 1/4] IPsrcaddr: fix PROTO regex - ---- - heartbeat/IPsrcaddr | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr -index 7dbf65ff5..24406d296 100755 ---- a/heartbeat/IPsrcaddr -+++ b/heartbeat/IPsrcaddr -@@ -188,7 +188,7 @@ IPADDR="\($OCTET\.\)\{3\}$OCTET" - SRCCLAUSE="src$WS$WS*\($IPADDR\)" - MATCHROUTE="\(.*${WS}\)\($SRCCLAUSE\)\($WS.*\|$\)" - METRICCLAUSE=".*\(metric$WS[^ ]\+\)" --PROTOCLAUSE=".*\(proto$WS[^ ]\+\)" -+PROTOCLAUSE=".*\(proto$WS[^ ]\+\).*" - FINDIF=findif - - # findif needs that to be set - -From c70ba457851a401cb201cb87d23bdbc5f4fcd2b3 Mon Sep 17 00:00:00 2001 -From: Oyvind Albrigtsen -Date: Wed, 12 Oct 2022 14:00:30 +0200 -Subject: [PATCH 2/4] IPsrcaddr: detect metric for main table only, and allow - specifying metric if necessary - ---- - heartbeat/IPsrcaddr | 18 +++++++++++++++++- - 1 file changed, 17 insertions(+), 1 deletion(-) - -diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr -index 24406d296..4745eb8a7 100755 ---- a/heartbeat/IPsrcaddr -+++ b/heartbeat/IPsrcaddr -@@ -59,12 +59,14 @@ OCF_RESKEY_ipaddress_default="" - OCF_RESKEY_cidr_netmask_default="" - OCF_RESKEY_destination_default="0.0.0.0/0" - OCF_RESKEY_proto_default="" -+OCF_RESKEY_metric_default="" - OCF_RESKEY_table_default="" - - : ${OCF_RESKEY_ipaddress=${OCF_RESKEY_ipaddress_default}} - : ${OCF_RESKEY_cidr_netmask=${OCF_RESKEY_cidr_netmask_default}} - : ${OCF_RESKEY_destination=${OCF_RESKEY_destination_default}} - : ${OCF_RESKEY_proto=${OCF_RESKEY_proto_default}} -+: ${OCF_RESKEY_metric=${OCF_RESKEY_metric_default}} - : ${OCF_RESKEY_table=${OCF_RESKEY_table_default}} - ####################################################################### - -@@ -143,6 +145,14 @@ Proto to match when finding network. E.g. "kernel". - - - -+ -+ -+Metric. Only needed if incorrect metric value is used. -+ -+Metric -+ -+ -+ - - - Table to modify. E.g. "local". -@@ -548,8 +558,14 @@ rc=$? - - INTERFACE=`echo $findif_out | awk '{print $1}'` - LISTROUTE=`$IP2UTIL route list dev $INTERFACE scope link $PROTO match $ipaddress` --METRIC=`echo $LISTROUTE | sed -n "s/$METRICCLAUSE/\1/p"` - [ -z "$PROTO" ] && PROTO=`echo $LISTROUTE | sed -n "s/$PROTOCLAUSE/\1/p"` -+if [ -n "$OCF_RESKEY_metric" ]; then -+ METRIC="metric $OCF_RESKEY_metric" -+elif [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ]; then -+ METRIC=`echo $LISTROUTE | sed -n "s/$METRICCLAUSE/\1/p"` -+else -+ METRIC="" -+fi - if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then - NETWORK=`echo $LISTROUTE | grep -m 1 -o '^[^ ]*'` - - -From c514f12f7a19440f475938f2a4659e5e9667fa25 Mon Sep 17 00:00:00 2001 -From: Oyvind Albrigtsen -Date: Wed, 12 Oct 2022 14:01:26 +0200 -Subject: [PATCH 3/4] IPsrcaddr: use scope host when using non-main tables - ---- - heartbeat/IPsrcaddr | 8 +++++++- - 1 file changed, 7 insertions(+), 1 deletion(-) - -diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr -index 4745eb8a7..926246008 100755 ---- a/heartbeat/IPsrcaddr -+++ b/heartbeat/IPsrcaddr -@@ -279,8 +279,14 @@ srca_stop() { - - [ $rc = 2 ] && errorexit "The address you specified to stop does not match the preferred source address" - -+ if [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ]; then -+ SCOPE="link" -+ else -+ SCOPE="host" -+ fi -+ - PRIMARY_IP="$($IP2UTIL -4 -o addr show dev $INTERFACE primary | awk '{split($4,a,"/");print a[1]}')" -- OPTS="proto kernel scope link src $PRIMARY_IP" -+ OPTS="proto kernel scope $SCOPE src $PRIMARY_IP" - - $IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC || \ - errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC' failed" - -From 1f387ac8017b3eee23b41eadafd58ce21a29eb21 Mon Sep 17 00:00:00 2001 -From: Oyvind Albrigtsen -Date: Thu, 13 Oct 2022 13:11:28 +0200 -Subject: [PATCH 4/4] IPsrcaddr: fix monitor/status for default route not being - equal to src IP before start, and change route src correctly in stop-action - ---- - heartbeat/IPsrcaddr | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr -index 926246008..1bd41a930 100755 ---- a/heartbeat/IPsrcaddr -+++ b/heartbeat/IPsrcaddr -@@ -229,6 +229,7 @@ srca_read() { - - [ -z "$SRCIP" ] && return 1 - [ $SRCIP = $1 ] && return 0 -+ [ "$__OCF_ACTION" = "monitor" ] || [ "$__OCF_ACTION" = "status" ] && [ "${ROUTE%% *}" = "default" ] && return 1 - return 2 - } - -@@ -292,8 +293,8 @@ srca_stop() { - errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC' failed" - - if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then -- $CMDCHANGE $ROUTE_WO_SRC || \ -- errorexit "command '$CMDCHANGE $ROUTE_WO_SRC' failed" -+ $CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP || \ -+ errorexit "command '$CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP' failed" - fi - - return $? diff --git a/SOURCES/bz2139131-mysql-common-return-error-if-kill-fails.patch b/SOURCES/bz2139131-mysql-common-return-error-if-kill-fails.patch new file mode 100644 index 0000000..e6267f8 --- /dev/null +++ b/SOURCES/bz2139131-mysql-common-return-error-if-kill-fails.patch @@ -0,0 +1,25 @@ +From 97a05e0e662ed922c9ecd016b39ab90ee233d5c9 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 24 Nov 2022 10:36:56 +0100 +Subject: [PATCH] mysql-common: return error in stop-action if kill fails to + stop the process, so the node can get fenced + +--- + heartbeat/mysql-common.sh | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/heartbeat/mysql-common.sh b/heartbeat/mysql-common.sh +index 34e1c6748..8104019b0 100755 +--- a/heartbeat/mysql-common.sh ++++ b/heartbeat/mysql-common.sh +@@ -318,6 +318,10 @@ mysql_common_stop() + if [ $? != $OCF_NOT_RUNNING ]; then + ocf_log info "MySQL failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL..." + /bin/kill -KILL $pid > /dev/null ++ mysql_common_status info $pid ++ if [ $? != $OCF_NOT_RUNNING ]; then ++ return $OCF_ERR_GENERIC ++ fi + fi + + ocf_log info "MySQL stopped"; diff --git a/SOURCES/bz2141836-vdo-vol-dont-fail-probe-action.patch b/SOURCES/bz2141836-vdo-vol-dont-fail-probe-action.patch new file mode 100644 index 0000000..28c28ce --- /dev/null +++ b/SOURCES/bz2141836-vdo-vol-dont-fail-probe-action.patch @@ -0,0 +1,27 @@ +From 739e6ce9096facd6d37dffd524c79c961e3fae38 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Fri, 11 Nov 2022 14:17:39 +0100 +Subject: [PATCH] vdo-vol: dont fail probe action when the underlying device + doesnt exist + +--- + heartbeat/vdo-vol | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/heartbeat/vdo-vol b/heartbeat/vdo-vol +index 94822cb82..29bd7b8fd 100755 +--- a/heartbeat/vdo-vol ++++ b/heartbeat/vdo-vol +@@ -148,6 +148,12 @@ vdo_monitor(){ + MODE=$(vdostats --verbose ${OCF_RESKEY_volume} | grep "operating mode" | awk '{print $NF}') + + case "$status" in ++ *"ERROR - vdodumpconfig: Failed to make FileLayer from"*) ++ if ocf_is_probe; then ++ return $OCF_NOT_RUNNING ++ fi ++ return $OCF_ERR_GENERIC ++ ;; + *"Device mapper status: not available"*) + return $OCF_NOT_RUNNING + ;; diff --git a/SOURCES/bz2144866-vdo-vol-dont-fail-probe-action.patch b/SOURCES/bz2144866-vdo-vol-dont-fail-probe-action.patch deleted file mode 100644 index 28c28ce..0000000 --- a/SOURCES/bz2144866-vdo-vol-dont-fail-probe-action.patch +++ /dev/null @@ -1,27 +0,0 @@ -From 739e6ce9096facd6d37dffd524c79c961e3fae38 Mon Sep 17 00:00:00 2001 -From: Oyvind Albrigtsen -Date: Fri, 11 Nov 2022 14:17:39 +0100 -Subject: [PATCH] vdo-vol: dont fail probe action when the underlying device - doesnt exist - ---- - heartbeat/vdo-vol | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/heartbeat/vdo-vol b/heartbeat/vdo-vol -index 94822cb82..29bd7b8fd 100755 ---- a/heartbeat/vdo-vol -+++ b/heartbeat/vdo-vol -@@ -148,6 +148,12 @@ vdo_monitor(){ - MODE=$(vdostats --verbose ${OCF_RESKEY_volume} | grep "operating mode" | awk '{print $NF}') - - case "$status" in -+ *"ERROR - vdodumpconfig: Failed to make FileLayer from"*) -+ if ocf_is_probe; then -+ return $OCF_NOT_RUNNING -+ fi -+ return $OCF_ERR_GENERIC -+ ;; - *"Device mapper status: not available"*) - return $OCF_NOT_RUNNING - ;; diff --git a/SOURCES/bz2157873-1-all-ras-validate-all-OCF_CHECK_LEVEL-10.patch b/SOURCES/bz2157873-1-all-ras-validate-all-OCF_CHECK_LEVEL-10.patch new file mode 100644 index 0000000..85f5f48 --- /dev/null +++ b/SOURCES/bz2157873-1-all-ras-validate-all-OCF_CHECK_LEVEL-10.patch @@ -0,0 +1,137 @@ +From bf89ad06d5da5c05533c80a37a37c8dbbcd123aa Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 8 Dec 2022 15:40:07 +0100 +Subject: [PATCH] galera/mpathpersist/sg_persist/IPsrcaddr: only check notify + and promotable when OCF_CHECK_LEVEL=10 + +Pacemaker has started running validate-all action before creating the +resource. It doesnt provide notify/promotable settings while doing so, +so this patch moves these checks to OCF_CHECK_LEVEL 10 and runs the +validate action at OCF_CHECK_LEVEL 10 in the start-action. +--- + heartbeat/IPsrcaddr | 13 ++++++++----- + heartbeat/galera.in | 9 ++++++--- + heartbeat/mpathpersist.in | 13 +++++++++---- + heartbeat/sg_persist.in | 13 +++++++++---- + 4 files changed, 32 insertions(+), 16 deletions(-) + +diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr +index 1bd41a930..66e2ad8cd 100755 +--- a/heartbeat/IPsrcaddr ++++ b/heartbeat/IPsrcaddr +@@ -510,11 +510,13 @@ srca_validate_all() { + fi + + # We should serve this IP address of course +- if ip_status "$ipaddress"; then +- : +- else +- ocf_exit_reason "We are not serving [$ipaddress], hence can not make it a preferred source address" +- return $OCF_ERR_INSTALLED ++ if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then ++ if ip_status "$ipaddress"; then ++ : ++ else ++ ocf_exit_reason "We are not serving [$ipaddress], hence can not make it a preferred source address" ++ return $OCF_ERR_INSTALLED ++ fi + fi + return $OCF_SUCCESS + } +@@ -540,6 +542,7 @@ esac + + ipaddress="$OCF_RESKEY_ipaddress" + ++[ "$__OCF_ACTION" != "validate-all" ] && OCF_CHECK_LEVEL=10 + srca_validate_all + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then +diff --git a/heartbeat/galera.in b/heartbeat/galera.in +index cd2fee7c0..6aed3e4b6 100755 +--- a/heartbeat/galera.in ++++ b/heartbeat/galera.in +@@ -1015,9 +1015,11 @@ galera_stop() + + galera_validate() + { +- if ! ocf_is_ms; then +- ocf_exit_reason "Galera must be configured as a multistate Master/Slave resource." +- return $OCF_ERR_CONFIGURED ++ if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then ++ if ! ocf_is_ms; then ++ ocf_exit_reason "Galera must be configured as a multistate Master/Slave resource." ++ return $OCF_ERR_CONFIGURED ++ fi + fi + + if [ -z "$OCF_RESKEY_wsrep_cluster_address" ]; then +@@ -1035,6 +1037,7 @@ case "$1" in + exit $OCF_SUCCESS;; + esac + ++[ "$__OCF_ACTION" = "start" ] && OCF_CHECK_LEVEL=10 + galera_validate + rc=$? + LSB_STATUS_STOPPED=3 +diff --git a/heartbeat/mpathpersist.in b/heartbeat/mpathpersist.in +index 0e2c2a4a0..8a46b9930 100644 +--- a/heartbeat/mpathpersist.in ++++ b/heartbeat/mpathpersist.in +@@ -630,10 +630,11 @@ mpathpersist_action_notify() { + } + + mpathpersist_action_validate_all () { +- +- if [ "$OCF_RESKEY_CRM_meta_master_max" != "1" ] && [ "$RESERVATION_TYPE" != "7" ] && [ "$RESERVATION_TYPE" != "8" ]; then +- ocf_log err "Master options misconfigured." +- exit $OCF_ERR_CONFIGURED ++ if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then ++ if [ "$OCF_RESKEY_CRM_meta_master_max" != "1" ] && [ "$RESERVATION_TYPE" != "7" ] && [ "$RESERVATION_TYPE" != "8" ]; then ++ ocf_log err "Master options misconfigured." ++ exit $OCF_ERR_CONFIGURED ++ fi + fi + + return $OCF_SUCCESS +@@ -659,6 +660,10 @@ case $ACTION in + start|promote|monitor|stop|demote) + ocf_log debug "$RESOURCE: starting action \"$ACTION\"" + mpathpersist_init ++ if [ "$__OCF_ACTION" = "start" ]; then ++ OCF_CHECK_LEVEL=10 ++ mpathpersist_action_validate_all ++ fi + mpathpersist_action_$ACTION + exit $? + ;; +diff --git a/heartbeat/sg_persist.in b/heartbeat/sg_persist.in +index 16048ea6f..620c02f4a 100644 +--- a/heartbeat/sg_persist.in ++++ b/heartbeat/sg_persist.in +@@ -643,10 +643,11 @@ sg_persist_action_notify() { + } + + sg_persist_action_validate_all () { +- +- if [ "$OCF_RESKEY_CRM_meta_master_max" != "1" ] && [ "$RESERVATION_TYPE" != "7" ] && [ "$RESERVATION_TYPE" != "8" ]; then +- ocf_log err "Master options misconfigured." +- exit $OCF_ERR_CONFIGURED ++ if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then ++ if [ "$OCF_RESKEY_CRM_meta_master_max" != "1" ] && [ "$RESERVATION_TYPE" != "7" ] && [ "$RESERVATION_TYPE" != "8" ]; then ++ ocf_log err "Master options misconfigured." ++ exit $OCF_ERR_CONFIGURED ++ fi + fi + + return $OCF_SUCCESS +@@ -672,6 +673,10 @@ case $ACTION in + start|promote|monitor|stop|demote) + ocf_log debug "$RESOURCE: starting action \"$ACTION\"" + sg_persist_init ++ if [ "$__OCF_ACTION" = "start" ]; then ++ OCF_CHECK_LEVEL=10 ++ sg_persist_action_validate_all ++ fi + sg_persist_action_$ACTION + exit $? + ;; diff --git a/SOURCES/bz2157873-2-Filesystem-CTDB-validate-all-improvements.patch b/SOURCES/bz2157873-2-Filesystem-CTDB-validate-all-improvements.patch new file mode 100644 index 0000000..bd95157 --- /dev/null +++ b/SOURCES/bz2157873-2-Filesystem-CTDB-validate-all-improvements.patch @@ -0,0 +1,49 @@ +From 21666c5c842b8a6028699ee78db75a1d7134fad0 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 4 Jan 2023 10:39:16 +0100 +Subject: [PATCH 1/2] Filesystem: remove validate-all mountpoint warning as it + is auto-created during start-action if it doesnt exist + +--- + heartbeat/Filesystem | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 44270ad98..65088029e 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -851,10 +851,6 @@ Filesystem_monitor() + # + Filesystem_validate_all() + { +- if [ -n "$MOUNTPOINT" ] && [ ! -d "$MOUNTPOINT" ]; then +- ocf_log warn "Mountpoint $MOUNTPOINT does not exist" +- fi +- + # Check if the $FSTYPE is workable + # NOTE: Without inserting the $FSTYPE module, this step may be imprecise + # TODO: This is Linux specific crap. + +From 8a7f40b6ab93d8d39230d864ab06a57ff48d6f1f Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 5 Jan 2023 13:09:48 +0100 +Subject: [PATCH 2/2] CTDB: change public_addresses validate-all warning to + info + +--- + heartbeat/CTDB.in | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/CTDB.in b/heartbeat/CTDB.in +index 46f56cfac..b4af66bc1 100755 +--- a/heartbeat/CTDB.in ++++ b/heartbeat/CTDB.in +@@ -940,7 +940,7 @@ ctdb_validate() { + fi + + if [ -f "${OCF_RESKEY_ctdb_config_dir}/public_addresses" ]; then +- ocf_log warn "CTDB file '${OCF_RESKEY_ctdb_config_dir}/public_addresses' exists - CTDB will try to manage IP failover!" ++ ocf_log info "CTDB file '${OCF_RESKEY_ctdb_config_dir}/public_addresses' exists - CTDB will try to manage IP failover!" + fi + + if [ ! -f "$OCF_RESKEY_ctdb_config_dir/nodes" ]; then diff --git a/SOURCES/bz2157873-3-pgsqlms-validate-all-OCF_CHECK_LEVEL-10.patch b/SOURCES/bz2157873-3-pgsqlms-validate-all-OCF_CHECK_LEVEL-10.patch new file mode 100644 index 0000000..7b98a63 --- /dev/null +++ b/SOURCES/bz2157873-3-pgsqlms-validate-all-OCF_CHECK_LEVEL-10.patch @@ -0,0 +1,68 @@ +--- a/heartbeat/pgsqlms 2023-01-04 14:42:36.093258702 +0100 ++++ b/heartbeat/pgsqlms 2023-01-04 14:40:52.403994545 +0100 +@@ -66,6 +66,7 @@ + my $maxlag = $ENV{'OCF_RESKEY_maxlag'} || $maxlag_default; + my $recovery_tpl = $ENV{'OCF_RESKEY_recovery_template'} + || "$pgdata/recovery.conf.pcmk"; ++my $ocf_check_level = $ENV{'OCF_CHECK_LEVEL'} || 0; + + + # PostgreSQL commands path +@@ -1304,26 +1305,28 @@ + return $OCF_ERR_INSTALLED; + } + +- # check notify=true +- $ans = qx{ $CRM_RESOURCE --resource "$OCF_RESOURCE_INSTANCE" \\ +- --meta --get-parameter notify 2>/dev/null }; +- chomp $ans; +- unless ( lc($ans) =~ /^true$|^on$|^yes$|^y$|^1$/ ) { +- ocf_exit_reason( +- 'You must set meta parameter notify=true for your master resource' +- ); +- return $OCF_ERR_INSTALLED; +- } ++ if ( $ocf_check_level == 10 ) { ++ # check notify=true ++ $ans = qx{ $CRM_RESOURCE --resource "$OCF_RESOURCE_INSTANCE" \\ ++ --meta --get-parameter notify 2>/dev/null }; ++ chomp $ans; ++ unless ( lc($ans) =~ /^true$|^on$|^yes$|^y$|^1$/ ) { ++ ocf_exit_reason( ++ 'You must set meta parameter notify=true for your "master" resource' ++ ); ++ return $OCF_ERR_INSTALLED; ++ } + +- # check master-max=1 +- unless ( +- defined $ENV{'OCF_RESKEY_CRM_meta_master_max'} +- and $ENV{'OCF_RESKEY_CRM_meta_master_max'} eq '1' +- ) { +- ocf_exit_reason( +- 'You must set meta parameter master-max=1 for your master resource' +- ); +- return $OCF_ERR_INSTALLED; ++ # check master-max=1 ++ unless ( ++ defined $ENV{'OCF_RESKEY_CRM_meta_master_max'} ++ and $ENV{'OCF_RESKEY_CRM_meta_master_max'} eq '1' ++ ) { ++ ocf_exit_reason( ++ 'You must set meta parameter master-max=1 for your "master" resource' ++ ); ++ return $OCF_ERR_INSTALLED; ++ } + } + + if ( $PGVERNUM >= $PGVER_12 ) { +@@ -2242,6 +2245,9 @@ + # Set current node name. + $nodename = ocf_local_nodename(); + ++if ( $__OCF_ACTION ne 'validate-all' ) { ++ $ocf_check_level = 10; ++} + $exit_code = pgsql_validate_all(); + + exit $exit_code if $exit_code != $OCF_SUCCESS or $__OCF_ACTION eq 'validate-all'; diff --git a/SOURCES/bz2157873-4-exportfs-pgsql-validate-all-fixes.patch b/SOURCES/bz2157873-4-exportfs-pgsql-validate-all-fixes.patch new file mode 100644 index 0000000..d09352d --- /dev/null +++ b/SOURCES/bz2157873-4-exportfs-pgsql-validate-all-fixes.patch @@ -0,0 +1,187 @@ +From 81f9e1a04dfd2274ccb906310b4f191485e342ab Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 11 Jan 2023 13:22:24 +0100 +Subject: [PATCH 1/2] exportfs: move testdir() to start-action to avoid failing + during resource creation (validate-all) and make it create the directory if + it doesnt exist + +--- + heartbeat/exportfs | 27 +++++++++++++++------------ + 1 file changed, 15 insertions(+), 12 deletions(-) + +diff --git a/heartbeat/exportfs b/heartbeat/exportfs +index c10777fa9..2307a9e67 100755 +--- a/heartbeat/exportfs ++++ b/heartbeat/exportfs +@@ -301,6 +301,16 @@ exportfs_monitor () + fi + } + ++testdir() { ++ if [ ! -d $1 ]; then ++ mkdir -p "$1" ++ if [ $? -ne 0 ]; then ++ ocf_exit_reason "Unable to create directory $1" ++ return 1 ++ fi ++ fi ++ return 0 ++} + export_one() { + local dir=$1 + local opts sep +@@ -331,6 +341,10 @@ export_one() { + } + exportfs_start () + { ++ if ! forall testdir; then ++ return $OCF_ERR_INSTALLED ++ fi ++ + if exportfs_monitor; then + ocf_log debug "already exported" + return $OCF_SUCCESS +@@ -428,14 +442,6 @@ exportfs_stop () + fi + } + +-testdir() { +- if [ ! -d $1 ]; then +- ocf_is_probe || +- ocf_log err "$1 does not exist or is not a directory" +- return 1 +- fi +- return 0 +-} + exportfs_validate_all () + { + if echo "$OCF_RESKEY_fsid" | grep -q -F ','; then +@@ -447,9 +453,6 @@ exportfs_validate_all () + ocf_exit_reason "use integer fsid when exporting multiple directories" + return $OCF_ERR_CONFIGURED + fi +- if ! forall testdir; then +- return $OCF_ERR_INSTALLED +- fi + } + + for dir in $OCF_RESKEY_directory; do +@@ -466,7 +469,7 @@ for dir in $OCF_RESKEY_directory; do + fi + else + case "$__OCF_ACTION" in +- stop|monitor) ++ stop|monitor|validate-all) + canonicalized_dir="$dir" + ocf_log debug "$dir does not exist" + ;; + +From 8ee41af82cda35149f8e0cfede6a8ddef3e221e1 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Wed, 11 Jan 2023 13:25:57 +0100 +Subject: [PATCH 2/2] pgsql: dont run promotable and file checks that could be + on shared storage during validate-all action + +--- + heartbeat/pgsql | 53 +++++++++++++++++++++++++++++-------------------- + 1 file changed, 32 insertions(+), 21 deletions(-) + +diff --git a/heartbeat/pgsql b/heartbeat/pgsql +index aa8a13a84..532063ac5 100755 +--- a/heartbeat/pgsql ++++ b/heartbeat/pgsql +@@ -1835,7 +1835,7 @@ check_config() { + + if [ ! -f "$1" ]; then + if ocf_is_probe; then +- ocf_log info "Configuration file is $1 not readable during probe." ++ ocf_log info "Unable to read $1 during probe." + rc=1 + else + ocf_exit_reason "Configuration file $1 doesn't exist" +@@ -1846,8 +1846,7 @@ check_config() { + return $rc + } + +-# Validate most critical parameters +-pgsql_validate_all() { ++validate_ocf_check_level_10() { + local version + local check_config_rc + local rep_mode_string +@@ -1883,12 +1882,6 @@ pgsql_validate_all() { + fi + fi + +- getent passwd $OCF_RESKEY_pgdba >/dev/null 2>&1 +- if [ ! $? -eq 0 ]; then +- ocf_exit_reason "User $OCF_RESKEY_pgdba doesn't exist"; +- return $OCF_ERR_INSTALLED; +- fi +- + if ocf_is_probe; then + ocf_log info "Don't check $OCF_RESKEY_pgdata during probe" + else +@@ -1898,18 +1891,6 @@ pgsql_validate_all() { + fi + fi + +- if [ -n "$OCF_RESKEY_monitor_user" -a ! -n "$OCF_RESKEY_monitor_password" ] +- then +- ocf_exit_reason "monitor password can't be empty" +- return $OCF_ERR_CONFIGURED +- fi +- +- if [ ! -n "$OCF_RESKEY_monitor_user" -a -n "$OCF_RESKEY_monitor_password" ] +- then +- ocf_exit_reason "monitor_user has to be set if monitor_password is set" +- return $OCF_ERR_CONFIGURED +- fi +- + if is_replication || [ "$OCF_RESKEY_rep_mode" = "slave" ]; then + if [ `printf "$version\n9.1" | sort -n | head -1` != "9.1" ]; then + ocf_exit_reason "Replication mode needs PostgreSQL 9.1 or higher." +@@ -2027,6 +2008,35 @@ pgsql_validate_all() { + return $OCF_SUCCESS + } + ++# Validate most critical parameters ++pgsql_validate_all() { ++ local rc ++ ++ getent passwd $OCF_RESKEY_pgdba >/dev/null 2>&1 ++ if [ ! $? -eq 0 ]; then ++ ocf_exit_reason "User $OCF_RESKEY_pgdba doesn't exist"; ++ return $OCF_ERR_INSTALLED; ++ fi ++ ++ if [ -n "$OCF_RESKEY_monitor_user" ] && [ -z "$OCF_RESKEY_monitor_password" ]; then ++ ocf_exit_reason "monitor password can't be empty" ++ return $OCF_ERR_CONFIGURED ++ fi ++ ++ if [ -z "$OCF_RESKEY_monitor_user" ] && [ -n "$OCF_RESKEY_monitor_password" ]; then ++ ocf_exit_reason "monitor_user has to be set if monitor_password is set" ++ return $OCF_ERR_CONFIGURED ++ fi ++ ++ if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then ++ validate_ocf_check_level_10 ++ rc=$? ++ [ $rc -ne "$OCF_SUCCESS" ] && exit $rc ++ fi ++ ++ return $OCF_SUCCESS ++} ++ + + # + # Check if we need to create a log file +@@ -2163,6 +2173,7 @@ case "$1" in + exit $OCF_SUCCESS;; + esac + ++[ "$__OCF_ACTION" != "validate-all" ] && OCF_CHECK_LEVEL=10 + pgsql_validate_all + rc=$? + diff --git a/SOURCES/bz2157873-5-pgsqlms-alidate-all-OCF_CHECK_LEVEL-10.patch b/SOURCES/bz2157873-5-pgsqlms-alidate-all-OCF_CHECK_LEVEL-10.patch new file mode 100644 index 0000000..0642086 --- /dev/null +++ b/SOURCES/bz2157873-5-pgsqlms-alidate-all-OCF_CHECK_LEVEL-10.patch @@ -0,0 +1,23 @@ +--- ClusterLabs-resource-agents-fd0720f7/heartbeat/pgsqlms 2023-01-16 10:54:30.897188238 +0100 ++++ pgsqlms 2023-01-10 14:21:19.281286242 +0100 +@@ -1351,12 +1351,14 @@ + return $OCF_ERR_ARGS; + } + +- $guc = qx{ $POSTGRES -C primary_conninfo -D "$pgdata" $start_opts}; +- unless ($guc =~ /\bapplication_name='?$nodename'?\b/) { +- ocf_exit_reason( +- q{Parameter "primary_conninfo" MUST contain 'application_name=%s'. }. +- q{It is currently set to '%s'}, $nodename, $guc ); +- return $OCF_ERR_ARGS; ++ if ( $ocf_check_level == 10 ) { ++ $guc = qx{ $POSTGRES -C primary_conninfo -D "$pgdata" $start_opts}; ++ unless ($guc =~ /\bapplication_name='?$nodename'?\b/) { ++ ocf_exit_reason( ++ q{Parameter "primary_conninfo" MUST contain 'application_name=%s'. }. ++ q{It is currently set to '%s'}, $nodename, $guc ); ++ return $OCF_ERR_ARGS; ++ } + } + } + else { diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec index fc41590..7613def 100644 --- a/SPECS/resource-agents.spec +++ b/SPECS/resource-agents.spec @@ -69,7 +69,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.9.0 -Release: 29%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.3 +Release: 40%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents %if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel} @@ -120,9 +120,22 @@ Patch28: bz2103370-ocf-tester-2-remove-deprecated-lrmd-lrmadmin-code.patch Patch29: bz1908146-bz1908147-bz1908148-bz1949114-openstack-agents-set-domain-parameters-default.patch Patch30: bz2090370-CTDB-move-process-to-root-cgroup-if-rt-enabled.patch Patch31: bz2116941-ethmonitor-ovsmonitor-pgsql-fix-attrd_updater-q.patch -Patch32: bz2130986-azure-events-az-new-ra.patch -Patch33: bz2134536-IPsrcaddr-proto-metric-scope-default-route-fixes.patch -Patch34: bz2144866-vdo-vol-dont-fail-probe-action.patch +Patch32: bz2109159-storage_mon-1-exit-after-help.patch +Patch33: bz2109159-storage_mon-2-fix-specified-scores-count.patch +Patch34: bz2109159-storage_mon-3-fix-child-process-exit.patch +Patch35: bz2109159-storage_mon-4-fix-possible-false-negatives.patch +Patch36: bz1905820-LVM-activate-fix-return-codes.patch +Patch37: bz1977012-azure-events-az-new-ra.patch +Patch38: bz2133682-IPsrcaddr-proto-metric-scope-default-route-fixes.patch +Patch39: bz2141836-vdo-vol-dont-fail-probe-action.patch +Patch40: bz2049319-Filesystem-add-support-for-Amazon-EFS.patch +Patch41: bz2127117-nfsserver-nfsv4_only-parameter.patch +Patch42: bz2139131-mysql-common-return-error-if-kill-fails.patch +Patch43: bz2157873-1-all-ras-validate-all-OCF_CHECK_LEVEL-10.patch +Patch44: bz2157873-2-Filesystem-CTDB-validate-all-improvements.patch +Patch45: bz2157873-3-pgsqlms-validate-all-OCF_CHECK_LEVEL-10.patch +Patch46: bz2157873-4-exportfs-pgsql-validate-all-fixes.patch +Patch47: bz2157873-5-pgsqlms-alidate-all-OCF_CHECK_LEVEL-10.patch # bundle patches Patch1000: 7-gcp-bundled.patch @@ -337,6 +350,19 @@ exit 1 %patch32 -p1 %patch33 -p1 %patch34 -p1 +%patch35 -p1 +%patch36 -p1 +%patch37 -p1 +%patch38 -p1 +%patch39 -p1 +%patch40 -p1 +%patch41 -p1 +%patch42 -p1 +%patch43 -p1 +%patch44 -p1 +%patch45 -p1 +%patch46 -p1 +%patch47 -p1 chmod 755 heartbeat/nova-compute-wait chmod 755 heartbeat/NovaEvacuate @@ -912,21 +938,45 @@ ccs_update_schema > /dev/null 2>&1 ||: %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog -* Tue Nov 22 2022 Oyvind Albrigtsen - 4.9.0-29.3 +* Tue Jan 17 2023 Oyvind Albrigtsen - 4.9.0-40 +- all agents: dont check notify/promotable settings during + validate-action + + Resolves: rhbz#2157873 + +* Thu Nov 24 2022 Oyvind Albrigtsen - 4.9.0-35 +- mysql-common: return error in stop-action if kill fails to stop + the process, so the node can get fenced + + Resolves: rhbz#2139131 + +* Tue Nov 22 2022 Oyvind Albrigtsen - 4.9.0-34 +- nfsserver: add nfsv4_only parameter to make it run without + rpc-statd/rpcbind services + + Resolves: rhbz#2127117 + +* Mon Nov 14 2022 Oyvind Albrigtsen - 4.9.0-33 +- Filesystem: add support for Amazon EFS (Elastic File System) - vdo-vol: dont fail probe action when the underlying device doesnt exist - Resolves: rhbz#2144866 + Resolves: rhbz#2049319 + Resolves: rhbz#2141836 -* Fri Oct 14 2022 Oyvind Albrigtsen - 4.9.0-29.2 +* Fri Oct 14 2022 Oyvind Albrigtsen - 4.9.0-31 - IPsrcaddr: proto, metric, scope and default route fixes - Resolves: rhbz#2134536 + Resolves: rhbz#2133682 -* Mon Oct 3 2022 Oyvind Albrigtsen - 4.9.0-29.1 +* Thu Sep 8 2022 Oyvind Albrigtsen - 4.9.0-30 +- storage_mon: fix specified scores count and possible false negatives +- LVM-activate: use correct return codes to fix unexpected behaviour - azure-events-az: new resource agent - Resolves: rhbz#2130986 + Resolves: rhbz#2109159 + Resolves: rhbz#1905820 + Resolves: rhbz#1977012 * Wed Aug 10 2022 Oyvind Albrigtsen - 4.9.0-29 - ethmonitor/pgsql: remove attrd_updater "-q" parameter to solve issue