Blob Blame History Raw
From: Andrew Beekhof <andrew@beekhof.net>
Date: Tue, 1 Sep 2015 13:17:45 +1000
Subject: [PATCH] Feature: crmd: Implement reliable event notifications

(cherry picked from commit 0cd1b8f02b403976afe106e0ca3a8a8a16864c6c)
---
 crmd/Makefile.am            |   2 +-
 crmd/callbacks.c            |   4 +
 crmd/control.c              |  67 +++++++++++++---
 crmd/crmd_utils.h           |   1 +
 crmd/lrm.c                  |   2 +
 crmd/notify.c               | 188 ++++++++++++++++++++++++++++++++++++++++++++
 crmd/notify.h               |  30 +++++++
 crmd/te_utils.c             |   2 +
 cts/CIB.py                  |   2 +
 extra/pcmk_notify_sample.sh |  68 ++++++++++++++++
 include/crm_internal.h      |   1 +
 lib/common/utils.c          |  27 +++++++
 12 files changed, 380 insertions(+), 14 deletions(-)
 create mode 100644 crmd/notify.c
 create mode 100644 crmd/notify.h
 create mode 100755 extra/pcmk_notify_sample.sh

diff --git a/crmd/Makefile.am b/crmd/Makefile.am
index 8e5e1df..984f5d0 100644
--- a/crmd/Makefile.am
+++ b/crmd/Makefile.am
@@ -28,7 +28,7 @@ noinst_HEADERS	= crmd.h crmd_fsa.h crmd_messages.h fsa_defines.h 	\
 		fsa_matrix.h fsa_proto.h crmd_utils.h crmd_callbacks.h \
 		crmd_lrm.h te_callbacks.h tengine.h
 
-crmd_SOURCES	= main.c crmd.c corosync.c					\
+crmd_SOURCES	= main.c crmd.c corosync.c notify.c				\
 		fsa.c control.c messages.c membership.c callbacks.c		\
 		election.c join_client.c join_dc.c subsystems.c throttle.c	\
 		cib.c pengine.c tengine.c lrm.c lrm_state.c remote_lrmd_ra.c	\
diff --git a/crmd/callbacks.c b/crmd/callbacks.c
index f646927..38fb30b 100644
--- a/crmd/callbacks.c
+++ b/crmd/callbacks.c
@@ -126,6 +126,7 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
         case crm_status_nstate:
             crm_info("%s is now %s (was %s)",
                      node->uname, state_text(node->state), state_text(data));
+
             if (safe_str_eq(data, node->state)) {
                 /* State did not change */
                 return;
@@ -147,7 +148,10 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d
                     }
                 }
             }
+
+            crmd_notify_node_event(node);
             break;
+
         case crm_status_processes:
             if (data) {
                 old = *(const uint32_t *)data;
diff --git a/crmd/control.c b/crmd/control.c
index f4add49..d92f46b 100644
--- a/crmd/control.c
+++ b/crmd/control.c
@@ -873,28 +873,64 @@ do_recover(long long action,
 
 /* *INDENT-OFF* */
 pe_cluster_option crmd_opts[] = {
-	/* name, old-name, validate, default, description */
-	{ "dc-version", NULL, "string", NULL, "none", NULL, "Version of Pacemaker on the cluster's DC.", "Includes the hash which identifies the exact Mercurial changeset it was built from.  Used for diagnostic purposes." },
-	{ "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL, "The messaging stack on which Pacemaker is currently running.", "Used for informational and diagnostic purposes." },
-	{ XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time, "How long to wait for a response from other nodes during startup.", "The \"correct\" value will depend on the speed/load of your network and the type of switches used." },
+	/* name, old-name, validate, values, default, short description, long description */
+	{ "dc-version", NULL, "string", NULL, "none", NULL,
+          "Version of Pacemaker on the cluster's DC.",
+          "Includes the hash which identifies the exact changeset it was built from.  Used for diagnostic purposes."
+        },
+	{ "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL,
+          "The messaging stack on which Pacemaker is currently running.",
+          "Used for informational and diagnostic purposes." },
+	{ XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time,
+          "How long to wait for a response from other nodes during startup.",
+          "The \"correct\" value will depend on the speed/load of your network and the type of switches used."
+        },
 	{ XML_CONFIG_ATTR_RECHECK, "cluster_recheck_interval", "time",
-	  "Zero disables polling.  Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)", "15min", &check_timer,
+	  "Zero disables polling.  Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)",
+          "15min", &check_timer,
 	  "Polling interval for time based changes to options, resource parameters and constraints.",
 	  "The Cluster is primarily event driven, however the configuration can have elements that change based on time."
-	  "  To ensure these changes take effect, we can optionally poll the cluster's status for changes." },
+	  "  To ensure these changes take effect, we can optionally poll the cluster's status for changes."
+        },
+
+	{ "notification-script", NULL, "string", NULL, "/dev/null", &check_script,
+          "Notification script to be called after significant cluster events",
+          "Full path to a script that will be invoked when resources start/stop/fail, fencing occurs or nodes join/leave the cluster.\n"
+          "Must exist on all nodes in the cluster."
+        },
+	{ "notification-target", NULL, "string", NULL, "", NULL,
+          "Destination for notifications (Optional)",
+          "Where should the supplied script send notifications to.  Useful to avoid hard-coding this in the script."
+        },
+
 	{ "load-threshold", NULL, "percentage", NULL, "80%", &check_utilization,
 	  "The maximum amount of system resources that should be used by nodes in the cluster",
 	  "The cluster will slow down its recovery process when the amount of system resources used"
-          " (currently CPU) approaches this limit", },
+          " (currently CPU) approaches this limit",
+        },
 	{ "node-action-limit", NULL, "integer", NULL, "0", &check_number,
           "The maximum number of jobs that can be scheduled per node. Defaults to 2x cores"},
-	{ XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." },
-	{ XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." },
-	{ "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." },
-	{ "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." },
-	{ "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, "*** Advanced Use Only ***\nEnabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\nUseful if your configuration is sensitive to the order in which ping updates arrive." },
+	{ XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer,
+          "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug."
+        },
+	{ XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer,
+          "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug."
+        },
+	{ "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer,
+          "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug."
+        },
+	{ "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer,
+          "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug."
+        },
+	{ "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer,
+          "*** Advanced Use Only ***\n"
+          "Enabling this option will slow down cluster recovery under all conditions",
+          "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\n"
+          "Useful if your configuration is sensitive to the order in which ping updates arrive."
+        },
 	{ "stonith-watchdog-timeout", NULL, "time", NULL, NULL, &check_timer,
-	  "How long to wait before we can assume nodes are safely down", NULL },
+	  "How long to wait before we can assume nodes are safely down", NULL
+        },
 	{ "no-quorum-policy", "no_quorum_policy", "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum, NULL, NULL },
 
 #if SUPPORT_PLUGIN
@@ -927,6 +963,7 @@ crmd_pref(GHashTable * options, const char *name)
 static void
 config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
 {
+    const char *script = NULL;
     const char *value = NULL;
     GHashTable *config_hash = NULL;
     crm_time_t *now = crm_time_new(NULL);
@@ -955,6 +992,10 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
 
     verify_crmd_options(config_hash);
 
+    script = crmd_pref(config_hash, "notification-script");
+    value  = crmd_pref(config_hash, "notification-target");
+    crmd_enable_notifications(script, value);
+
     value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME);
     election_trigger->period_ms = crm_get_msec(value);
 
diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h
index 78214bf..7e8c3e6 100644
--- a/crmd/crmd_utils.h
+++ b/crmd/crmd_utils.h
@@ -21,6 +21,7 @@
 #  include <crm/crm.h>
 #  include <crm/common/xml.h>
 #  include <crm/cib/internal.h> /* For CIB_OP_MODIFY */
+#  include "notify.h"
 
 #  define CLIENT_EXIT_WAIT 30
 #  define FAKE_TE_ID	"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
diff --git a/crmd/lrm.c b/crmd/lrm.c
index 418e7cf..48195e8 100644
--- a/crmd/lrm.c
+++ b/crmd/lrm.c
@@ -2415,6 +2415,8 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr
         free(prefix);
     }
 
+    crmd_notify_resource_op(lrm_state->node_name, op);
+
     if (op->rsc_deleted) {
         crm_info("Deletion of resource '%s' complete after %s", op->rsc_id, op_key);
         delete_rsc_entry(lrm_state, NULL, op->rsc_id, NULL, pcmk_ok, NULL);
diff --git a/crmd/notify.c b/crmd/notify.c
new file mode 100644
index 0000000..980bfa6
--- /dev/null
+++ b/crmd/notify.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2015 Andrew Beekhof <andrew@beekhof.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <crm_internal.h>
+#include <crm/crm.h>
+#include <crm/msg_xml.h>
+#include "notify.h"
+
+char *notify_script = NULL;
+char *notify_target = NULL;
+
+
+static const char *notify_keys[] = 
+{
+    "CRM_notify_recipient",
+    "CRM_notify_node",
+    "CRM_notify_rsc",
+    "CRM_notify_task",
+    "CRM_notify_interval",
+    "CRM_notify_desc",
+    "CRM_notify_status",
+    "CRM_notify_target_rc",
+    "CRM_notify_rc",
+    "CRM_notify_kind",
+    "CRM_notify_version",
+};
+
+
+void
+crmd_enable_notifications(const char *script, const char *target)
+{
+    free(notify_script);
+    notify_script = NULL;
+
+    free(notify_target);
+    notify_target = NULL;
+
+    if(safe_str_eq(script, "/dev/null")) {
+        crm_notice("Notifications disabled");
+        return;
+    }
+
+    notify_script = strdup(script);
+    notify_target = strdup(target);
+    crm_notice("Notifications enabled");
+}
+
+static void
+set_notify_key(const char *name, const char *cvalue, char *value)
+{
+    int lpc;
+    bool found = 0;
+
+    if(cvalue == NULL) {
+        cvalue = value;
+    }
+
+    for(lpc = 0; lpc < DIMOF(notify_keys); lpc++) {
+        if(safe_str_eq(name, notify_keys[lpc])) {
+            found = 1;
+            crm_trace("Setting notify key %s = '%s'", name, cvalue);
+            setenv(name, cvalue, 1);
+            break;
+        }
+    }
+
+    CRM_ASSERT(found != 0);
+    free(value);
+}
+
+
+static void
+send_notification(const char *kind)
+{
+    int lpc;
+    pid_t pid;
+
+    crm_debug("Sending '%s' notification to '%s' via '%s'", kind, notify_target, notify_script);
+
+    set_notify_key("CRM_notify_recipient", notify_target, NULL);
+    set_notify_key("CRM_notify_kind", kind, NULL);
+    set_notify_key("CRM_notify_version", VERSION, NULL);
+
+    pid = fork();
+    if (pid == -1) {
+        crm_perror(LOG_ERR, "notification failed");
+    }
+
+    if (pid == 0) {
+        /* crm_debug("notification: I am the child. Executing the nofitication program."); */
+        execl(notify_script, notify_script, NULL);
+        exit(EXIT_FAILURE);
+
+    } else {
+        for(lpc = 0; lpc < DIMOF(notify_keys); lpc++) {
+            unsetenv(notify_keys[lpc]);
+        }
+    }
+}
+
+void crmd_notify_node_event(crm_node_t *node)
+{
+    if(notify_script == NULL) {
+        return;
+    }
+
+    set_notify_key("CRM_notify_node", node->uname, NULL);
+    set_notify_key("CRM_notify_desc", node->state, NULL);
+
+    send_notification("node");
+}
+
+void
+crmd_notify_fencing_op(stonith_event_t * e)
+{
+    char *desc = NULL;
+
+    if(notify_script) {
+        return;
+    }
+
+    desc = crm_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)",
+                                   e->operation, e->origin, e->target, pcmk_strerror(e->result),
+                                   e->id);
+
+    set_notify_key("CRM_notify_node", e->target, NULL);
+    set_notify_key("CRM_notify_task", e->operation, NULL);
+    set_notify_key("CRM_notify_desc", NULL, desc);
+    set_notify_key("CRM_notify_rc", NULL, crm_itoa(e->result));
+
+    send_notification("fencing");
+}
+
+void
+crmd_notify_resource_op(const char *node, lrmd_event_data_t * op)
+{
+    int target_rc = 0;
+
+    if(notify_script == NULL) {
+        return;
+    }
+
+    target_rc = rsc_op_expected_rc(op);
+    if(op->interval == 0 && target_rc == op->rc && safe_str_eq(op->op_type, RSC_STATUS)) {
+        /* Leave it up to the script if they want to notify for
+         * 'failed' probes, only swallow ones for which the result was
+         * unexpected.
+         *
+         * Even if we find a resource running, it was probably because
+         * someone erased the status section.
+         */
+        return;
+    }
+
+    set_notify_key("CRM_notify_node", node, NULL);
+
+    set_notify_key("CRM_notify_rsc", op->rsc_id, NULL);
+    set_notify_key("CRM_notify_task", op->op_type, NULL);
+    set_notify_key("CRM_notify_interval", NULL, crm_itoa(op->interval));
+
+    set_notify_key("CRM_notify_target_rc", NULL, crm_itoa(target_rc));
+    set_notify_key("CRM_notify_status", NULL, crm_itoa(op->op_status));
+    set_notify_key("CRM_notify_rc", NULL, crm_itoa(op->rc));
+
+    if(op->op_status == PCMK_LRM_OP_DONE) {
+        set_notify_key("CRM_notify_desc", services_ocf_exitcode_str(op->rc), NULL);
+    } else {
+        set_notify_key("CRM_notify_desc", services_lrm_status_str(op->op_status), NULL);
+    }
+
+    send_notification("resource");
+}
+
diff --git a/crmd/notify.h b/crmd/notify.h
new file mode 100644
index 0000000..4b138ea
--- /dev/null
+++ b/crmd/notify.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2015 Andrew Beekhof <andrew@beekhof.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef CRMD_NOTIFY__H
+#  define CRMD_NOTIFY__H
+
+#  include <crm/crm.h>
+#  include <crm/cluster.h>
+#  include <crm/stonith-ng.h>
+
+void crmd_enable_notifications(const char *script, const char *target);
+void crmd_notify_node_event(crm_node_t *node);
+void crmd_notify_fencing_op(stonith_event_t * e);
+void crmd_notify_resource_op(const char *node, lrmd_event_data_t * op);
+
+#endif
diff --git a/crmd/te_utils.c b/crmd/te_utils.c
index a1d29f6..22551ba 100644
--- a/crmd/te_utils.c
+++ b/crmd/te_utils.c
@@ -124,6 +124,8 @@ tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
         return;
     }
 
+    crmd_notify_fencing_op(st_event);
+
     if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) {
         crm_notice("%s was successfully unfenced by %s (at the request of %s)",
                    st_event->target, st_event->executioner ? st_event->executioner : "<anyone>", st_event->origin);
diff --git a/cts/CIB.py b/cts/CIB.py
index 8fbba6c..cd3a6a1 100644
--- a/cts/CIB.py
+++ b/cts/CIB.py
@@ -219,6 +219,8 @@ class CIB11(ConfigBase):
         o["dc-deadtime"] = "5s"
         o["no-quorum-policy"] = no_quorum
         o["expected-quorum-votes"] = self.num_nodes
+        o["notification-script"] = "/var/lib/pacemaker/notify.sh"
+        o["notification-target"] = "/var/lib/pacemaker/notify.log"
 
         if self.CM.Env["DoBSC"] == 1:
             o["ident-string"] = "Linux-HA TEST configuration file - REMOVEME!!"
diff --git a/extra/pcmk_notify_sample.sh b/extra/pcmk_notify_sample.sh
new file mode 100755
index 0000000..83cf8e9
--- /dev/null
+++ b/extra/pcmk_notify_sample.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#
+# Copyright (C) 2015 Andrew Beekhof <andrew@beekhof.net>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This software is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+if [ -z $CRM_notify_version ]; then
+    echo "Pacemaker version 1.1.14 is required" >> ${CRM_notify_recipient}
+    exit 0
+fi
+
+case $CRM_notify_kind in
+    node)
+	echo "Node '${CRM_notify_node}' is now '${CRM_notify_desc}'" >> ${CRM_notify_recipient}
+	;;
+    fencing)
+	# Other keys:
+	# 
+	# CRM_notify_node
+	# CRM_notify_task
+	# CRM_notify_rc
+	#
+	echo "Fencing ${CRM_notify_desc}" >> ${CRM_notify_recipient}
+	;;
+    resource)
+	# Other keys:
+	# 
+	# CRM_notify_target_rc
+	# CRM_notify_status
+	# CRM_notify_rc
+	#
+	if [ ${CRM_notify_interval} = "0" ]; then
+	    CRM_notify_interval=""
+	else
+	    CRM_notify_interval=" (${CRM_notify_interval})"
+	fi
+
+	if [ ${CRM_notify_target_rc} = "0" ]; then
+	    CRM_notify_target_rc=""
+	else
+	    CRM_notify_target_rc=" (target: ${CRM_notify_target_rc})"
+	fi
+	
+	case ${CRM_notify_desc} in
+	    Cancelled) ;;
+	    *)
+		echo "Resource operation '${CRM_notify_task}${CRM_notify_interval}' for '${CRM_notify_rsc}' on '${CRM_notify_node}': ${CRM_notify_desc}${CRM_notify_target_rc}" >> ${CRM_notify_recipient}
+		;;
+	esac
+	;;
+    *)
+        echo "Unhandled $CRM_notify_kind notification" >> ${CRM_notify_recipient}
+	env | grep CRM_notify >> ${CRM_notify_recipient}
+        ;;
+
+esac
diff --git a/include/crm_internal.h b/include/crm_internal.h
index c13bc7b..fb03537 100644
--- a/include/crm_internal.h
+++ b/include/crm_internal.h
@@ -127,6 +127,7 @@ gboolean check_timer(const char *value);
 gboolean check_boolean(const char *value);
 gboolean check_number(const char *value);
 gboolean check_quorum(const char *value);
+gboolean check_script(const char *value);
 gboolean check_utilization(const char *value);
 
 /* Shared PE/crmd functionality */
diff --git a/lib/common/utils.c b/lib/common/utils.c
index 6a234dc..628cf2f 100644
--- a/lib/common/utils.c
+++ b/lib/common/utils.c
@@ -180,6 +180,33 @@ check_quorum(const char *value)
 }
 
 gboolean
+check_script(const char *value)
+{
+    struct stat st;
+
+    if(safe_str_eq(value, "/dev/null")) {
+        return TRUE;
+    }
+
+    if(stat(value, &st) != 0) {
+        crm_err("Script %s does not exist", value);
+        return FALSE;
+    }
+
+    if(S_ISREG(st.st_mode) == 0) {
+        crm_err("Script %s is not a regular file", value);
+        return FALSE;
+    }
+
+    if( (st.st_mode & (S_IXUSR | S_IXGRP )) == 0) {
+        crm_err("Script %s is not executable", value);
+        return FALSE;
+    }
+
+    return TRUE;
+}
+
+gboolean
 check_utilization(const char *value)
 {
     char *end = NULL;