From: Andrew Beekhof Date: Tue, 1 Sep 2015 13:17:45 +1000 Subject: [PATCH] Feature: crmd: Implement reliable event notifications (cherry picked from commit 0cd1b8f02b403976afe106e0ca3a8a8a16864c6c) --- crmd/Makefile.am | 2 +- crmd/callbacks.c | 4 + crmd/control.c | 67 +++++++++++++--- crmd/crmd_utils.h | 1 + crmd/lrm.c | 2 + crmd/notify.c | 188 ++++++++++++++++++++++++++++++++++++++++++++ crmd/notify.h | 30 +++++++ crmd/te_utils.c | 2 + cts/CIB.py | 2 + extra/pcmk_notify_sample.sh | 68 ++++++++++++++++ include/crm_internal.h | 1 + lib/common/utils.c | 27 +++++++ 12 files changed, 380 insertions(+), 14 deletions(-) create mode 100644 crmd/notify.c create mode 100644 crmd/notify.h create mode 100755 extra/pcmk_notify_sample.sh diff --git a/crmd/Makefile.am b/crmd/Makefile.am index 8e5e1df..984f5d0 100644 --- a/crmd/Makefile.am +++ b/crmd/Makefile.am @@ -28,7 +28,7 @@ noinst_HEADERS = crmd.h crmd_fsa.h crmd_messages.h fsa_defines.h \ fsa_matrix.h fsa_proto.h crmd_utils.h crmd_callbacks.h \ crmd_lrm.h te_callbacks.h tengine.h -crmd_SOURCES = main.c crmd.c corosync.c \ +crmd_SOURCES = main.c crmd.c corosync.c notify.c \ fsa.c control.c messages.c membership.c callbacks.c \ election.c join_client.c join_dc.c subsystems.c throttle.c \ cib.c pengine.c tengine.c lrm.c lrm_state.c remote_lrmd_ra.c \ diff --git a/crmd/callbacks.c b/crmd/callbacks.c index f646927..38fb30b 100644 --- a/crmd/callbacks.c +++ b/crmd/callbacks.c @@ -126,6 +126,7 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d case crm_status_nstate: crm_info("%s is now %s (was %s)", node->uname, state_text(node->state), state_text(data)); + if (safe_str_eq(data, node->state)) { /* State did not change */ return; @@ -147,7 +148,10 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d } } } + + crmd_notify_node_event(node); break; + case crm_status_processes: if (data) { old = *(const uint32_t *)data; diff --git a/crmd/control.c b/crmd/control.c index 
f4add49..d92f46b 100644 --- a/crmd/control.c +++ b/crmd/control.c @@ -873,28 +873,64 @@ do_recover(long long action, /* *INDENT-OFF* */ pe_cluster_option crmd_opts[] = { - /* name, old-name, validate, default, description */ - { "dc-version", NULL, "string", NULL, "none", NULL, "Version of Pacemaker on the cluster's DC.", "Includes the hash which identifies the exact Mercurial changeset it was built from. Used for diagnostic purposes." }, - { "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL, "The messaging stack on which Pacemaker is currently running.", "Used for informational and diagnostic purposes." }, - { XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time, "How long to wait for a response from other nodes during startup.", "The \"correct\" value will depend on the speed/load of your network and the type of switches used." }, + /* name, old-name, validate, values, default, short description, long description */ + { "dc-version", NULL, "string", NULL, "none", NULL, + "Version of Pacemaker on the cluster's DC.", + "Includes the hash which identifies the exact changeset it was built from. Used for diagnostic purposes." + }, + { "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL, + "The messaging stack on which Pacemaker is currently running.", + "Used for informational and diagnostic purposes." }, + { XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time, + "How long to wait for a response from other nodes during startup.", + "The \"correct\" value will depend on the speed/load of your network and the type of switches used." + }, { XML_CONFIG_ATTR_RECHECK, "cluster_recheck_interval", "time", - "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)", "15min", &check_timer, + "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 
5min)", + "15min", &check_timer, "Polling interval for time based changes to options, resource parameters and constraints.", "The Cluster is primarily event driven, however the configuration can have elements that change based on time." - " To ensure these changes take effect, we can optionally poll the cluster's status for changes." }, + " To ensure these changes take effect, we can optionally poll the cluster's status for changes." + }, + + { "notification-script", NULL, "string", NULL, "/dev/null", &check_script, + "Notification script to be called after significant cluster events", + "Full path to a script that will be invoked when resources start/stop/fail, fencing occurs or nodes join/leave the cluster.\n" + "Must exist on all nodes in the cluster." + }, + { "notification-target", NULL, "string", NULL, "", NULL, + "Destination for notifications (Optional)", + "Where should the supplied script send notifications to. Useful to avoid hard-coding this in the script." + }, + { "load-threshold", NULL, "percentage", NULL, "80%", &check_utilization, "The maximum amount of system resources that should be used by nodes in the cluster", "The cluster will slow down its recovery process when the amount of system resources used" - " (currently CPU) approaches this limit", }, + " (currently CPU) approaches this limit", + }, { "node-action-limit", NULL, "integer", NULL, "0", &check_number, "The maximum number of jobs that can be scheduled per node. Defaults to 2x cores"}, - { XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, - { XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." 
}, - { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, - { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." }, - { "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, "*** Advanced Use Only ***\nEnabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\nUseful if your configuration is sensitive to the order in which ping updates arrive." }, + { XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer, + "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." + }, + { XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer, + "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." + }, + { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, + "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." + }, + { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, + "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." + }, + { "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, + "*** Advanced Use Only ***\n" + "Enabling this option will slow down cluster recovery under all conditions", + "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\n" + "Useful if your configuration is sensitive to the order in which ping updates arrive." 
+ }, { "stonith-watchdog-timeout", NULL, "time", NULL, NULL, &check_timer, - "How long to wait before we can assume nodes are safely down", NULL }, + "How long to wait before we can assume nodes are safely down", NULL + }, { "no-quorum-policy", "no_quorum_policy", "enum", "stop, freeze, ignore, suicide", "stop", &check_quorum, NULL, NULL }, #if SUPPORT_PLUGIN @@ -927,6 +963,7 @@ crmd_pref(GHashTable * options, const char *name) static void config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { + const char *script = NULL; const char *value = NULL; GHashTable *config_hash = NULL; crm_time_t *now = crm_time_new(NULL); @@ -955,6 +992,10 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void verify_crmd_options(config_hash); + script = crmd_pref(config_hash, "notification-script"); + value = crmd_pref(config_hash, "notification-target"); + crmd_enable_notifications(script, value); + value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); election_trigger->period_ms = crm_get_msec(value); diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h index 78214bf..7e8c3e6 100644 --- a/crmd/crmd_utils.h +++ b/crmd/crmd_utils.h @@ -21,6 +21,7 @@ # include # include # include /* For CIB_OP_MODIFY */ +# include "notify.h" # define CLIENT_EXIT_WAIT 30 # define FAKE_TE_ID "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" diff --git a/crmd/lrm.c b/crmd/lrm.c index 418e7cf..48195e8 100644 --- a/crmd/lrm.c +++ b/crmd/lrm.c @@ -2415,6 +2415,8 @@ process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurr free(prefix); } + crmd_notify_resource_op(lrm_state->node_name, op); + if (op->rsc_deleted) { crm_info("Deletion of resource '%s' complete after %s", op->rsc_id, op_key); delete_rsc_entry(lrm_state, NULL, op->rsc_id, NULL, pcmk_ok, NULL); diff --git a/crmd/notify.c b/crmd/notify.c new file mode 100644 index 0000000..980bfa6 --- /dev/null +++ b/crmd/notify.c @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2015 Andrew 
Beekhof
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <crm_internal.h>
+#include <crm/crm.h>
+#include <crm/msg_xml.h>
+#include "notify.h"
+
+char *notify_script = NULL;
+char *notify_target = NULL;
+
+/* The only environment variables set_notify_key() will export to the script */
+static const char *notify_keys[] =
+{
+    "CRM_notify_recipient",
+    "CRM_notify_node",
+    "CRM_notify_rsc",
+    "CRM_notify_task",
+    "CRM_notify_interval",
+    "CRM_notify_desc",
+    "CRM_notify_status",
+    "CRM_notify_target_rc",
+    "CRM_notify_rc",
+    "CRM_notify_kind",
+    "CRM_notify_version",
+};
+
+/* Remember the script and recipient to use; a NULL or "/dev/null" script disables notifications */
+void
+crmd_enable_notifications(const char *script, const char *target)
+{
+    free(notify_script);
+    notify_script = NULL;
+
+    free(notify_target);
+    notify_target = NULL;
+
+    if(script == NULL || safe_str_eq(script, "/dev/null")) {
+        crm_notice("Notifications disabled");
+        return;
+    }
+
+    notify_script = strdup(script);
+    notify_target = strdup(target ? target : "");
+    crm_notice("Notifications enabled");
+}
+
+static void
+set_notify_key(const char *name, const char *cvalue, char *value)
+{
+    int lpc;
+    bool found = 0;
+    /* 'value' is an optional heap-allocated alternative to 'cvalue'; it is always freed below */
+    if(cvalue == NULL) {
+        cvalue = value ? value : "";
+    }
+
+    for(lpc = 0; lpc < DIMOF(notify_keys); lpc++) {
+        if(safe_str_eq(name, notify_keys[lpc])) {
+            found = 1;
+            crm_trace("Setting notify key %s = '%s'", name, cvalue);
+            setenv(name, cvalue, 1);
+            break;
+        }
+    }
+
+    CRM_ASSERT(found != 0);
+    free(value);
+}
+
+/* Export the common keys, fork + exec notify_script, then clear every notify key in the parent */
+static void
+send_notification(const char *kind)
+{
+    int lpc;
+    pid_t pid;
+
+    crm_debug("Sending '%s' notification to '%s' via '%s'", kind, notify_target, notify_script);
+
+    set_notify_key("CRM_notify_recipient", notify_target, NULL);
+    set_notify_key("CRM_notify_kind", kind, NULL);
+    set_notify_key("CRM_notify_version", VERSION, NULL);
+
+    pid = fork();
+    if (pid == -1) {
+        crm_perror(LOG_ERR, "notification failed");
+    }
+
+    if (pid == 0) {
+        /* Child: the script inherits the CRM_notify_* environment; if exec fails, leave without running exit handlers */
+        execl(notify_script, notify_script, (char *) NULL);
+        _exit(EXIT_FAILURE);
+
+    } else {
+        for(lpc = 0; lpc < DIMOF(notify_keys); lpc++) {
+            unsetenv(notify_keys[lpc]);
+        }
+    }
+}
+
+void crmd_notify_node_event(crm_node_t *node)
+{
+    if(notify_script == NULL) {
+        return;
+    }
+
+    set_notify_key("CRM_notify_node", node->uname, NULL);
+    set_notify_key("CRM_notify_desc", node->state, NULL);
+
+    send_notification("node");
+}
+
+void
+crmd_notify_fencing_op(stonith_event_t * e)
+{
+    char *desc = NULL;
+
+    if(notify_script == NULL) {
+        return;
+    }
+
+    desc = crm_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)",
+                             e->operation, e->origin, e->target, pcmk_strerror(e->result),
+                             e->id);
+    /* Values passed as the third argument are heap-allocated and freed by set_notify_key() */
+    set_notify_key("CRM_notify_node", e->target, NULL);
+    set_notify_key("CRM_notify_task", e->operation, NULL);
+    set_notify_key("CRM_notify_desc", NULL, desc);
+    set_notify_key("CRM_notify_rc", NULL, crm_itoa(e->result));
+
+    send_notification("fencing");
+}
+
+void
+crmd_notify_resource_op(const char *node, lrmd_event_data_t * op)
+{
+    int target_rc = 0;
+
+    if(notify_script == NULL) {
+        return;
+    }
+
+    target_rc = rsc_op_expected_rc(op);
+    if(op->interval == 0 && target_rc == op->rc && safe_str_eq(op->op_type, RSC_STATUS)) {
+        /* Swallow interval-0 RSC_STATUS operations (probes) that returned
+         * the expected result; any other probe result is passed on so the
+         * script can decide whether to report a 'failed' probe.
+         *
+         * Even if we find a resource running, it was probably because
+         * someone erased the status section.
+         */
+        return;
+    }
+
+    set_notify_key("CRM_notify_node", node, NULL);
+
+    set_notify_key("CRM_notify_rsc", op->rsc_id, NULL);
+    set_notify_key("CRM_notify_task", op->op_type, NULL);
+    set_notify_key("CRM_notify_interval", NULL, crm_itoa(op->interval));
+
+    set_notify_key("CRM_notify_target_rc", NULL, crm_itoa(target_rc));
+    set_notify_key("CRM_notify_status", NULL, crm_itoa(op->op_status));
+    set_notify_key("CRM_notify_rc", NULL, crm_itoa(op->rc));
+
+    if(op->op_status == PCMK_LRM_OP_DONE) {
+        set_notify_key("CRM_notify_desc", services_ocf_exitcode_str(op->rc), NULL);
+    } else {
+        set_notify_key("CRM_notify_desc", services_lrm_status_str(op->op_status), NULL);
+    }
+
+    send_notification("resource");
+}
+
diff --git a/crmd/notify.h b/crmd/notify.h
new file mode 100644
index 0000000..4b138ea
--- /dev/null
+++ b/crmd/notify.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2015 Andrew Beekhof
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef CRMD_NOTIFY__H
+#  define CRMD_NOTIFY__H
+
+#  include <crm_internal.h>
+#  include <crm/cluster.h>
+#  include <crm/stonith-ng.h>
+
+void crmd_enable_notifications(const char *script, const char *target);
+void crmd_notify_node_event(crm_node_t *node);
+void crmd_notify_fencing_op(stonith_event_t * e);
+void crmd_notify_resource_op(const char *node, lrmd_event_data_t * op);
+
+#endif
diff --git a/crmd/te_utils.c b/crmd/te_utils.c
index a1d29f6..22551ba 100644
--- a/crmd/te_utils.c
+++ b/crmd/te_utils.c
@@ -124,6 +124,8 @@ tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
         return;
     }
 
+    crmd_notify_fencing_op(st_event);
+
     if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) {
         crm_notice("%s was successfully unfenced by %s (at the request of %s)",
                    st_event->target, st_event->executioner ? st_event->executioner : "", st_event->origin);
diff --git a/cts/CIB.py b/cts/CIB.py
index 8fbba6c..cd3a6a1 100644
--- a/cts/CIB.py
+++ b/cts/CIB.py
@@ -219,6 +219,8 @@ class CIB11(ConfigBase):
         o["dc-deadtime"] = "5s"
         o["no-quorum-policy"] = no_quorum
         o["expected-quorum-votes"] = self.num_nodes
+        o["notification-script"] = "/var/lib/pacemaker/notify.sh"
+        o["notification-target"] = "/var/lib/pacemaker/notify.log"
 
         if self.CM.Env["DoBSC"] == 1:
             o["ident-string"] = "Linux-HA TEST configuration file - REMOVEME!!"
diff --git a/extra/pcmk_notify_sample.sh b/extra/pcmk_notify_sample.sh
new file mode 100755
index 0000000..83cf8e9
--- /dev/null
+++ b/extra/pcmk_notify_sample.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#
+# Copyright (C) 2015 Andrew Beekhof
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This software is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+if [ -z "$CRM_notify_version" ]; then
+    echo "Pacemaker version 1.1.14 is required" >> "${CRM_notify_recipient}"
+    exit 0
+fi
+
+case "$CRM_notify_kind" in
+    node)
+        echo "Node '${CRM_notify_node}' is now '${CRM_notify_desc}'" >> "${CRM_notify_recipient}"
+        ;;
+    fencing)
+        # Other keys:
+        #
+        # CRM_notify_node
+        # CRM_notify_task
+        # CRM_notify_rc
+        #
+        echo "Fencing ${CRM_notify_desc}" >> "${CRM_notify_recipient}"
+        ;;
+    resource)
+        # Other keys:
+        #
+        # CRM_notify_target_rc
+        # CRM_notify_status
+        # CRM_notify_rc
+        #
+        if [ "${CRM_notify_interval}" = "0" ]; then
+            CRM_notify_interval=""
+        else
+            CRM_notify_interval=" (${CRM_notify_interval})"
+        fi
+
+        if [ "${CRM_notify_target_rc}" = "0" ]; then
+            CRM_notify_target_rc=""
+        else
+            CRM_notify_target_rc=" (target: ${CRM_notify_target_rc})"
+        fi
+
+        case "${CRM_notify_desc}" in
+            Cancelled) ;;
+            *)
+                echo "Resource operation '${CRM_notify_task}${CRM_notify_interval}' for '${CRM_notify_rsc}' on '${CRM_notify_node}': ${CRM_notify_desc}${CRM_notify_target_rc}" >> "${CRM_notify_recipient}"
+                ;;
+        esac
+        ;;
+    *)
+        echo "Unhandled $CRM_notify_kind notification" >> "${CRM_notify_recipient}"
+        env | grep CRM_notify >> "${CRM_notify_recipient}"
+        ;;
+
+esac
diff --git a/include/crm_internal.h b/include/crm_internal.h
index c13bc7b..fb03537 100644
--- a/include/crm_internal.h
+++ b/include/crm_internal.h
@@ -127,6 +127,7 @@ gboolean check_timer(const char *value);
 gboolean check_boolean(const char *value);
 gboolean check_number(const char *value);
 gboolean check_quorum(const char *value);
+gboolean check_script(const char *value);
 gboolean check_utilization(const char *value);
 
 /* Shared PE/crmd functionality */
diff --git a/lib/common/utils.c b/lib/common/utils.c
index 6a234dc..628cf2f 100644
--- a/lib/common/utils.c
+++ b/lib/common/utils.c
@@ -180,6 +180,33 @@ check_quorum(const char *value)
 }
 
 gboolean
+check_script(const char *value)
+{
+    struct stat st;
+    /* "/dev/null" is the option's default and means notifications are disabled: nothing to check */
+    if(safe_str_eq(value, "/dev/null")) {
+        return TRUE;
+    }
+
+    if(stat(value, &st) != 0) {
+        crm_err("Script %s does not exist", value);
+        return FALSE;
+    }
+
+    if(S_ISREG(st.st_mode) == 0) {
+        crm_err("Script %s is not a regular file", value);
+        return FALSE;
+    }
+    /* Any execute bit is accepted; whether the calling user may actually run it is not checked */
+    if( (st.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
+        crm_err("Script %s is not executable", value);
+        return FALSE;
+    }
+
+    return TRUE;
+}
+
+gboolean
 check_utilization(const char *value)
 {
     char *end = NULL;