From b49f49576ef9d801a48ce7a01a78c72e65be7880 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Fri, 30 Jul 2021 18:07:25 +0200 Subject: [PATCH 1/3] Fix, Refactor: fenced: add return value to get_agent_metadata Used to distinguish between metadata that is empty by design, a failure to get metadata that might succeed on a retry, and a fatal failure. Also fixes a regression that leads to endless retries getting metadata for #watchdog - not super serious, as it happens with delays in between, but still undesirable. --- daemons/fenced/fenced_commands.c | 92 +++++++++++++++++++------------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c index a778801b1..cd9968f1a 100644 --- a/daemons/fenced/fenced_commands.c +++ b/daemons/fenced/fenced_commands.c @@ -69,7 +69,7 @@ static void stonith_send_reply(xmlNode * reply, int call_options, const char *re static void search_devices_record_result(struct device_search_s *search, const char *device, gboolean can_fence); -static xmlNode * get_agent_metadata(const char *agent); +static int get_agent_metadata(const char *agent, xmlNode **metadata); static void read_action_metadata(stonith_device_t *device); typedef struct async_command_s { @@ -323,19 +323,26 @@ fork_cb(GPid pid, gpointer user_data) static int get_agent_metadata_cb(gpointer data) { stonith_device_t *device = data; + guint period_ms; - device->agent_metadata = get_agent_metadata(device->agent); - if (device->agent_metadata) { - read_action_metadata(device); - stonith__device_parameter_flags(&(device->flags), device->id, + switch (get_agent_metadata(device->agent, &device->agent_metadata)) { + case pcmk_rc_ok: + if (device->agent_metadata) { + read_action_metadata(device); + stonith__device_parameter_flags(&(device->flags), device->id, device->agent_metadata); - return G_SOURCE_REMOVE; - } else { - guint period_ms = pcmk__mainloop_timer_get_period(device->timer); - if (period_ms < 160 * 1000) { 
- mainloop_timer_set_period(device->timer, 2 * period_ms); - } - return G_SOURCE_CONTINUE; + } + return G_SOURCE_REMOVE; + + case EAGAIN: + period_ms = pcmk__mainloop_timer_get_period(device->timer); + if (period_ms < 160 * 1000) { + mainloop_timer_set_period(device->timer, 2 * period_ms); + } + return G_SOURCE_CONTINUE; + + default: + return G_SOURCE_REMOVE; } } @@ -700,38 +707,41 @@ init_metadata_cache(void) { } } -static xmlNode * -get_agent_metadata(const char *agent) +int +get_agent_metadata(const char *agent, xmlNode ** metadata) { - xmlNode *xml = NULL; char *buffer = NULL; + if (metadata == NULL) { + return EINVAL; + } + *metadata = NULL; + if (pcmk__str_eq(agent, STONITH_WATCHDOG_AGENT, pcmk__str_none)) { + return pcmk_rc_ok; + } init_metadata_cache(); buffer = g_hash_table_lookup(metadata_cache, agent); - if(pcmk__str_eq(agent, STONITH_WATCHDOG_AGENT, pcmk__str_casei)) { - return NULL; - - } else if(buffer == NULL) { + if (buffer == NULL) { stonith_t *st = stonith_api_new(); int rc; if (st == NULL) { crm_warn("Could not get agent meta-data: " "API memory allocation failed"); - return NULL; + return EAGAIN; } - rc = st->cmds->metadata(st, st_opt_sync_call, agent, NULL, &buffer, 10); + rc = st->cmds->metadata(st, st_opt_sync_call, agent, + NULL, &buffer, 10); stonith_api_delete(st); if (rc || !buffer) { crm_err("Could not retrieve metadata for fencing agent %s", agent); - return NULL; + return EAGAIN; } g_hash_table_replace(metadata_cache, strdup(agent), buffer); } - xml = string2xml(buffer); - - return xml; + *metadata = string2xml(buffer); + return pcmk_rc_ok; } static gboolean @@ -962,19 +972,27 @@ build_device_from_xml(xmlNode * msg) g_list_free_full(device->targets, free); device->targets = NULL; } - device->agent_metadata = get_agent_metadata(device->agent); - if (device->agent_metadata) { - read_action_metadata(device); - stonith__device_parameter_flags(&(device->flags), device->id, - device->agent_metadata); - } else { - if (device->timer == NULL) { 
- device->timer = mainloop_timer_add("get_agent_metadata", 10 * 1000, + switch (get_agent_metadata(device->agent, &device->agent_metadata)) { + case pcmk_rc_ok: + if (device->agent_metadata) { + read_action_metadata(device); + stonith__device_parameter_flags(&(device->flags), device->id, + device->agent_metadata); + } + break; + + case EAGAIN: + if (device->timer == NULL) { + device->timer = mainloop_timer_add("get_agent_metadata", 10 * 1000, TRUE, get_agent_metadata_cb, device); - } - if (!mainloop_timer_running(device->timer)) { - mainloop_timer_start(device->timer); - } + } + if (!mainloop_timer_running(device->timer)) { + mainloop_timer_start(device->timer); + } + break; + + default: + break; } value = g_hash_table_lookup(device->params, "nodeid"); -- 2.27.0 From 5dd1e4459335764e0adf5fa78d81c875ae2332e9 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Fri, 30 Jul 2021 18:15:10 +0200 Subject: [PATCH 2/3] feature: watchdog-fencing: allow restriction to certain nodes Bump CRM_FEATURE_SET to 3.11.0 to encourage fully upgrading the cluster to a version that supports the feature before explicitly adding a watchdog-fence-device. 
--- configure.ac | 1 + daemons/controld/controld_control.c | 2 +- daemons/controld/controld_fencing.c | 14 ++ daemons/controld/controld_fencing.h | 1 + daemons/fenced/Makefile.am | 2 +- daemons/fenced/fence_watchdog.in | 283 ++++++++++++++++++++++++++++ daemons/fenced/fenced_commands.c | 141 +++++++++++--- daemons/fenced/fenced_remote.c | 71 ++++--- daemons/fenced/pacemaker-fenced.c | 131 +++++++++---- daemons/fenced/pacemaker-fenced.h | 5 +- include/crm/crm.h | 2 +- include/crm/fencing/internal.h | 8 +- lib/fencing/st_client.c | 61 ++++++ lib/lrmd/lrmd_client.c | 6 +- rpm/pacemaker.spec.in | 3 + 16 files changed, 635 insertions(+), 97 deletions(-) create mode 100755 daemons/fenced/fence_watchdog.in diff --git a/configure.ac b/configure.ac index 436100c81..013562e46 100644 --- a/configure.ac +++ b/configure.ac @@ -1972,6 +1972,7 @@ CONFIG_FILES_EXEC([cts/cts-cli], [cts/support/fence_dummy], [cts/support/pacemaker-cts-dummyd], [daemons/fenced/fence_legacy], + [daemons/fenced/fence_watchdog], [doc/abi-check], [extra/resources/ClusterMon], [extra/resources/HealthSMART], diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index 45a70bb92..b5da6a46c 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -615,7 +615,7 @@ static pcmk__cluster_option_t crmd_opts[] = { }, { "stonith-watchdog-timeout", NULL, "time", NULL, - "0", pcmk__valid_sbd_timeout, + "0", controld_verify_stonith_watchdog_timeout, "How long to wait before we can assume nodes are safely down " "when watchdog-based self-fencing via SBD is in use", "If nonzero, along with `have-watchdog=true` automatically set by the " diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c index 0fba6613b..6c2a6c550 100644 --- a/daemons/controld/controld_fencing.c +++ b/daemons/controld/controld_fencing.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -886,6 +887,19 @@ 
te_fence_node(crm_graph_t *graph, crm_action_t *action) return TRUE; } +bool +controld_verify_stonith_watchdog_timeout(const char *value) +{ + gboolean rv = TRUE; + + if (stonith_api && (stonith_api->state != stonith_disconnected) && + stonith__watchdog_fencing_enabled_for_node_api(stonith_api, + fsa_our_uname)) { + rv = pcmk__valid_sbd_timeout(value); + } + return rv; +} + /* end stonith API client functions */ diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h index d0ecc8234..ef68a0c83 100644 --- a/daemons/controld/controld_fencing.h +++ b/daemons/controld/controld_fencing.h @@ -24,6 +24,7 @@ void update_stonith_max_attempts(const char* value); void controld_trigger_fencer_connect(void); void controld_disconnect_fencer(bool destroy); gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action); +bool controld_verify_stonith_watchdog_timeout(const char *value); // stonith cleanup list void add_stonith_cleanup(const char *target); diff --git a/daemons/fenced/Makefile.am b/daemons/fenced/Makefile.am index 43413e11d..2923d7c9b 100644 --- a/daemons/fenced/Makefile.am +++ b/daemons/fenced/Makefile.am @@ -15,7 +15,7 @@ halibdir = $(CRM_DAEMON_DIR) halib_PROGRAMS = pacemaker-fenced cts-fence-helper -sbin_SCRIPTS = fence_legacy +sbin_SCRIPTS = fence_legacy fence_watchdog noinst_HEADERS = pacemaker-fenced.h diff --git a/daemons/fenced/fence_watchdog.in b/daemons/fenced/fence_watchdog.in new file mode 100755 index 000000000..c83304f1d --- /dev/null +++ b/daemons/fenced/fence_watchdog.in @@ -0,0 +1,283 @@ +#!@PYTHON@ +"""Dummy watchdog fence agent for providing meta-data for the pacemaker internal agent +""" + +__copyright__ = "Copyright 2012-2021 the Pacemaker project contributors" +__license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" + +import io +import os +import re +import sys +import atexit +import getopt + +SHORT_DESC = "Dummy watchdog fence agent" +LONG_DESC = """fence_watchdog just 
provides +meta-data - actual fencing is done by the pacemaker internal watchdog agent.""" + +ALL_OPT = { + "version" : { + "getopt" : "V", + "longopt" : "version", + "help" : "-V, --version Display version information and exit", + "required" : "0", + "shortdesc" : "Display version information and exit", + "order" : 53 + }, + "help" : { + "getopt" : "h", + "longopt" : "help", + "help" : "-h, --help Display this help and exit", + "required" : "0", + "shortdesc" : "Display help and exit", + "order" : 54 + }, + "action" : { + "getopt" : "o:", + "longopt" : "action", + "help" : "-o, --action=[action] Action: metadata", + "required" : "1", + "shortdesc" : "Fencing Action", + "default" : "metadata", + "order" : 1 + }, + "nodename" : { + "getopt" : "N:", + "longopt" : "nodename", + "help" : "-N, --nodename Node name of fence victim (ignored)", + "required" : "0", + "shortdesc" : "Ignored", + "order" : 2 + }, + "plug" : { + "getopt" : "n:", + "longopt" : "plug", + "help" : "-n, --plug=[id] Physical plug number on device (ignored)", + "required" : "1", + "shortdesc" : "Ignored", + "order" : 4 + } +} + + +def agent(): + """ Return name this file was run as. """ + + return os.path.basename(sys.argv[0]) + + +def fail_usage(message): + """ Print a usage message and exit. """ + + sys.exit("%s\nPlease use '-h' for usage" % message) + + +def show_docs(options): + """ Handle informational options (display info and exit). """ + + device_opt = options["device_opt"] + + if "-h" in options: + usage(device_opt) + sys.exit(0) + + if "-o" in options and options["-o"].lower() == "metadata": + metadata(device_opt, options) + sys.exit(0) + + if "-V" in options: + print(AGENT_VERSION) + sys.exit(0) + + +def sorted_options(avail_opt): + """ Return a list of all options, in their internally specified order. """ + + sorted_list = [(key, ALL_OPT[key]) for key in avail_opt] + sorted_list.sort(key=lambda x: x[1]["order"]) + return sorted_list + + +def usage(avail_opt): + """ Print a usage message. 
""" + print(LONG_DESC) + print() + print("Usage:") + print("\t" + agent() + " [options]") + print("Options:") + + for dummy, value in sorted_options(avail_opt): + if len(value["help"]) != 0: + print(" " + value["help"]) + + +def metadata(avail_opt, options): + """ Print agent metadata. """ + + print(""" + +%s +""" % (agent(), SHORT_DESC, LONG_DESC)) + + for option, dummy in sorted_options(avail_opt): + if "shortdesc" in ALL_OPT[option]: + print(' ') + + default = "" + default_name_arg = "-" + ALL_OPT[option]["getopt"][:-1] + default_name_no_arg = "-" + ALL_OPT[option]["getopt"] + + if "default" in ALL_OPT[option]: + default = 'default="%s"' % str(ALL_OPT[option]["default"]) + elif default_name_arg in options: + if options[default_name_arg]: + try: + default = 'default="%s"' % options[default_name_arg] + except TypeError: + ## @todo/@note: Currently there is no clean way how to handle lists + ## we can create a string from it but we can't set it on command line + default = 'default="%s"' % str(options[default_name_arg]) + elif default_name_no_arg in options: + default = 'default="true"' + + mixed = ALL_OPT[option]["help"] + ## split it between option and help text + res = re.compile(r"^(.*--\S+)\s+", re.IGNORECASE | re.S).search(mixed) + if None != res: + mixed = res.group(1) + mixed = mixed.replace("<", "<").replace(">", ">") + print(' ') + + if ALL_OPT[option]["getopt"].count(":") > 0: + print(' ') + else: + print(' ') + + print(' ' + ALL_OPT[option]["shortdesc"] + '') + print(' ') + + print(' \n ') + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + print('') + + +def option_longopt(option): + """ Return the getopt-compatible long-option name of the given option. """ + + if ALL_OPT[option]["getopt"].endswith(":"): + return ALL_OPT[option]["longopt"] + "=" + else: + return ALL_OPT[option]["longopt"] + + +def opts_from_command_line(argv, avail_opt): + """ Read options from command-line arguments. 
""" + + # Prepare list of options for getopt + getopt_string = "" + longopt_list = [] + for k in avail_opt: + if k in ALL_OPT: + getopt_string += ALL_OPT[k]["getopt"] + else: + fail_usage("Parse error: unknown option '" + k + "'") + + if k in ALL_OPT and "longopt" in ALL_OPT[k]: + longopt_list.append(option_longopt(k)) + + try: + opt, dummy = getopt.gnu_getopt(argv, getopt_string, longopt_list) + except getopt.GetoptError as error: + fail_usage("Parse error: " + error.msg) + + # Transform longopt to short one which are used in fencing agents + old_opt = opt + opt = {} + for old_option in dict(old_opt).keys(): + if old_option.startswith("--"): + for option in ALL_OPT.keys(): + if "longopt" in ALL_OPT[option] and "--" + ALL_OPT[option]["longopt"] == old_option: + opt["-" + ALL_OPT[option]["getopt"].rstrip(":")] = dict(old_opt)[old_option] + else: + opt[old_option] = dict(old_opt)[old_option] + + return opt + + +def opts_from_stdin(avail_opt): + """ Read options from standard input. """ + + opt = {} + name = "" + for line in sys.stdin.readlines(): + line = line.strip() + if line.startswith("#") or (len(line) == 0): + continue + + (name, value) = (line + "=").split("=", 1) + value = value[:-1] + + if name not in avail_opt: + print("Parse error: Ignoring unknown option '%s'" % line, + file=sys.stderr) + continue + + if ALL_OPT[name]["getopt"].endswith(":"): + opt["-"+ALL_OPT[name]["getopt"].rstrip(":")] = value + elif value.lower() in ["1", "yes", "on", "true"]: + opt["-"+ALL_OPT[name]["getopt"]] = "1" + + return opt + + +def process_input(avail_opt): + """ Set standard environment variables, and parse all options. """ + + # Set standard environment + os.putenv("LANG", "C") + os.putenv("LC_ALL", "C") + + # Read options from command line or standard input + if len(sys.argv) > 1: + return opts_from_command_line(sys.argv[1:], avail_opt) + else: + return opts_from_stdin(avail_opt) + + +def atexit_handler(): + """ Close stdout on exit. 
""" + + try: + sys.stdout.close() + os.close(1) + except IOError: + sys.exit("%s failed to close standard output" % agent()) + + +def main(): + """ Make it so! """ + + device_opt = ALL_OPT.keys() + + ## Defaults for fence agent + atexit.register(atexit_handler) + options = process_input(device_opt) + options["device_opt"] = device_opt + show_docs(options) + + print("Watchdog fencing may be initiated only by the cluster, not this agent.", + file=sys.stderr) + + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/daemons/fenced/fenced_commands.c b/daemons/fenced/fenced_commands.c index cd9968f1a..9470ea2c1 100644 --- a/daemons/fenced/fenced_commands.c +++ b/daemons/fenced/fenced_commands.c @@ -397,15 +397,13 @@ stonith_device_execute(stonith_device_t * device) return TRUE; } - if(pcmk__str_eq(device->agent, STONITH_WATCHDOG_AGENT, pcmk__str_casei)) { - if(pcmk__str_eq(cmd->action, "reboot", pcmk__str_casei)) { - pcmk__panic(__func__); - goto done; - - } else if(pcmk__str_eq(cmd->action, "off", pcmk__str_casei)) { - pcmk__panic(__func__); - goto done; - + if (pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, + STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) { + if (pcmk__strcase_any_of(cmd->action, "reboot", "off", NULL)) { + if (node_does_watchdog_fencing(stonith_our_uname)) { + pcmk__panic(__func__); + goto done; + } } else { crm_info("Faking success for %s watchdog operation", cmd->action); cmd->done_cb(0, 0, NULL, cmd); @@ -716,7 +714,7 @@ get_agent_metadata(const char *agent, xmlNode ** metadata) return EINVAL; } *metadata = NULL; - if (pcmk__str_eq(agent, STONITH_WATCHDOG_AGENT, pcmk__str_none)) { + if (pcmk__str_eq(agent, STONITH_WATCHDOG_AGENT_INTERNAL, pcmk__str_none)) { return pcmk_rc_ok; } init_metadata_cache(); @@ -1050,24 +1048,6 @@ schedule_internal_command(const char *origin, schedule_stonith_command(cmd, device); } -gboolean -string_in_list(GList *list, const char *item) -{ - int lpc = 0; - int max = g_list_length(list); - - for (lpc = 0; 
lpc < max; lpc++) { - const char *value = g_list_nth_data(list, lpc); - - if (pcmk__str_eq(item, value, pcmk__str_casei)) { - return TRUE; - } else { - crm_trace("%d: '%s' != '%s'", lpc, item, value); - } - } - return FALSE; -} - static void status_search_cb(GPid pid, int rc, const char *output, gpointer user_data) { @@ -1144,7 +1124,7 @@ dynamic_list_search_cb(GPid pid, int rc, const char *output, gpointer user_data) if (!alias) { alias = search->host; } - if (string_in_list(dev->targets, alias)) { + if (pcmk__str_in_list(dev->targets, alias, pcmk__str_casei)) { can_fence = TRUE; } } @@ -1215,9 +1195,62 @@ stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib) stonith_device_t *dup = NULL; stonith_device_t *device = build_device_from_xml(msg); guint ndevices = 0; + int rv = pcmk_ok; CRM_CHECK(device != NULL, return -ENOMEM); + /* do we have a watchdog-device? */ + if (pcmk__str_eq(device->id, STONITH_WATCHDOG_ID, pcmk__str_none) || + pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, + STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) do { + if (stonith_watchdog_timeout_ms <= 0) { + crm_err("Ignoring watchdog fence device without " + "stonith-watchdog-timeout set."); + rv = -ENODEV; + /* fall through to cleanup & return */ + } else if (!pcmk__str_any_of(device->agent, STONITH_WATCHDOG_AGENT, + STONITH_WATCHDOG_AGENT_INTERNAL, NULL)) { + crm_err("Ignoring watchdog fence device with unknown " + "agent '%s' unequal '" STONITH_WATCHDOG_AGENT "'.", + device->agent?device->agent:""); + rv = -ENODEV; + /* fall through to cleanup & return */ + } else if (!pcmk__str_eq(device->id, STONITH_WATCHDOG_ID, + pcmk__str_none)) { + crm_err("Ignoring watchdog fence device " + "named %s !='"STONITH_WATCHDOG_ID"'.", + device->id?device->id:""); + rv = -ENODEV; + /* fall through to cleanup & return */ + } else { + if (pcmk__str_eq(device->agent, STONITH_WATCHDOG_AGENT, + pcmk__str_none)) { + /* this either has an empty list or the targets + configured for 
watchdog-fencing + */ + g_list_free_full(stonith_watchdog_targets, free); + stonith_watchdog_targets = device->targets; + device->targets = NULL; + } + if (node_does_watchdog_fencing(stonith_our_uname)) { + g_list_free_full(device->targets, free); + device->targets = stonith__parse_targets(stonith_our_uname); + g_hash_table_replace(device->params, + strdup(PCMK_STONITH_HOST_LIST), + strdup(stonith_our_uname)); + /* proceed as with any other stonith-device */ + break; + } + + crm_debug("Skip registration of watchdog fence device on node not in host-list."); + /* cleanup and fall through to more cleanup and return */ + device->targets = NULL; + stonith_device_remove(device->id, from_cib); + } + free_device(device); + return rv; + } while (0); + dup = device_has_duplicate(device); if (dup) { ndevices = g_hash_table_size(device_list); @@ -1598,6 +1631,39 @@ stonith_level_remove(xmlNode *msg, char **desc) * (CIB registration is not sufficient), because monitor should not be * possible unless the device is "started" (API registered). 
*/ + +static char * +list_to_string(GList *list, const char *delim, gboolean terminate_with_delim) +{ + int max = g_list_length(list); + size_t delim_len = delim?strlen(delim):0; + size_t alloc_size = 1 + (max?((max-1+(terminate_with_delim?1:0))*delim_len):0); + char *rv; + GList *gIter; + + for (gIter = list; gIter != NULL; gIter = gIter->next) { + const char *value = (const char *) gIter->data; + + alloc_size += strlen(value); + } + rv = calloc(alloc_size, sizeof(char)); + if (rv) { + char *pos = rv; + const char *lead_delim = ""; + + for (gIter = list; gIter != NULL; gIter = gIter->next) { + const char *value = (const char *) gIter->data; + + pos = &pos[sprintf(pos, "%s%s", lead_delim, value)]; + lead_delim = delim; + } + if (max && terminate_with_delim) { + sprintf(pos, "%s", delim); + } + } + return rv; +} + static int stonith_device_action(xmlNode * msg, char **output) { @@ -1615,6 +1681,19 @@ stonith_device_action(xmlNode * msg, char **output) return -EPROTO; } + if (pcmk__str_eq(id, STONITH_WATCHDOG_ID, pcmk__str_none)) { + if (stonith_watchdog_timeout_ms <= 0) { + return -ENODEV; + } else { + if (pcmk__str_eq(action, "list", pcmk__str_casei)) { + *output = list_to_string(stonith_watchdog_targets, "\n", TRUE); + return pcmk_ok; + } else if (pcmk__str_eq(action, "monitor", pcmk__str_casei)) { + return pcmk_ok; + } + } + } + device = g_hash_table_lookup(device_list, id); if ((device == NULL) || (!device->api_registered && !strcmp(action, "monitor"))) { @@ -1742,7 +1821,7 @@ can_fence_host_with_device(stonith_device_t * dev, struct device_search_s *searc * Only use if all hosts on which the device can be active can always fence all listed hosts */ - if (string_in_list(dev->targets, host)) { + if (pcmk__str_in_list(dev->targets, host, pcmk__str_casei)) { can = TRUE; } else if (g_hash_table_lookup(dev->params, PCMK_STONITH_HOST_MAP) && g_hash_table_lookup(dev->aliases, host)) { @@ -1763,7 +1842,7 @@ can_fence_host_with_device(stonith_device_t * dev, struct 
device_search_s *searc return; } - if (string_in_list(dev->targets, alias)) { + if (pcmk__str_in_list(dev->targets, alias, pcmk__str_casei)) { can = TRUE; } diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c index cf91acaed..224f2baba 100644 --- a/daemons/fenced/fenced_remote.c +++ b/daemons/fenced/fenced_remote.c @@ -1522,6 +1522,25 @@ advance_topology_device_in_level(remote_fencing_op_t *op, const char *device, } } +static gboolean +check_watchdog_fencing_and_wait(remote_fencing_op_t * op) +{ + if (node_does_watchdog_fencing(op->target)) { + + crm_notice("Waiting %lds for %s to self-fence (%s) for " + "client %s " CRM_XS " id=%.8s", + (stonith_watchdog_timeout_ms / 1000), + op->target, op->action, op->client_name, op->id); + op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, + remote_op_watchdog_done, op); + return TRUE; + } else { + crm_debug("Skipping fallback to watchdog-fencing as %s is " + "not in host-list", op->target); + } + return FALSE; +} + void call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) { @@ -1592,26 +1611,33 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) g_source_remove(op->op_timer_one); } - if(stonith_watchdog_timeout_ms > 0 && device && pcmk__str_eq(device, "watchdog", pcmk__str_casei)) { - crm_notice("Waiting %lds for %s to self-fence (%s) for client %s " - CRM_XS " id=%.8s", (stonith_watchdog_timeout_ms / 1000), - op->target, op->action, op->client_name, op->id); - op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); - - /* TODO check devices to verify watchdog will be in use */ - } else if(stonith_watchdog_timeout_ms > 0 - && pcmk__str_eq(peer->host, op->target, pcmk__str_casei) - && !pcmk__str_eq(op->action, "on", pcmk__str_casei)) { - crm_notice("Waiting %lds for %s to self-fence (%s) for client %s " - CRM_XS " id=%.8s", (stonith_watchdog_timeout_ms / 1000), - op->target, op->action, 
op->client_name, op->id); - op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); - - } else { + if (!(stonith_watchdog_timeout_ms > 0 && ( + (pcmk__str_eq(device, STONITH_WATCHDOG_ID, + pcmk__str_none)) || + (pcmk__str_eq(peer->host, op->target, pcmk__str_casei) + && !pcmk__str_eq(op->action, "on", pcmk__str_casei))) && + check_watchdog_fencing_and_wait(op))) { + + /* Some thoughts about self-fencing cases reaching this point: + - Actually check in check_watchdog_fencing_and_wait + shouldn't fail if STONITH_WATCHDOG_ID is + chosen as fencing-device and it being present implies + watchdog-fencing is enabled anyway + - If watchdog-fencing is disabled either in general or for + a specific target - detected in check_watchdog_fencing_and_wait - + for some other kind of self-fencing we can't expect + a success answer but timeout is fine if the node doesn't + come back in between + - Delicate might be the case where we have watchdog-fencing + enabled for a node but the watchdog-fencing-device isn't + explicitly chosen for suicide. Local pe-execution in sbd + may detect the node as unclean and lead to timely suicide. + Otherwise the selection of stonith-watchdog-timeout at + least is questionable. + */ op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op); } - send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE); peer->tried = TRUE; free_xml(remote_op); @@ -1645,12 +1671,11 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc) * but we have all the expected replies, then no devices * are available to execute the fencing operation. 
*/ - if(stonith_watchdog_timeout_ms && pcmk__str_eq(device, "watchdog", pcmk__str_null_matches | pcmk__str_casei)) { - crm_notice("Waiting %lds for %s to self-fence (%s) for client %s " - CRM_XS " id=%.8s", (stonith_watchdog_timeout_ms / 1000), - op->target, op->action, op->client_name, op->id); - op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op); - return; + if(stonith_watchdog_timeout_ms > 0 && pcmk__str_eq(device, + STONITH_WATCHDOG_ID, pcmk__str_null_matches)) { + if (check_watchdog_fencing_and_wait(op)) { + return; + } } if (op->state == st_query) { diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c index 39738d8be..7f8b427d9 100644 --- a/daemons/fenced/pacemaker-fenced.c +++ b/daemons/fenced/pacemaker-fenced.c @@ -42,6 +42,7 @@ char *stonith_our_uname = NULL; long stonith_watchdog_timeout_ms = 0; +GList *stonith_watchdog_targets = NULL; static GMainLoop *mainloop = NULL; @@ -578,7 +579,44 @@ our_node_allowed_for(pe_resource_t *rsc) } static void -watchdog_device_update(xmlNode *cib) +watchdog_device_update(void) +{ + if (stonith_watchdog_timeout_ms > 0) { + if (!g_hash_table_lookup(device_list, STONITH_WATCHDOG_ID) && + !stonith_watchdog_targets) { + /* getting here watchdog-fencing enabled, no device there yet + and reason isn't stonith_watchdog_targets preventing that + */ + int rc; + xmlNode *xml; + + xml = create_device_registration_xml( + STONITH_WATCHDOG_ID, + st_namespace_internal, + STONITH_WATCHDOG_AGENT, + NULL, /* stonith_device_register will add our + own name as PCMK_STONITH_HOST_LIST param + so we can skip that here + */ + NULL); + rc = stonith_device_register(xml, NULL, TRUE); + free_xml(xml); + if (rc != pcmk_ok) { + crm_crit("Cannot register watchdog pseudo fence agent"); + crm_exit(CRM_EX_FATAL); + } + } + + } else { + /* be silent if no device - todo parameter to stonith_device_remove */ + if (g_hash_table_lookup(device_list, STONITH_WATCHDOG_ID)) { + 
stonith_device_remove(STONITH_WATCHDOG_ID, TRUE); + } + } +} + +static void +update_stonith_watchdog_timeout_ms(xmlNode *cib) { xmlNode *stonith_enabled_xml = NULL; const char *stonith_enabled_s = NULL; @@ -608,33 +646,7 @@ watchdog_device_update(xmlNode *cib) } } - if (timeout_ms != stonith_watchdog_timeout_ms) { - crm_notice("New watchdog timeout %lds (was %lds)", timeout_ms/1000, stonith_watchdog_timeout_ms/1000); - stonith_watchdog_timeout_ms = timeout_ms; - - if (stonith_watchdog_timeout_ms > 0) { - int rc; - xmlNode *xml; - stonith_key_value_t *params = NULL; - - params = stonith_key_value_add(params, PCMK_STONITH_HOST_LIST, - stonith_our_uname); - - xml = create_device_registration_xml("watchdog", st_namespace_internal, - STONITH_WATCHDOG_AGENT, params, - NULL); - stonith_key_value_freeall(params, 1, 1); - rc = stonith_device_register(xml, NULL, FALSE); - free_xml(xml); - if (rc != pcmk_ok) { - crm_crit("Cannot register watchdog pseudo fence agent"); - crm_exit(CRM_EX_FATAL); - } - - } else { - stonith_device_remove("watchdog", FALSE); - } - } + stonith_watchdog_timeout_ms = timeout_ms; } /*! 
@@ -677,6 +689,16 @@ static void cib_device_update(pe_resource_t *rsc, pe_working_set_t *data_set) return; } + /* if watchdog-fencing is disabled handle any watchdog-fence + resource as if it was disabled + */ + if ((stonith_watchdog_timeout_ms <= 0) && + pcmk__str_eq(rsc->id, STONITH_WATCHDOG_ID, pcmk__str_none)) { + crm_info("Watchdog-fencing disabled thus handling " + "device %s as disabled", rsc->id); + return; + } + /* Check whether our node is allowed for this resource (and its parent if in a group) */ node = our_node_allowed_for(rsc); if (rsc->parent && (rsc->parent->variant == pe_group)) { @@ -772,6 +794,12 @@ cib_devices_update(void) } } + /* have list repopulated if cib has a watchdog-fencing-resource + TODO: keep a cached list for queries happening while we are refreshing + */ + g_list_free_full(stonith_watchdog_targets, free); + stonith_watchdog_targets = NULL; + for (gIter = fenced_data_set->resources; gIter != NULL; gIter = gIter->next) { cib_device_update(gIter->data, fenced_data_set); } @@ -825,6 +853,8 @@ update_cib_stonith_devices_v2(const char *event, xmlNode * msg) if (search != NULL) { *search = 0; stonith_device_remove(rsc_id, TRUE); + /* watchdog_device_update called afterwards + to fall back to implicit definition if needed */ } else { crm_warn("Ignoring malformed CIB update (resource deletion)"); } @@ -968,6 +998,24 @@ node_has_attr(const char *node, const char *name, const char *value) return (match != NULL); } +/*! 
+ * \internal + * \brief Check whether a node does watchdog-fencing + * + * \param[in] node Name of node to check + * + * \return TRUE if node found in stonith_watchdog_targets + * or stonith_watchdog_targets is empty indicating + * all nodes are doing watchdog-fencing + */ +gboolean +node_does_watchdog_fencing(const char *node) +{ + return ((stonith_watchdog_targets == NULL) || + pcmk__str_in_list(stonith_watchdog_targets, node, pcmk__str_casei)); +} + + static void update_fencing_topology(const char *event, xmlNode * msg) { @@ -1073,6 +1121,8 @@ update_cib_cache_cb(const char *event, xmlNode * msg) xmlNode *stonith_enabled_xml = NULL; const char *stonith_enabled_s = NULL; static gboolean stonith_enabled_saved = TRUE; + long timeout_ms_saved = stonith_watchdog_timeout_ms; + gboolean need_full_refresh = FALSE; if(!have_cib_devices) { crm_trace("Skipping updates until we get a full dump"); @@ -1127,6 +1177,7 @@ update_cib_cache_cb(const char *event, xmlNode * msg) } pcmk__refresh_node_caches_from_cib(local_cib); + update_stonith_watchdog_timeout_ms(local_cib); stonith_enabled_xml = get_xpath_object("//nvpair[@name='stonith-enabled']", local_cib, LOG_NEVER); @@ -1134,23 +1185,30 @@ update_cib_cache_cb(const char *event, xmlNode * msg) stonith_enabled_s = crm_element_value(stonith_enabled_xml, XML_NVPAIR_ATTR_VALUE); } - watchdog_device_update(local_cib); - if (stonith_enabled_s && crm_is_true(stonith_enabled_s) == FALSE) { crm_trace("Ignoring CIB updates while fencing is disabled"); stonith_enabled_saved = FALSE; - return; } else if (stonith_enabled_saved == FALSE) { crm_info("Updating fencing device and topology lists " "now that fencing is enabled"); stonith_enabled_saved = TRUE; - fencing_topology_init(); - cib_devices_update(); + need_full_refresh = TRUE; } else { - update_fencing_topology(event, msg); - update_cib_stonith_devices(event, msg); + if (timeout_ms_saved != stonith_watchdog_timeout_ms) { + need_full_refresh = TRUE; + } else { + 
update_fencing_topology(event, msg); + update_cib_stonith_devices(event, msg); + watchdog_device_update(); + } + } + + if (need_full_refresh) { + fencing_topology_init(); + cib_devices_update(); + watchdog_device_update(); } } @@ -1162,10 +1220,11 @@ init_cib_cache_cb(xmlNode * msg, int call_id, int rc, xmlNode * output, void *us local_cib = copy_xml(output); pcmk__refresh_node_caches_from_cib(local_cib); + update_stonith_watchdog_timeout_ms(local_cib); fencing_topology_init(); - watchdog_device_update(local_cib); cib_devices_update(); + watchdog_device_update(); } static void diff --git a/daemons/fenced/pacemaker-fenced.h b/daemons/fenced/pacemaker-fenced.h index d330fda4d..14e085e98 100644 --- a/daemons/fenced/pacemaker-fenced.h +++ b/daemons/fenced/pacemaker-fenced.h @@ -260,14 +260,15 @@ bool fencing_peer_active(crm_node_t *peer); int stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op); -gboolean string_in_list(GList *list, const char *item); - gboolean node_has_attr(const char *node, const char *name, const char *value); +gboolean node_does_watchdog_fencing(const char *node); + extern char *stonith_our_uname; extern gboolean stand_alone; extern GHashTable *device_list; extern GHashTable *topology; extern long stonith_watchdog_timeout_ms; +extern GList *stonith_watchdog_targets; extern GHashTable *stonith_remote_op_list; diff --git a/include/crm/crm.h b/include/crm/crm.h index ee52c3630..7861c160e 100644 --- a/include/crm/crm.h +++ b/include/crm/crm.h @@ -66,7 +66,7 @@ extern "C" { * >=3.0.13: Fail counts include operation name and interval * >=3.2.0: DC supports PCMK_LRM_OP_INVALID and PCMK_LRM_OP_NOT_CONNECTED */ -# define CRM_FEATURE_SET "3.10.2" +# define CRM_FEATURE_SET "3.11.0" /* Pacemaker's CPG protocols use fixed-width binary fields for the sender and * recipient of a CPG message. 
This imposes an arbitrary limit on cluster node diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h index 8bcb544d8..f222edba3 100644 --- a/include/crm/fencing/internal.h +++ b/include/crm/fencing/internal.h @@ -164,7 +164,10 @@ void stonith__device_parameter_flags(uint32_t *device_flags, # define STONITH_OP_LEVEL_ADD "st_level_add" # define STONITH_OP_LEVEL_DEL "st_level_remove" -# define STONITH_WATCHDOG_AGENT "#watchdog" +# define STONITH_WATCHDOG_AGENT "fence_watchdog" +/* Don't change 2 below as it would break rolling upgrade */ +# define STONITH_WATCHDOG_AGENT_INTERNAL "#watchdog" +# define STONITH_WATCHDOG_ID "watchdog" # ifdef HAVE_STONITH_STONITH_H // utilities from st_lha.c @@ -211,4 +214,7 @@ stonith__op_state_pending(enum op_state state) return state != st_failed && state != st_done; } +gboolean stonith__watchdog_fencing_enabled_for_node(const char *node); +gboolean stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node); + #endif diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c index e285f51e2..0ff98157b 100644 --- a/lib/fencing/st_client.c +++ b/lib/fencing/st_client.c @@ -195,6 +195,67 @@ stonith_get_namespace(const char *agent, const char *namespace_s) return st_namespace_invalid; } +gboolean +stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) +{ + gboolean rv = FALSE; + stonith_t *stonith_api = st?st:stonith_api_new(); + char *list = NULL; + + if(stonith_api) { + if (stonith_api->state == stonith_disconnected) { + int rc = stonith_api->cmds->connect(stonith_api, "stonith-api", NULL); + + if (rc != pcmk_ok) { + crm_err("Failed connecting to Stonith-API for watchdog-fencing-query."); + } + } + + if (stonith_api->state != stonith_disconnected) { + /* caveat!!! 
+ * this might fail when when stonithd is just updating the device-list + * probably something we should fix as well for other api-calls */ + int rc = stonith_api->cmds->list(stonith_api, st_opt_sync_call, STONITH_WATCHDOG_ID, &list, 0); + if ((rc != pcmk_ok) || (list == NULL)) { + /* due to the race described above it can happen that + * we drop in here - so as not to make remote nodes + * panic on that answer + */ + crm_warn("watchdog-fencing-query failed"); + } else if (list[0] == '\0') { + crm_warn("watchdog-fencing-query returned an empty list - any node"); + rv = TRUE; + } else { + GList *targets = stonith__parse_targets(list); + rv = pcmk__str_in_list(targets, node, pcmk__str_casei); + g_list_free_full(targets, free); + } + free(list); + if (!st) { + /* if we're provided the api we still might have done the + * connection - but let's assume the caller won't bother + */ + stonith_api->cmds->disconnect(stonith_api); + } + } + + if (!st) { + stonith_api_delete(stonith_api); + } + } else { + crm_err("Stonith-API for watchdog-fencing-query couldn't be created."); + } + crm_trace("Pacemaker assumes node %s %sto do watchdog-fencing.", + node, rv?"":"not "); + return rv; +} + +gboolean +stonith__watchdog_fencing_enabled_for_node(const char *node) +{ + return stonith__watchdog_fencing_enabled_for_node_api(NULL, node); +} + static void log_action(stonith_action_t *action, pid_t pid) { diff --git a/lib/lrmd/lrmd_client.c b/lib/lrmd/lrmd_client.c index 87d050ed1..bf4bceb42 100644 --- a/lib/lrmd/lrmd_client.c +++ b/lib/lrmd/lrmd_client.c @@ -34,6 +34,7 @@ #include #include +#include #ifdef HAVE_GNUTLS_GNUTLS_H # undef KEYFILE @@ -934,7 +935,10 @@ lrmd__validate_remote_settings(lrmd_t *lrmd, GHashTable *hash) crm_xml_add(data, F_LRMD_ORIGIN, __func__); value = g_hash_table_lookup(hash, "stonith-watchdog-timeout"); - crm_xml_add(data, F_LRMD_WATCHDOG, value); + if ((value) && + (stonith__watchdog_fencing_enabled_for_node(native->remote_nodename))) { + crm_xml_add(data, 
F_LRMD_WATCHDOG, value); + } rc = lrmd_send_command(lrmd, LRMD_OP_CHECK, data, NULL, 0, 0, (native->type == pcmk__client_ipc)); diff --git a/rpm/pacemaker.spec.in b/rpm/pacemaker.spec.in index 79e78ede9..f58357a77 100644 --- a/rpm/pacemaker.spec.in +++ b/rpm/pacemaker.spec.in @@ -744,6 +744,7 @@ exit 0 %doc %{_mandir}/man8/crm_attribute.* %doc %{_mandir}/man8/crm_master.* %doc %{_mandir}/man8/fence_legacy.* +%doc %{_mandir}/man8/fence_watchdog.* %doc %{_mandir}/man8/pacemakerd.* %doc %{_datadir}/pacemaker/alerts @@ -796,6 +797,7 @@ exit 0 %{_sbindir}/crm_simulate %{_sbindir}/crm_report %{_sbindir}/crm_ticket +%{_sbindir}/fence_watchdog %{_sbindir}/stonith_admin # "dirname" is owned by -schemas, which is a prerequisite %{_datadir}/pacemaker/report.collector @@ -822,6 +824,7 @@ exit 0 %exclude %{_mandir}/man8/crm_attribute.* %exclude %{_mandir}/man8/crm_master.* %exclude %{_mandir}/man8/fence_legacy.* +%exclude %{_mandir}/man8/fence_watchdog.* %exclude %{_mandir}/man8/pacemakerd.* %exclude %{_mandir}/man8/pacemaker-remoted.* -- 2.27.0 From 53dd360f096e5f005e3221e8d44d82d3654b5172 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Wed, 4 Aug 2021 15:57:23 +0200 Subject: [PATCH 3/3] Fix: watchdog-fencing: Silence warning without node restriction --- lib/fencing/st_client.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c index 0ff98157b..14fa7b2a6 100644 --- a/lib/fencing/st_client.c +++ b/lib/fencing/st_client.c @@ -223,7 +223,6 @@ stonith__watchdog_fencing_enabled_for_node_api(stonith_t *st, const char *node) */ crm_warn("watchdog-fencing-query failed"); } else if (list[0] == '\0') { - crm_warn("watchdog-fencing-query returned an empty list - any node"); rv = TRUE; } else { GList *targets = stonith__parse_targets(list); -- 2.27.0