From edd133ade2bd9b003d3437280271a9c9dbab3ed6 Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Thu, 23 May 2019 16:36:12 -0500 Subject: [PATCH] Refactor: controller: separate fencing-related functionality into own source file Before: 748 daemons/controld/controld_te_actions.c 942 daemons/controld/controld_te_callbacks.c 725 daemons/controld/controld_te_utils.c 84 daemons/controld/controld_transition.h 110 daemons/controld/controld_utils.h After: 838 daemons/controld/controld_fencing.c 37 daemons/controld/controld_fencing.h 631 daemons/controld/controld_te_actions.c 701 daemons/controld/controld_te_callbacks.c 298 daemons/controld/controld_te_utils.c 65 daemons/controld/controld_transition.h 106 daemons/controld/controld_utils.h --- daemons/controld/Makefile.am | 5 +- daemons/controld/controld_callbacks.c | 3 +- daemons/controld/controld_control.c | 2 +- daemons/controld/controld_election.c | 3 +- daemons/controld/controld_fencing.c | 838 +++++++++++++++++++++++++++++++ daemons/controld/controld_fencing.h | 37 ++ daemons/controld/controld_fsa.c | 1 + daemons/controld/controld_messages.c | 1 + daemons/controld/controld_te_actions.c | 121 +---- daemons/controld/controld_te_callbacks.c | 243 +-------- daemons/controld/controld_te_utils.c | 429 +--------------- daemons/controld/controld_transition.c | 1 - daemons/controld/controld_transition.h | 21 +- daemons/controld/controld_utils.h | 4 - 14 files changed, 891 insertions(+), 818 deletions(-) create mode 100644 daemons/controld/controld_fencing.c create mode 100644 daemons/controld/controld_fencing.h diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am index 17c3342..858e1bb 100644 --- a/daemons/controld/Makefile.am +++ b/daemons/controld/Makefile.am @@ -1,5 +1,7 @@ # -# Copyright 2004-2018 Andrew Beekhof +# Copyright 2018-2019 the Pacemaker project contributors +# +# The version control history for this file may have further details. 
# # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. @@ -46,6 +48,7 @@ pacemaker_controld_SOURCES = pacemaker-controld.c \ controld_election.c \ controld_execd.c \ controld_execd_state.c \ + controld_fencing.c \ controld_fsa.c \ controld_join_client.c \ controld_join_dc.c \ diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c index a188263..06ffb9d 100644 --- a/daemons/controld/controld_callbacks.c +++ b/daemons/controld/controld_callbacks.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2018 Andrew Beekhof + * Copyright 2004-2019 the Pacemaker project contributors * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index 6d9f335..7f918c0 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -147,7 +148,6 @@ extern char *max_generation_from; extern xmlNode *max_generation_xml; extern GHashTable *resource_history; extern GHashTable *voted; -extern char *te_client_id; void crmd_fast_exit(crm_exit_t exit_code) diff --git a/daemons/controld/controld_election.c b/daemons/controld/controld_election.c index 5d6858c..9e49c7b 100644 --- a/daemons/controld/controld_election.c +++ b/daemons/controld/controld_election.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2019 Andrew Beekhof + * Copyright 2004-2019 the Pacemaker project contributors * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. 
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c new file mode 100644 index 0000000..cde57b5 --- /dev/null +++ b/daemons/controld/controld_fencing.c @@ -0,0 +1,838 @@ +/* + * Copyright 2004-2019 the Pacemaker project contributors + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef HAVE_SYS_REBOOT_H +# include +# include +#endif + +/* + * stonith failure counting + * + * We don't want to get stuck in a permanent fencing loop. Keep track of the + * number of fencing failures for each target node, and the most we'll restart a + * transition for. + */ + +struct st_fail_rec { + int count; +}; + +static unsigned long int stonith_max_attempts = 10; +static GHashTable *stonith_failures = NULL; + +void +update_stonith_max_attempts(const char *value) +{ + if (safe_str_eq(value, CRM_INFINITY_S)) { + stonith_max_attempts = CRM_SCORE_INFINITY; + } else { + stonith_max_attempts = crm_int_helper(value, NULL); + } +} + +static gboolean +too_many_st_failures(const char *target) +{ + GHashTableIter iter; + const char *key = NULL; + struct st_fail_rec *value = NULL; + + if (stonith_failures == NULL) { + return FALSE; + } + + if (target == NULL) { + g_hash_table_iter_init(&iter, stonith_failures); + while (g_hash_table_iter_next(&iter, (gpointer *) &key, + (gpointer *) &value)) { + + if (value->count >= stonith_max_attempts) { + target = (const char*)key; + goto too_many; + } + } + } else { + value = g_hash_table_lookup(stonith_failures, target); + if ((value != NULL) && (value->count >= stonith_max_attempts)) { + goto too_many; + } + } + return FALSE; + +too_many: + crm_warn("Too many failures (%d) to fence %s, giving up", + value->count, target); + return TRUE; +} + +/*! 
+ * \internal + * \brief Reset a stonith fail count + * + * \param[in] target Name of node to reset, or NULL for all + */ +void +st_fail_count_reset(const char *target) +{ + if (stonith_failures == NULL) { + return; + } + + if (target) { + struct st_fail_rec *rec = NULL; + + rec = g_hash_table_lookup(stonith_failures, target); + if (rec) { + rec->count = 0; + } + } else { + GHashTableIter iter; + const char *key = NULL; + struct st_fail_rec *rec = NULL; + + g_hash_table_iter_init(&iter, stonith_failures); + while (g_hash_table_iter_next(&iter, (gpointer *) &key, + (gpointer *) &rec)) { + rec->count = 0; + } + } +} + +static void +st_fail_count_increment(const char *target) +{ + struct st_fail_rec *rec = NULL; + + if (stonith_failures == NULL) { + stonith_failures = crm_str_table_new(); + } + + rec = g_hash_table_lookup(stonith_failures, target); + if (rec) { + rec->count++; + } else { + rec = malloc(sizeof(struct st_fail_rec)); + if(rec == NULL) { + return; + } + + rec->count = 1; + g_hash_table_insert(stonith_failures, strdup(target), rec); + } +} + +/* end stonith fail count functions */ + + +static void +cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, + void *user_data) +{ + if (rc < pcmk_ok) { + crm_err("Fencing update %d for %s: failed - %s (%d)", + call_id, (char *)user_data, pcmk_strerror(rc), rc); + crm_log_xml_warn(msg, "Failed update"); + abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL); + + } else { + crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data); + } +} + +static void +send_stonith_update(crm_action_t *action, const char *target, const char *uuid) +{ + int rc = pcmk_ok; + crm_node_t *peer = NULL; + + /* We (usually) rely on the membership layer to do node_update_cluster, + * and the peer status callback to do node_update_peer, because the node + * might have already rejoined before we get the stonith result here. 
+ */ + int flags = node_update_join | node_update_expected; + + /* zero out the node-status & remove all LRM status info */ + xmlNode *node_state = NULL; + + CRM_CHECK(target != NULL, return); + CRM_CHECK(uuid != NULL, return); + + /* Make sure the membership and join caches are accurate */ + peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY); + + CRM_CHECK(peer != NULL, return); + + if (peer->state == NULL) { + /* Usually, we rely on the membership layer to update the cluster state + * in the CIB. However, if the node has never been seen, do it here, so + * the node is not considered unclean. + */ + flags |= node_update_cluster; + } + + if (peer->uuid == NULL) { + crm_info("Recording uuid '%s' for node '%s'", uuid, target); + peer->uuid = strdup(uuid); + } + + crmd_peer_down(peer, TRUE); + + /* Generate a node state update for the CIB */ + node_state = create_node_state_update(peer, flags, NULL, __FUNCTION__); + + /* we have to mark whether or not remote nodes have already been fenced */ + if (peer->flags & crm_remote_node) { + time_t now = time(NULL); + char *now_s = crm_itoa(now); + crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s); + free(now_s); + } + + /* Force our known ID */ + crm_xml_add(node_state, XML_ATTR_UUID, uuid); + + rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state, + cib_quorum_override | cib_scope_local | cib_can_create); + + /* Delay processing the trigger until the update completes */ + crm_debug("Sending fencing update %d for %s", rc, target); + fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated); + + /* Make sure it sticks */ + /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local); */ + + erase_status_tag(peer->uname, XML_CIB_TAG_LRM, cib_scope_local); + erase_status_tag(peer->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); + + free_xml(node_state); + return; +} + +/*! 
+ * \internal
+ * \brief Abort transition due to stonith failure
+ *
+ * \param[in] abort_action Whether to restart or stop transition
+ * \param[in] target Don't restart if this (NULL for any) has too many failures
+ * \param[in] reason Log this stonith action XML as abort reason (or NULL)
+ */
+static void
+abort_for_stonith_failure(enum transition_action abort_action,
+                          const char *target, xmlNode *reason)
+{
+    /* If stonith repeatedly fails, we eventually give up on starting a new
+     * transition for that reason.
+     */
+    if ((abort_action != tg_stop) && too_many_st_failures(target)) {
+        abort_action = tg_stop;
+    }
+    abort_transition(INFINITY, abort_action, "Stonith failed", reason);
+}
+
+
+/*
+ * stonith cleanup list
+ *
+ * If the DC is shot, proper notifications might not go out.
+ * The stonith cleanup list allows the cluster to (re-)send
+ * notifications once a new DC is elected.
+ */
+
+static GListPtr stonith_cleanup_list = NULL;
+
+/*!
+ * \internal
+ * \brief Add a node to the stonith cleanup list
+ *
+ * \param[in] target Name of node to add
+ */
+void
+add_stonith_cleanup(const char *target) {
+    stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target));
+}
+
+/*!
+ * \internal
+ * \brief Remove a node from the stonith cleanup list
+ *
+ * \param[in] target Name of node to remove
+ */
+void
+remove_stonith_cleanup(const char *target)
+{
+    GListPtr iter = stonith_cleanup_list;
+
+    while (iter != NULL) {
+        GListPtr tmp = iter;
+        char *iter_name = tmp->data;
+
+        iter = iter->next;
+        if (safe_str_eq(target, iter_name)) {
+            crm_trace("Removing %s from the cleanup list", iter_name);
+            stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
+            free(iter_name);
+        }
+    }
+}
+
+/*!
+ * \internal + * \brief Purge all entries from the stonith cleanup list + */ +void +purge_stonith_cleanup() +{ + if (stonith_cleanup_list) { + GListPtr iter = NULL; + + for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { + char *target = iter->data; + + crm_info("Purging %s from stonith cleanup list", target); + free(target); + } + g_list_free(stonith_cleanup_list); + stonith_cleanup_list = NULL; + } +} + +/*! + * \internal + * \brief Send stonith updates for all entries in cleanup list, then purge it + */ +void +execute_stonith_cleanup() +{ + GListPtr iter; + + for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { + char *target = iter->data; + crm_node_t *target_node = crm_get_peer(0, target); + const char *uuid = crm_peer_uuid(target_node); + + crm_notice("Marking %s, target of a previous stonith action, as clean", target); + send_stonith_update(NULL, target, uuid); + free(target); + } + g_list_free(stonith_cleanup_list); + stonith_cleanup_list = NULL; +} + +/* end stonith cleanup list functions */ + + +/* stonith API client + * + * Functions that need to interact directly with the fencer via its API + */ + +stonith_t *stonith_api = NULL; +crm_trigger_t *stonith_reconnect = NULL; +char *te_client_id = NULL; + +static gboolean +fail_incompletable_stonith(crm_graph_t *graph) +{ + GListPtr lpc = NULL; + const char *task = NULL; + xmlNode *last_action = NULL; + + if (graph == NULL) { + return FALSE; + } + + for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { + GListPtr lpc2 = NULL; + synapse_t *synapse = (synapse_t *) lpc->data; + + if (synapse->confirmed) { + continue; + } + + for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) { + crm_action_t *action = (crm_action_t *) lpc2->data; + + if (action->type != action_type_crm || action->confirmed) { + continue; + } + + task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); + if (task && safe_str_eq(task, CRM_OP_FENCE)) { + action->failed = TRUE; + last_action = 
action->xml; + update_graph(graph, action); + crm_notice("Failing action %d (%s): fencer terminated", + action->id, ID(action->xml)); + } + } + } + + if (last_action != NULL) { + crm_warn("Fencer failure resulted in unrunnable actions"); + abort_for_stonith_failure(tg_restart, NULL, last_action); + return TRUE; + } + + return FALSE; +} + +static void +tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) +{ + if (is_set(fsa_input_register, R_ST_REQUIRED)) { + crm_crit("Fencing daemon connection failed"); + mainloop_set_trigger(stonith_reconnect); + + } else { + crm_info("Fencing daemon disconnected"); + } + + if (stonith_api) { + stonith_api->state = stonith_disconnected; + } + + if (AM_I_DC) { + fail_incompletable_stonith(transition_graph); + trigger_graph(); + } +} + +static void +tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) +{ + if (te_client_id == NULL) { + te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, + (unsigned long) getpid()); + } + + if (st_event == NULL) { + crm_err("Notify data not found"); + return; + } + + crmd_alert_fencing_op(st_event); + + if ((st_event->result == pcmk_ok) && safe_str_eq("on", st_event->action)) { + crm_notice("%s was successfully unfenced by %s (at the request of %s)", + st_event->target, + st_event->executioner? st_event->executioner : "", + st_event->origin); + /* TODO: Hook up st_event->device */ + return; + + } else if (safe_str_eq("on", st_event->action)) { + crm_err("Unfencing of %s by %s failed: %s (%d)", + st_event->target, + st_event->executioner? st_event->executioner : "", + pcmk_strerror(st_event->result), st_event->result); + return; + + } else if ((st_event->result == pcmk_ok) + && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) { + + crm_crit("We were allegedly just fenced by %s for %s!", + st_event->executioner? 
st_event->executioner : "", + st_event->origin); /* Dumps blackbox if enabled */ + + qb_log_fini(); /* Try to get the above log message to disk - somehow */ + + /* Get out ASAP and do not come back up. + * + * Triggering a reboot is also not the worst idea either since + * the rest of the cluster thinks we're safely down + */ + +#ifdef RB_HALT_SYSTEM + reboot(RB_HALT_SYSTEM); +#endif + + /* + * If reboot() fails or is not supported, coming back up will + * probably lead to a situation where the other nodes set our + * status to 'lost' because of the fencing callback and will + * discard subsequent election votes with: + * + * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster) + * + * So just stay dead, something is seriously messed up anyway. + * + */ + exit(CRM_EX_FATAL); // None of our wrappers since we already called qb_log_fini() + return; + } + + /* Update the count of stonith failures for this target, in case we become + * DC later. The current DC has already updated its fail count in + * tengine_stonith_callback(). + */ + if (!AM_I_DC && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) { + if (st_event->result == pcmk_ok) { + st_fail_count_reset(st_event->target); + } else { + st_fail_count_increment(st_event->target); + } + } + + crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " + CRM_XS " initiator=%s ref=%s", + st_event->target, st_event->result == pcmk_ok ? "" : " not", + st_event->action, + st_event->executioner ? st_event->executioner : "", + (st_event->client_origin? 
st_event->client_origin : ""), + pcmk_strerror(st_event->result), + st_event->origin, st_event->id); + + if (st_event->result == pcmk_ok) { + crm_node_t *peer = crm_find_known_peer_full(0, st_event->target, CRM_GET_PEER_ANY); + const char *uuid = NULL; + gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname); + + if (peer == NULL) { + return; + } + + uuid = crm_peer_uuid(peer); + + crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); + if(AM_I_DC) { + /* The DC always sends updates */ + send_stonith_update(NULL, st_event->target, uuid); + + /* @TODO Ideally, at this point, we'd check whether the fenced node + * hosted any guest nodes, and call remote_node_down() for them. + * Unfortunately, the controller doesn't have a simple, reliable way + * to map hosts to guests. It might be possible to track this in the + * peer cache via crm_remote_peer_cache_refresh(). For now, we rely + * on the PE creating fence pseudo-events for the guests. + */ + + if (st_event->client_origin + && safe_str_neq(st_event->client_origin, te_client_id)) { + + /* Abort the current transition graph if it wasn't us + * that invoked stonith to fence someone + */ + crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); + abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); + } + + /* Assume it was our leader if we don't currently have one */ + } else if (((fsa_our_dc == NULL) || safe_str_eq(fsa_our_dc, st_event->target)) + && is_not_set(peer->flags, crm_remote_node)) { + + crm_notice("Target %s our leader %s (recorded: %s)", + fsa_our_dc ? "was" : "may have been", st_event->target, + fsa_our_dc ? 
fsa_our_dc : ""); + + /* Given the CIB resyncing that occurs around elections, + * have one node update the CIB now and, if the new DC is different, + * have them do so too after the election + */ + if (we_are_executioner) { + send_stonith_update(NULL, st_event->target, uuid); + } + add_stonith_cleanup(st_event->target); + } + + /* If the target is a remote node, and we host its connection, + * immediately fail all monitors so it can be recovered quickly. + * The connection won't necessarily drop when a remote node is fenced, + * so the failure might not otherwise be detected until the next poke. + */ + if (is_set(peer->flags, crm_remote_node)) { + remote_ra_fail(st_event->target); + } + + crmd_peer_down(peer, TRUE); + } +} + +/*! + * \brief Connect to fencer + * + * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop + * + * \return TRUE + * \note If user_data is NULL, this will wait 2s between attempts, for up to + * 30 attempts, meaning the controller could be blocked as long as 58s. 
+ */ +gboolean +te_connect_stonith(gpointer user_data) +{ + int rc = pcmk_ok; + + if (stonith_api == NULL) { + stonith_api = stonith_api_new(); + } + + if (stonith_api->state != stonith_disconnected) { + crm_trace("Already connected to fencer, no need to retry"); + return TRUE; + } + + if (user_data == NULL) { + // Blocking (retry failures now until successful) + rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30); + if (rc != pcmk_ok) { + crm_err("Could not connect to fencer in 30 attempts: %s " + CRM_XS " rc=%d", pcmk_strerror(rc), rc); + } + } else { + // Non-blocking (retry failures later in main loop) + rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); + if (rc != pcmk_ok) { + if (is_set(fsa_input_register, R_ST_REQUIRED)) { + crm_err("Fencer connection failed (will retry): %s " + CRM_XS " rc=%d", pcmk_strerror(rc), rc); + mainloop_set_trigger(stonith_reconnect); + } else { + crm_info("Fencer connection failed (ignoring because no longer required): %s " + CRM_XS " rc=%d", pcmk_strerror(rc), rc); + } + return TRUE; + } + } + + if (rc == pcmk_ok) { + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_DISCONNECT, + tengine_stonith_connection_destroy); + stonith_api->cmds->register_notification(stonith_api, + T_STONITH_NOTIFY_FENCE, + tengine_stonith_notify); + } + return TRUE; +} + +static gboolean +do_stonith_history_sync(gpointer user_data) +{ + if (stonith_api && (stonith_api->state != stonith_disconnected)) { + stonith_history_t *history = NULL; + + stonith_api->cmds->history(stonith_api, + st_opt_sync_call | st_opt_broadcast, + NULL, &history, 5); + stonith_history_free(history); + return TRUE; + } else { + crm_info("Skip triggering stonith history-sync as stonith is disconnected"); + return FALSE; + } +} + +static void +tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) +{ + char *uuid = NULL; + int stonith_id = -1; + int transition_id = -1; + crm_action_t *action = NULL; + int 
call_id = data->call_id; + int rc = data->rc; + char *userdata = data->userdata; + + CRM_CHECK(userdata != NULL, return); + crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata, + pcmk_strerror(rc), rc); + + if (AM_I_DC == FALSE) { + return; + } + + /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */ + /* op->call_id, op->optype, op->node_name, op->op_result, */ + /* (char *)op->node_list, op->private_data); */ + + /* filter out old STONITH actions */ + CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL), + goto bail); + + if (transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid) + || transition_graph->id != transition_id) { + crm_info("Ignoring STONITH action initiated outside of the current transition"); + goto bail; + } + + action = get_action(stonith_id, FALSE); + if (action == NULL) { + crm_err("Stonith action not matched"); + goto bail; + } + + stop_te_timer(action->timer); + if (rc == pcmk_ok) { + const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); + const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); + const char *op = crm_meta_value(action->params, "stonith_action"); + + crm_info("Stonith operation %d for %s passed", call_id, target); + if (action->confirmed == FALSE) { + te_action_confirmed(action); + if (safe_str_eq("on", op)) { + const char *value = NULL; + char *now = crm_itoa(time(NULL)); + + update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, FALSE); + free(now); + + value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL); + update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, FALSE); + + value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE); + update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, FALSE); + + } else if (action->sent_update == FALSE) { + send_stonith_update(action, target, uuid); + action->sent_update = TRUE; + } + } + st_fail_count_reset(target); + + } 
else { + const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); + enum transition_action abort_action = tg_restart; + + action->failed = TRUE; + crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", + call_id, target, pcmk_strerror(rc)); + + /* If no fence devices were available, there's no use in immediately + * checking again, so don't start a new transition in that case. + */ + if (rc == -ENODEV) { + crm_warn("No devices found in cluster to fence %s, giving up", + target); + abort_action = tg_stop; + } + + /* Increment the fail count now, so abort_for_stonith_failure() can + * check it. Non-DC nodes will increment it in tengine_stonith_notify(). + */ + st_fail_count_increment(target); + abort_for_stonith_failure(abort_action, target, NULL); + } + + update_graph(transition_graph, action); + trigger_graph(); + + bail: + free(userdata); + free(uuid); + return; +} + +gboolean +te_fence_node(crm_graph_t *graph, crm_action_t *action) +{ + int rc = 0; + const char *id = NULL; + const char *uuid = NULL; + const char *target = NULL; + const char *type = NULL; + gboolean invalid_action = FALSE; + enum stonith_call_options options = st_opt_none; + + id = ID(action->xml); + target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); + uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); + type = crm_meta_value(action->params, "stonith_action"); + + CRM_CHECK(id != NULL, invalid_action = TRUE); + CRM_CHECK(uuid != NULL, invalid_action = TRUE); + CRM_CHECK(type != NULL, invalid_action = TRUE); + CRM_CHECK(target != NULL, invalid_action = TRUE); + + if (invalid_action) { + crm_log_xml_warn(action->xml, "BadAction"); + return FALSE; + } + + crm_notice("Requesting fencing (%s) of node %s " + CRM_XS " action=%s timeout=%d", + type, target, id, transition_graph->stonith_timeout); + + /* Passing NULL means block until we can connect... 
*/ + te_connect_stonith(NULL); + + if (crmd_join_phase_count(crm_join_confirmed) == 1) { + options |= st_opt_allow_suicide; + } + + rc = stonith_api->cmds->fence(stonith_api, options, target, type, + transition_graph->stonith_timeout / 1000, 0); + + stonith_api->cmds->register_callback(stonith_api, rc, transition_graph->stonith_timeout / 1000, + st_opt_timeout_updates, + generate_transition_key(transition_graph->id, action->id, + 0, te_uuid), + "tengine_stonith_callback", tengine_stonith_callback); + + return TRUE; +} + +/* end stonith API client functions */ + + +/* + * stonith history synchronization + * + * Each node's fencer keeps track of a cluster-wide fencing history. When a node + * joins or leaves, we need to synchronize the history across all nodes. + */ + +static crm_trigger_t *stonith_history_sync_trigger = NULL; +static mainloop_timer_t *stonith_history_sync_timer = NULL; + +static gboolean +stonith_history_sync_set_trigger(gpointer user_data) +{ + mainloop_set_trigger(stonith_history_sync_trigger); + return FALSE; +} + +void +te_trigger_stonith_history_sync(void) +{ + /* trigger a sync in 5s to give more nodes the + * chance to show up so that we don't create + * unnecessary stonith-history-sync traffic + */ + + /* as we are finally checking the stonith-connection + * in do_stonith_history_sync we should be fine + * leaving stonith_history_sync_time & stonith_history_sync_trigger + * around + */ + if (stonith_history_sync_trigger == NULL) { + stonith_history_sync_trigger = + mainloop_add_trigger(G_PRIORITY_LOW, + do_stonith_history_sync, NULL); + } + + if(stonith_history_sync_timer == NULL) { + stonith_history_sync_timer = + mainloop_timer_add("history_sync", 5000, + FALSE, stonith_history_sync_set_trigger, + NULL); + } + crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); + mainloop_timer_start(stonith_history_sync_timer); +} + +/* end stonith history synchronization functions */ diff --git 
a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h new file mode 100644 index 0000000..b80a6c9 --- /dev/null +++ b/daemons/controld/controld_fencing.h @@ -0,0 +1,37 @@ +/* + * Copyright 2004-2019 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. + */ + +#ifndef CONTROLD_FENCING__H +# define CONTROLD_FENCING__H + +#include // bool +#include // crm_graph_t, crm_action_t + +extern crm_trigger_t *stonith_reconnect; +extern char *te_client_id; +extern stonith_t *stonith_api; + +// stonith fail counts +void st_fail_count_reset(const char * target); +void update_stonith_max_attempts(const char* value); + +// stonith API client +gboolean te_connect_stonith(gpointer user_data); +gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action); + +// stonith cleanup list +void add_stonith_cleanup(const char *target); +void remove_stonith_cleanup(const char *target); +void purge_stonith_cleanup(void); +void execute_stonith_cleanup(void); + +// stonith history synchronization +void te_trigger_stonith_history_sync(void); + +#endif diff --git a/daemons/controld/controld_fsa.c b/daemons/controld/controld_fsa.c index 9eca530..dc1937f 100644 --- a/daemons/controld/controld_fsa.c +++ b/daemons/controld/controld_fsa.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c index 2ebc203..8f37cbf 100644 --- a/daemons/controld/controld_messages.c +++ b/daemons/controld/controld_messages.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c index c95c6c7..2f61556 100644 --- a/daemons/controld/controld_te_actions.c +++ 
b/daemons/controld/controld_te_actions.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2018 Andrew Beekhof + * Copyright 2004-2019 the Pacemaker project contributors * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -76,124 +77,6 @@ te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo) return TRUE; } -void -send_stonith_update(crm_action_t * action, const char *target, const char *uuid) -{ - int rc = pcmk_ok; - crm_node_t *peer = NULL; - - /* We (usually) rely on the membership layer to do node_update_cluster, - * and the peer status callback to do node_update_peer, because the node - * might have already rejoined before we get the stonith result here. - */ - int flags = node_update_join | node_update_expected; - - /* zero out the node-status & remove all LRM status info */ - xmlNode *node_state = NULL; - - CRM_CHECK(target != NULL, return); - CRM_CHECK(uuid != NULL, return); - - /* Make sure the membership and join caches are accurate */ - peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY); - - CRM_CHECK(peer != NULL, return); - - if (peer->state == NULL) { - /* Usually, we rely on the membership layer to update the cluster state - * in the CIB. However, if the node has never been seen, do it here, so - * the node is not considered unclean. 
- */ - flags |= node_update_cluster; - } - - if (peer->uuid == NULL) { - crm_info("Recording uuid '%s' for node '%s'", uuid, target); - peer->uuid = strdup(uuid); - } - - crmd_peer_down(peer, TRUE); - - /* Generate a node state update for the CIB */ - node_state = create_node_state_update(peer, flags, NULL, __FUNCTION__); - - /* we have to mark whether or not remote nodes have already been fenced */ - if (peer->flags & crm_remote_node) { - time_t now = time(NULL); - char *now_s = crm_itoa(now); - crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s); - free(now_s); - } - - /* Force our known ID */ - crm_xml_add(node_state, XML_ATTR_UUID, uuid); - - rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state, - cib_quorum_override | cib_scope_local | cib_can_create); - - /* Delay processing the trigger until the update completes */ - crm_debug("Sending fencing update %d for %s", rc, target); - fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated); - - /* Make sure it sticks */ - /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local); */ - - erase_status_tag(peer->uname, XML_CIB_TAG_LRM, cib_scope_local); - erase_status_tag(peer->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); - - free_xml(node_state); - return; -} - -static gboolean -te_fence_node(crm_graph_t * graph, crm_action_t * action) -{ - int rc = 0; - const char *id = NULL; - const char *uuid = NULL; - const char *target = NULL; - const char *type = NULL; - gboolean invalid_action = FALSE; - enum stonith_call_options options = st_opt_none; - - id = ID(action->xml); - target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); - uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); - type = crm_meta_value(action->params, "stonith_action"); - - CRM_CHECK(id != NULL, invalid_action = TRUE); - CRM_CHECK(uuid != NULL, invalid_action = TRUE); - CRM_CHECK(type != NULL, invalid_action = TRUE); - CRM_CHECK(target != NULL, 
invalid_action = TRUE); - - if (invalid_action) { - crm_log_xml_warn(action->xml, "BadAction"); - return FALSE; - } - - crm_notice("Requesting fencing (%s) of node %s " - CRM_XS " action=%s timeout=%d", - type, target, id, transition_graph->stonith_timeout); - - /* Passing NULL means block until we can connect... */ - te_connect_stonith(NULL); - - if (crmd_join_phase_count(crm_join_confirmed) == 1) { - options |= st_opt_allow_suicide; - } - - rc = stonith_api->cmds->fence(stonith_api, options, target, type, - transition_graph->stonith_timeout / 1000, 0); - - stonith_api->cmds->register_callback(stonith_api, rc, transition_graph->stonith_timeout / 1000, - st_opt_timeout_updates, - generate_transition_key(transition_graph->id, action->id, - 0, te_uuid), - "tengine_stonith_callback", tengine_stonith_callback); - - return TRUE; -} - static int get_target_rc(crm_action_t * action) { diff --git a/daemons/controld/controld_te_callbacks.c b/daemons/controld/controld_te_callbacks.c index 22b5f4b..1ab703f 100644 --- a/daemons/controld/controld_te_callbacks.c +++ b/daemons/controld/controld_te_callbacks.c @@ -17,6 +17,7 @@ #include #include +#include #include /* For ONLINESTATUS etc */ @@ -27,21 +28,9 @@ gboolean shuttingdown = FALSE; crm_graph_t *transition_graph; crm_trigger_t *transition_trigger = NULL; -static unsigned long int stonith_max_attempts = 10; - /* #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */ #define RSC_OP_TEMPLATE "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']" -void -update_stonith_max_attempts(const char* value) -{ - if (safe_str_eq(value, CRM_INFINITY_S)) { - stonith_max_attempts = CRM_SCORE_INFINITY; - } - else { - stonith_max_attempts = crm_int_helper(value, NULL); - } -} static void te_update_diff_v1(const char *event, xmlNode *diff) { @@ -646,236 +635,6 @@ process_te_message(xmlNode * msg, xmlNode * xml_data) return TRUE; } -GHashTable 
*stonith_failures = NULL; -struct st_fail_rec { - int count; -}; - -static gboolean -too_many_st_failures(const char *target) -{ - GHashTableIter iter; - const char *key = NULL; - struct st_fail_rec *value = NULL; - - if (stonith_failures == NULL) { - return FALSE; - } - - if (target == NULL) { - g_hash_table_iter_init(&iter, stonith_failures); - while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) { - if (value->count >= stonith_max_attempts) { - target = (const char*)key; - goto too_many; - } - } - } else { - value = g_hash_table_lookup(stonith_failures, target); - if ((value != NULL) && (value->count >= stonith_max_attempts)) { - goto too_many; - } - } - return FALSE; - -too_many: - crm_warn("Too many failures (%d) to fence %s, giving up", - value->count, target); - return TRUE; -} - -/*! - * \internal - * \brief Reset a stonith fail count - * - * \param[in] target Name of node to reset, or NULL for all - */ -void -st_fail_count_reset(const char *target) -{ - if (stonith_failures == NULL) { - return; - } - - if (target) { - struct st_fail_rec *rec = NULL; - - rec = g_hash_table_lookup(stonith_failures, target); - if (rec) { - rec->count = 0; - } - } else { - GHashTableIter iter; - const char *key = NULL; - struct st_fail_rec *rec = NULL; - - g_hash_table_iter_init(&iter, stonith_failures); - while (g_hash_table_iter_next(&iter, (gpointer *) &key, - (gpointer *) &rec)) { - rec->count = 0; - } - } -} - -void -st_fail_count_increment(const char *target) -{ - struct st_fail_rec *rec = NULL; - - if (stonith_failures == NULL) { - stonith_failures = crm_str_table_new(); - } - - rec = g_hash_table_lookup(stonith_failures, target); - if (rec) { - rec->count++; - } else { - rec = malloc(sizeof(struct st_fail_rec)); - if(rec == NULL) { - return; - } - - rec->count = 1; - g_hash_table_insert(stonith_failures, strdup(target), rec); - } -} - -/*! 
- * \internal - * \brief Abort transition due to stonith failure - * - * \param[in] abort_action Whether to restart or stop transition - * \param[in] target Don't restart if this (NULL for any) has too many failures - * \param[in] reason Log this stonith action XML as abort reason (or NULL) - */ -void -abort_for_stonith_failure(enum transition_action abort_action, - const char *target, xmlNode *reason) -{ - /* If stonith repeatedly fails, we eventually give up on starting a new - * transition for that reason. - */ - if ((abort_action != tg_stop) && too_many_st_failures(target)) { - abort_action = tg_stop; - } - abort_transition(INFINITY, abort_action, "Stonith failed", reason); -} - -void -tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data) -{ - char *uuid = NULL; - int stonith_id = -1; - int transition_id = -1; - crm_action_t *action = NULL; - int call_id = data->call_id; - int rc = data->rc; - char *userdata = data->userdata; - - CRM_CHECK(userdata != NULL, return); - crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata, - pcmk_strerror(rc), rc); - - if (AM_I_DC == FALSE) { - return; - } - - /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */ - /* op->call_id, op->optype, op->node_name, op->op_result, */ - /* (char *)op->node_list, op->private_data); */ - - /* filter out old STONITH actions */ - CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL), - goto bail); - - if (transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid) - || transition_graph->id != transition_id) { - crm_info("Ignoring STONITH action initiated outside of the current transition"); - goto bail; - } - - action = get_action(stonith_id, FALSE); - if (action == NULL) { - crm_err("Stonith action not matched"); - goto bail; - } - - stop_te_timer(action->timer); - if (rc == pcmk_ok) { - const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); - const 
char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); - const char *op = crm_meta_value(action->params, "stonith_action"); - - crm_info("Stonith operation %d for %s passed", call_id, target); - if (action->confirmed == FALSE) { - te_action_confirmed(action); - if (safe_str_eq("on", op)) { - const char *value = NULL; - char *now = crm_itoa(time(NULL)); - - update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, FALSE); - free(now); - - value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL); - update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, FALSE); - - value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE); - update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, FALSE); - - } else if (action->sent_update == FALSE) { - send_stonith_update(action, target, uuid); - action->sent_update = TRUE; - } - } - st_fail_count_reset(target); - - } else { - const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); - enum transition_action abort_action = tg_restart; - - action->failed = TRUE; - crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", - call_id, target, pcmk_strerror(rc)); - - /* If no fence devices were available, there's no use in immediately - * checking again, so don't start a new transition in that case. - */ - if (rc == -ENODEV) { - crm_warn("No devices found in cluster to fence %s, giving up", - target); - abort_action = tg_stop; - } - - /* Increment the fail count now, so abort_for_stonith_failure() can - * check it. Non-DC nodes will increment it in tengine_stonith_notify(). 
- */ - st_fail_count_increment(target); - abort_for_stonith_failure(abort_action, target, NULL); - } - - update_graph(transition_graph, action); - trigger_graph(); - - bail: - free(userdata); - free(uuid); - return; -} - -void -cib_fencing_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) -{ - if (rc < pcmk_ok) { - crm_err("Fencing update %d for %s: failed - %s (%d)", - call_id, (char *)user_data, pcmk_strerror(rc), rc); - crm_log_xml_warn(msg, "Failed update"); - abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL); - - } else { - crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data); - } -} - void cib_action_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c index 22f83ad..1496244 100644 --- a/daemons/controld/controld_te_utils.c +++ b/daemons/controld/controld_te_utils.c @@ -6,441 +6,14 @@ */ #include - -#include #include - #include - #include + #include #include -#include #include #include -#include - -crm_trigger_t *stonith_reconnect = NULL; -static crm_trigger_t *stonith_history_sync_trigger = NULL; -static mainloop_timer_t *stonith_history_sync_timer = NULL; - -/* - * stonith cleanup list - * - * If the DC is shot, proper notifications might not go out. - * The stonith cleanup list allows the cluster to (re-)send - * notifications once a new DC is elected. - */ - -static GListPtr stonith_cleanup_list = NULL; - -/*! - * \internal - * \brief Add a node to the stonith cleanup list - * - * \param[in] target Name of node to add - */ -void -add_stonith_cleanup(const char *target) { - stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target)); -} - -/*! 
- * \internal - * \brief Remove a node from the stonith cleanup list - * - * \param[in] Name of node to remove - */ -void -remove_stonith_cleanup(const char *target) -{ - GListPtr iter = stonith_cleanup_list; - - while (iter != NULL) { - GListPtr tmp = iter; - char *iter_name = tmp->data; - - iter = iter->next; - if (safe_str_eq(target, iter_name)) { - crm_trace("Removing %s from the cleanup list", iter_name); - stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp); - free(iter_name); - } - } -} - -/*! - * \internal - * \brief Purge all entries from the stonith cleanup list - */ -void -purge_stonith_cleanup() -{ - if (stonith_cleanup_list) { - GListPtr iter = NULL; - - for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { - char *target = iter->data; - - crm_info("Purging %s from stonith cleanup list", target); - free(target); - } - g_list_free(stonith_cleanup_list); - stonith_cleanup_list = NULL; - } -} - -/*! - * \internal - * \brief Send stonith updates for all entries in cleanup list, then purge it - */ -void -execute_stonith_cleanup() -{ - GListPtr iter; - - for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) { - char *target = iter->data; - crm_node_t *target_node = crm_get_peer(0, target); - const char *uuid = crm_peer_uuid(target_node); - - crm_notice("Marking %s, target of a previous stonith action, as clean", target); - send_stonith_update(NULL, target, uuid); - free(target); - } - g_list_free(stonith_cleanup_list); - stonith_cleanup_list = NULL; -} - -/* end stonith cleanup list functions */ - -static gboolean -fail_incompletable_stonith(crm_graph_t * graph) -{ - GListPtr lpc = NULL; - const char *task = NULL; - xmlNode *last_action = NULL; - - if (graph == NULL) { - return FALSE; - } - - for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) { - GListPtr lpc2 = NULL; - synapse_t *synapse = (synapse_t *) lpc->data; - - if (synapse->confirmed) { - continue; - } - - for (lpc2 = synapse->actions; lpc2 != NULL; 
lpc2 = lpc2->next) { - crm_action_t *action = (crm_action_t *) lpc2->data; - - if (action->type != action_type_crm || action->confirmed) { - continue; - } - - task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); - if (task && safe_str_eq(task, CRM_OP_FENCE)) { - action->failed = TRUE; - last_action = action->xml; - update_graph(graph, action); - crm_notice("Failing action %d (%s): fencer terminated", - action->id, ID(action->xml)); - } - } - } - - if (last_action != NULL) { - crm_warn("Fencer failure resulted in unrunnable actions"); - abort_for_stonith_failure(tg_restart, NULL, last_action); - return TRUE; - } - - return FALSE; -} - -static void -tengine_stonith_connection_destroy(stonith_t * st, stonith_event_t * e) -{ - if (is_set(fsa_input_register, R_ST_REQUIRED)) { - crm_crit("Fencing daemon connection failed"); - mainloop_set_trigger(stonith_reconnect); - - } else { - crm_info("Fencing daemon disconnected"); - } - - /* cbchan will be garbage at this point, arrange for it to be reset */ - if(stonith_api) { - stonith_api->state = stonith_disconnected; - } - - if (AM_I_DC) { - fail_incompletable_stonith(transition_graph); - trigger_graph(); - } -} - -char *te_client_id = NULL; - -#ifdef HAVE_SYS_REBOOT_H -# include -# include -#endif - -static void -tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event) -{ - if(te_client_id == NULL) { - te_client_id = crm_strdup_printf("%s.%lu", crm_system_name, - (unsigned long) getpid()); - } - - if (st_event == NULL) { - crm_err("Notify data not found"); - return; - } - - crmd_alert_fencing_op(st_event); - - if (st_event->result == pcmk_ok && safe_str_eq("on", st_event->action)) { - crm_notice("%s was successfully unfenced by %s (at the request of %s)", - st_event->target, st_event->executioner ? 
st_event->executioner : "", st_event->origin); - /* TODO: Hook up st_event->device */ - return; - - } else if (safe_str_eq("on", st_event->action)) { - crm_err("Unfencing of %s by %s failed: %s (%d)", - st_event->target, st_event->executioner ? st_event->executioner : "", - pcmk_strerror(st_event->result), st_event->result); - return; - - } else if (st_event->result == pcmk_ok && crm_str_eq(st_event->target, fsa_our_uname, TRUE)) { - crm_crit("We were allegedly just fenced by %s for %s!", - st_event->executioner ? st_event->executioner : "", st_event->origin); /* Dumps blackbox if enabled */ - - qb_log_fini(); /* Try to get the above log message to disk - somehow */ - - /* Get out ASAP and do not come back up. - * - * Triggering a reboot is also not the worst idea either since - * the rest of the cluster thinks we're safely down - */ - -#ifdef RB_HALT_SYSTEM - reboot(RB_HALT_SYSTEM); -#endif - - /* - * If reboot() fails or is not supported, coming back up will - * probably lead to a situation where the other nodes set our - * status to 'lost' because of the fencing callback and will - * discard subsequent election votes with: - * - * Election 87 (current: 5171, owner: 103): Processed vote from east-03 (Peer is not part of our cluster) - * - * So just stay dead, something is seriously messed up anyway. - * - */ - exit(CRM_EX_FATAL); // None of our wrappers since we already called qb_log_fini() - return; - } - - /* Update the count of stonith failures for this target, in case we become - * DC later. The current DC has already updated its fail count in - * tengine_stonith_callback(). - */ - if (!AM_I_DC && safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) { - if (st_event->result == pcmk_ok) { - st_fail_count_reset(st_event->target); - } else { - st_fail_count_increment(st_event->target); - } - } - - crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s " - CRM_XS " initiator=%s ref=%s", - st_event->target, st_event->result == pcmk_ok ? 
"" : " not", - st_event->action, - st_event->executioner ? st_event->executioner : "", - (st_event->client_origin? st_event->client_origin : ""), - pcmk_strerror(st_event->result), - st_event->origin, st_event->id); - - if (st_event->result == pcmk_ok) { - crm_node_t *peer = crm_find_known_peer_full(0, st_event->target, CRM_GET_PEER_ANY); - const char *uuid = NULL; - gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname); - - if (peer == NULL) { - return; - } - - uuid = crm_peer_uuid(peer); - - crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc); - if(AM_I_DC) { - /* The DC always sends updates */ - send_stonith_update(NULL, st_event->target, uuid); - - /* @TODO Ideally, at this point, we'd check whether the fenced node - * hosted any guest nodes, and call remote_node_down() for them. - * Unfortunately, the controller doesn't have a simple, reliable way - * to map hosts to guests. It might be possible to track this in the - * peer cache via crm_remote_peer_cache_refresh(). For now, we rely - * on the PE creating fence pseudo-events for the guests. - */ - - if (st_event->client_origin && safe_str_neq(st_event->client_origin, te_client_id)) { - - /* Abort the current transition graph if it wasn't us - * that invoked stonith to fence someone - */ - crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target); - abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL); - } - - /* Assume it was our leader if we don't currently have one */ - } else if (((fsa_our_dc == NULL) || safe_str_eq(fsa_our_dc, st_event->target)) - && !is_set(peer->flags, crm_remote_node)) { - - crm_notice("Target %s our leader %s (recorded: %s)", - fsa_our_dc ? "was" : "may have been", st_event->target, - fsa_our_dc ? 
fsa_our_dc : ""); - - /* Given the CIB resyncing that occurs around elections, - * have one node update the CIB now and, if the new DC is different, - * have them do so too after the election - */ - if (we_are_executioner) { - send_stonith_update(NULL, st_event->target, uuid); - } - add_stonith_cleanup(st_event->target); - } - - /* If the target is a remote node, and we host its connection, - * immediately fail all monitors so it can be recovered quickly. - * The connection won't necessarily drop when a remote node is fenced, - * so the failure might not otherwise be detected until the next poke. - */ - if (is_set(peer->flags, crm_remote_node)) { - remote_ra_fail(st_event->target); - } - - crmd_peer_down(peer, TRUE); - } -} - -static gboolean -do_stonith_history_sync(gpointer user_data) -{ - if (stonith_api && (stonith_api->state != stonith_disconnected)) { - stonith_history_t *history = NULL; - - stonith_api->cmds->history(stonith_api, - st_opt_sync_call | st_opt_broadcast, - NULL, &history, 5); - stonith_history_free(history); - return TRUE; - } else { - crm_info("Skip triggering stonith history-sync as stonith is disconnected"); - return FALSE; - } -} - -static gboolean -stonith_history_sync_set_trigger(gpointer user_data) -{ - mainloop_set_trigger(stonith_history_sync_trigger); - return FALSE; -} - -void -te_trigger_stonith_history_sync(void) -{ - /* trigger a sync in 5s to give more nodes the - * chance to show up so that we don't create - * unnecessary stonith-history-sync traffic - */ - - /* as we are finally checking the stonith-connection - * in do_stonith_history_sync we should be fine - * leaving stonith_history_sync_time & stonith_history_sync_trigger - * around - */ - if (stonith_history_sync_trigger == NULL) { - stonith_history_sync_trigger = - mainloop_add_trigger(G_PRIORITY_LOW, - do_stonith_history_sync, NULL); - } - - if(stonith_history_sync_timer == NULL) { - stonith_history_sync_timer = - mainloop_timer_add("history_sync", 5000, - FALSE, 
stonith_history_sync_set_trigger, - NULL); - } - crm_info("Fence history will be synchronized cluster-wide within 5 seconds"); - mainloop_timer_start(stonith_history_sync_timer); -} - -/*! - * \brief Connect to fencer - * - * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop - * - * \return TRUE - * \note If user_data is NULL, this will wait 2s between attempts, for up to - * 30 attempts, meaning the controller could be blocked as long as 58s. - */ -gboolean -te_connect_stonith(gpointer user_data) -{ - int rc = pcmk_ok; - - if (stonith_api == NULL) { - stonith_api = stonith_api_new(); - } - - if (stonith_api->state != stonith_disconnected) { - crm_trace("Already connected to fencer, no need to retry"); - return TRUE; - } - - if (user_data == NULL) { - // Blocking (retry failures now until successful) - rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30); - if (rc != pcmk_ok) { - crm_err("Could not connect to fencer in 30 attempts: %s " - CRM_XS " rc=%d", pcmk_strerror(rc), rc); - } - } else { - // Non-blocking (retry failures later in main loop) - rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); - if (rc != pcmk_ok) { - if (is_set(fsa_input_register, R_ST_REQUIRED)) { - crm_err("Fencer connection failed (will retry): %s " - CRM_XS " rc=%d", pcmk_strerror(rc), rc); - mainloop_set_trigger(stonith_reconnect); - } else { - crm_info("Fencer connection failed (ignoring because no longer required): %s " - CRM_XS " rc=%d", pcmk_strerror(rc), rc); - } - return TRUE; - } - } - - if (rc == pcmk_ok) { - stonith_api->cmds->register_notification(stonith_api, - T_STONITH_NOTIFY_DISCONNECT, - tengine_stonith_connection_destroy); - stonith_api->cmds->register_notification(stonith_api, - T_STONITH_NOTIFY_FENCE, - tengine_stonith_notify); - } - return TRUE; -} gboolean stop_te_timer(crm_action_timer_t * timer) diff --git a/daemons/controld/controld_transition.c b/daemons/controld/controld_transition.c index 
5f164ab..b942ab4 100644 --- a/daemons/controld/controld_transition.c +++ b/daemons/controld/controld_transition.c @@ -18,7 +18,6 @@ extern crm_graph_functions_t te_graph_fns; -stonith_t *stonith_api = NULL; static void global_cib_callback(const xmlNode * msg, int callid, int rc, xmlNode * output) diff --git a/daemons/controld/controld_transition.h b/daemons/controld/controld_transition.h index a162f99..f31ac2d 100644 --- a/daemons/controld/controld_transition.h +++ b/daemons/controld/controld_transition.h @@ -1,5 +1,5 @@ /* - * Copyright 2004-2018 Andrew Beekhof + * Copyright 2004-2019 the Pacemaker project contributors * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. @@ -12,15 +12,6 @@ # include # include # include -extern stonith_t *stonith_api; -extern void send_stonith_update(crm_action_t * stonith_action, const char *target, - const char *uuid); - -/* stonith cleanup list */ -void add_stonith_cleanup(const char *target); -void remove_stonith_cleanup(const char *target); -void purge_stonith_cleanup(void); -void execute_stonith_cleanup(void); /* tengine */ extern crm_action_t *match_down_event(const char *target); @@ -46,16 +37,11 @@ extern char *te_uuid; extern void notify_crmd(crm_graph_t * graph); -void cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, - void *user_data); void cib_action_updated(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data); gboolean action_timer_callback(gpointer data); gboolean te_graph_trigger(gpointer user_data); void te_update_diff(const char *event, xmlNode *msg); -void tengine_stonith_callback(stonith_t *stonith, - stonith_callback_data_t *data); -void update_stonith_max_attempts(const char* value); extern void trigger_graph_processing(const char *fn, int line); void abort_after_delay(int abort_priority, enum transition_action abort_action, @@ -68,12 +54,7 @@ extern void abort_transition_graph(int 
abort_priority, enum transition_action ab # define abort_transition(pri, action, text, reason) \ abort_transition_graph(pri, action, text, reason,__FUNCTION__,__LINE__); -extern gboolean te_connect_stonith(gpointer user_data); - -extern void te_trigger_stonith_history_sync(void); - extern crm_trigger_t *transition_trigger; -extern crm_trigger_t *stonith_reconnect; extern char *failed_stop_offset; extern char *failed_start_offset; diff --git a/daemons/controld/controld_utils.h b/daemons/controld/controld_utils.h index 68992f5..8b80e3c 100644 --- a/daemons/controld/controld_utils.h +++ b/daemons/controld/controld_utils.h @@ -85,10 +85,6 @@ int crmd_join_phase_count(enum crm_join_phase phase); void crmd_join_phase_log(int level); const char *get_timer_desc(fsa_timer_t * timer); -void st_fail_count_reset(const char * target); -void st_fail_count_increment(const char *target); -void abort_for_stonith_failure(enum transition_action abort_action, - const char *target, xmlNode *reason); void crmd_peer_down(crm_node_t *peer, bool full); unsigned int cib_op_timeout(void); -- 1.8.3.1 From 3002e485651e1ad18da6d44e7672dbe4f0380d3b Mon Sep 17 00:00:00 2001 From: Ken Gaillot Date: Thu, 23 May 2019 18:18:06 -0500 Subject: [PATCH] Refactor: controller: isolate stonith API handling can now make more variables and functions static --- daemons/controld/controld_control.c | 28 +++------------------ daemons/controld/controld_fencing.c | 49 ++++++++++++++++++++++++++++++++++--- daemons/controld/controld_fencing.h | 7 ++---- 3 files changed, 50 insertions(+), 34 deletions(-) diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index 7f918c0..e99d605 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -113,14 +113,7 @@ do_shutdown(long long action, { /* just in case */ set_bit(fsa_input_register, R_SHUTDOWN); - - if (stonith_api) { - /* Prevent it from coming up again */ - clear_bit(fsa_input_register, 
R_ST_REQUIRED); - - crm_info("Disconnecting from fencer"); - stonith_api->cmds->disconnect(stonith_api); - } + controld_disconnect_fencer(FALSE); } /* A_SHUTDOWN_REQ */ @@ -201,12 +194,7 @@ crmd_exit(crm_exit_t exit_code) controld_close_attrd_ipc(); pe_subsystem_free(); - - if(stonith_api) { - crm_trace("Disconnecting fencing API"); - clear_bit(fsa_input_register, R_ST_REQUIRED); - stonith_api->cmds->free(stonith_api); stonith_api = NULL; - } + controld_disconnect_fencer(TRUE); if ((exit_code == CRM_EX_OK) && (crmd_mainloop == NULL)) { crm_debug("No mainloop detected"); @@ -258,7 +246,6 @@ crmd_exit(crm_exit_t exit_code) mainloop_destroy_trigger(fsa_source); fsa_source = NULL; mainloop_destroy_trigger(config_read); config_read = NULL; - mainloop_destroy_trigger(stonith_reconnect); stonith_reconnect = NULL; mainloop_destroy_trigger(transition_trigger); transition_trigger = NULL; crm_client_cleanup(); @@ -288,7 +275,6 @@ crmd_exit(crm_exit_t exit_code) free(fsa_cluster_name); fsa_cluster_name = NULL; free(te_uuid); te_uuid = NULL; - free(te_client_id); te_client_id = NULL; free(fsa_pe_ref); fsa_pe_ref = NULL; free(failed_stop_offset); failed_stop_offset = NULL; free(failed_start_offset); failed_start_offset = NULL; @@ -627,15 +613,7 @@ do_started(long long action, crm_err("Failed to create IPC server: shutting down and inhibiting respawn"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } - - // Try connecting to fencer (retrying later in mainloop if failed) - if (stonith_reconnect == NULL) { - stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, - te_connect_stonith, - GINT_TO_POINTER(TRUE)); - } - set_bit(fsa_input_register, R_ST_REQUIRED); - mainloop_set_trigger(stonith_reconnect); + controld_trigger_fencer_connect(); crm_notice("Pacemaker controller successfully started and accepting connections"); clear_bit(fsa_input_register, R_STARTING); diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c index cde57b5..92336e9 100644 
--- a/daemons/controld/controld_fencing.c +++ b/daemons/controld/controld_fencing.c @@ -341,9 +341,9 @@ execute_stonith_cleanup() * Functions that need to interact directly with the fencer via its API */ -stonith_t *stonith_api = NULL; -crm_trigger_t *stonith_reconnect = NULL; -char *te_client_id = NULL; +static stonith_t *stonith_api = NULL; +static crm_trigger_t *stonith_reconnect = NULL; +static char *te_client_id = NULL; static gboolean fail_incompletable_stonith(crm_graph_t *graph) @@ -571,7 +571,7 @@ tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event) * \note If user_data is NULL, this will wait 2s between attempts, for up to * 30 attempts, meaning the controller could be blocked as long as 58s. */ -gboolean +static gboolean te_connect_stonith(gpointer user_data) { int rc = pcmk_ok; @@ -619,6 +619,47 @@ te_connect_stonith(gpointer user_data) return TRUE; } +/*! + \internal + \brief Schedule fencer connection attempt in main loop +*/ +void +controld_trigger_fencer_connect() +{ + if (stonith_reconnect == NULL) { + stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, + te_connect_stonith, + GINT_TO_POINTER(TRUE)); + } + set_bit(fsa_input_register, R_ST_REQUIRED); + mainloop_set_trigger(stonith_reconnect); +} + +void +controld_disconnect_fencer(bool destroy) +{ + if (stonith_api) { + // Prevent fencer connection from coming up again + clear_bit(fsa_input_register, R_ST_REQUIRED); + + stonith_api->cmds->disconnect(stonith_api); + } + if (destroy) { + if (stonith_api) { + stonith_api->cmds->free(stonith_api); + stonith_api = NULL; + } + if (stonith_reconnect) { + mainloop_destroy_trigger(stonith_reconnect); + stonith_reconnect = NULL; + } + if (te_client_id) { + free(te_client_id); + te_client_id = NULL; + } + } +} + static gboolean do_stonith_history_sync(gpointer user_data) { diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h index b80a6c9..3ef537f 100644 --- a/daemons/controld/controld_fencing.h +++ 
b/daemons/controld/controld_fencing.h @@ -13,16 +13,13 @@ #include // bool #include // crm_graph_t, crm_action_t -extern crm_trigger_t *stonith_reconnect; -extern char *te_client_id; -extern stonith_t *stonith_api; - // stonith fail counts void st_fail_count_reset(const char * target); void update_stonith_max_attempts(const char* value); // stonith API client -gboolean te_connect_stonith(gpointer user_data); +void controld_trigger_fencer_connect(void); +void controld_disconnect_fencer(bool destroy); gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action); // stonith cleanup list -- 1.8.3.1