From 0113ff6fb6bb576356d201cf698b98455dbf5180 Mon Sep 17 00:00:00 2001 From: Klaus Wenninger Date: Wed, 21 Dec 2016 18:08:40 +0100 Subject: [PATCH] Fix: pacemaker-remote: pacemaker_remoted shutdown while unmanaged Since introduction of the graceful shutdown of pacemaker_remoted the shutdown is hanging if the remote-resource is unmanaged. This happens as pacemaker_remoted is waiting for all resources running on the remote-node to be shut down and pacemaker on the other hand doesn't touch resources on a remote-node when the remote-resource is unmanaged. Fixes rhbz#1388102 --- crmd/crmd_lrm.h | 2 + crmd/lrm_state.c | 20 ++++--- crmd/messages.c | 5 ++ crmd/remote_lrmd_ra.c | 124 ++++++++++++++++++++++++++++++++++++++++--- crmd/te_actions.c | 28 +++++++++- include/crm/crm.h | 1 + include/crm/lrmd.h | 1 + include/crm/msg_xml.h | 2 + include/crm/pengine/status.h | 1 + include/crm_internal.h | 1 + lib/lrmd/proxy_common.c | 15 ++++++ lib/pengine/unpack.c | 34 ++++++++---- lib/pengine/utils.c | 28 ++++++---- lrmd/ipc_proxy.c | 5 ++ lrmd/lrmd_private.h | 2 + lrmd/main.c | 24 +++++++++ pengine/allocate.c | 3 ++ pengine/graph.c | 68 +++++++++++++++++++++++- pengine/pengine.h | 1 + 19 files changed, 329 insertions(+), 36 deletions(-) diff --git a/crmd/crmd_lrm.h b/crmd/crmd_lrm.h index c6373f1..64d80c4 100644 --- a/crmd/crmd_lrm.h +++ b/crmd/crmd_lrm.h @@ -162,5 +162,7 @@ int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *acti void remote_ra_cleanup(lrm_state_t * lrm_state); void remote_ra_fail(const char *node_name); void remote_ra_process_pseudo(xmlNode *xml); +gboolean remote_ra_is_in_maintenance(lrm_state_t * lrm_state); +void remote_ra_process_maintenance_nodes(xmlNode *xml); gboolean process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending); diff --git a/crmd/lrm_state.c b/crmd/lrm_state.c index 7b4379b..d55755e 100644 --- a/crmd/lrm_state.c +++ b/crmd/lrm_state.c @@ -508,14 +508,22 @@ crmd_remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg) crm_notice("%s requested shutdown of its remote connection", lrm_state->node_name); - now_s = crm_itoa(now); - update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, TRUE); - free(now_s); + if (!remote_ra_is_in_maintenance(lrm_state)) { + now_s = crm_itoa(now); + update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, TRUE); + free(now_s); - remote_proxy_ack_shutdown(lrmd); + remote_proxy_ack_shutdown(lrmd); - crm_warn("Reconnection attempts to %s may result in failures that must be cleared", - lrm_state->node_name); + crm_warn("Reconnection attempts to %s may result in failures that must be cleared", + lrm_state->node_name); + } else { + remote_proxy_nack_shutdown(lrmd); + + crm_notice("Remote resource for %s is not managed so no ordered shutdown happening", + lrm_state->node_name); + } + return; } else if (safe_str_eq(op, LRMD_IPC_OP_REQUEST) && proxy->is_local) { /* this is for the crmd, which we are, so don't try diff --git a/crmd/messages.c b/crmd/messages.c index 87d0acf..c79d96e 100644 --- a/crmd/messages.c +++ b/crmd/messages.c @@ -872,6 +872,11 @@ handle_request(xmlNode * stored_msg, enum crmd_fsa_cause cause) reap_crm_member(id, name); } + } else if (strcmp(op, CRM_OP_MAINTENANCE_NODES) == 0) { + xmlNode *xml = get_message_xml(stored_msg, F_CRM_DATA); + + remote_ra_process_maintenance_nodes(xml); + } else { crm_err("Unexpected request (%s) sent to %s", op, AM_I_DC ? "the DC" : "non-DC node"); crm_log_xml_err(stored_msg, "Unexpected"); diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c index e68d784..8085219 100644 --- a/crmd/remote_lrmd_ra.c +++ b/crmd/remote_lrmd_ra.c @@ -80,6 +80,10 @@ typedef struct remote_ra_data_s { enum remote_migration_status migrate_status; gboolean active; + gboolean is_maintenance; /* kind of complex to determine from crmd-context + * so we have it signalled back with the + * transition from pengine + */ } remote_ra_data_t; static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms); @@ -485,6 +489,28 @@ monitor_timeout_cb(gpointer data) return FALSE; } +static void +synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type) +{ + lrmd_event_data_t op = { 0, }; + + if (lrm_state == NULL) { + /* if lrm_state not given assume local */ + lrm_state = lrm_state_find(fsa_our_uname); + } + CRM_ASSERT(lrm_state != NULL); + + op.type = lrmd_event_exec_complete; + op.rsc_id = rsc_id; + op.op_type = op_type; + op.rc = PCMK_OCF_OK; + op.op_status = PCMK_LRM_OP_DONE; + op.t_run = time(NULL); + op.t_rcchange = op.t_run; + op.call_id = generate_callid(); + process_lrm_event(lrm_state, &op, NULL); +} + void remote_lrm_op_callback(lrmd_event_data_t * op) { @@ -536,9 +562,18 @@ remote_lrm_op_callback(lrmd_event_data_t * op) (ra_data->cur_cmd == NULL) && (ra_data->active == TRUE)) { - crm_err("Unexpected disconnect on remote-node %s", lrm_state->node_name); - ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds); - ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds); + if (!remote_ra_is_in_maintenance(lrm_state)) { + crm_err("Unexpected disconnect on remote-node %s", lrm_state->node_name); + ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds); + ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds); + } else { + crm_notice("Disconnect on unmanaged remote-node %s", lrm_state->node_name); + /* Do roughly what a 'stop' on the remote-resource would do */ + handle_remote_ra_stop(lrm_state, NULL); + remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM); + /* now fake the reply of a successful 'stop' */ + synthesize_lrmd_success(NULL, lrm_state->node_name, "stop"); + } return; } @@ -651,8 +686,6 @@ handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd) ra_data->active = FALSE; lrm_state_disconnect(lrm_state); - cmd->rc = PCMK_OCF_OK; - cmd->op_status = PCMK_LRM_OP_DONE; if (ra_data->cmds) { g_list_free_full(ra_data->cmds, free_cmd); @@ -664,7 +697,12 @@ handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd) ra_data->recurring_cmds = NULL; ra_data->cur_cmd = NULL; - report_remote_ra_result(cmd); + if (cmd) { + cmd->rc = PCMK_OCF_OK; + cmd->op_status = PCMK_LRM_OP_DONE; + + report_remote_ra_result(cmd); + } } static int @@ -1140,3 +1178,77 @@ remote_ra_process_pseudo(xmlNode *xml) } freeXpathObject(search); } + +static void +remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance) +{ + remote_ra_data_t *ra_data = lrm_state->remote_ra_data; + xmlNode *update, *state; + int call_opt, call_id = 0; + crm_node_t *node; + + call_opt = crmd_cib_smart_opt(); + node = crm_remote_peer_get(lrm_state->node_name); + CRM_CHECK(node != NULL, return); + update = create_xml_node(NULL, XML_CIB_TAG_STATUS); + state = create_node_state_update(node, node_update_none, update, + __FUNCTION__); + crm_xml_add(state, XML_NODE_IS_MAINTENANCE, maintenance?"1":"0"); + fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL); + if (call_id < 0) { + crm_perror(LOG_WARNING, "%s CIB node state update failed", lrm_state->node_name); + } else { + /* TODO: still not 100% sure that async update will succeed ... */ + ra_data->is_maintenance = maintenance; + } + free_xml(update); +} + +#define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \ + "[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \ + XML_GRAPH_TAG_MAINTENANCE + +/*! + * \internal + * \brief Check a pseudo-action holding updates for maintenance state + * + * \param[in] xml XML of pseudo-action to check + */ + +void +remote_ra_process_maintenance_nodes(xmlNode *xml) +{ + xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE); + + if (numXpathResults(search) == 1) { + xmlNode *node; + int cnt = 0, cnt_remote = 0; + + for (node = + first_named_child(getXpathResult(search, 0), XML_CIB_TAG_NODE); + node; node = __xml_next(node)) { + lrm_state_t *lrm_state = lrm_state_find(ID(node)); + + cnt++; + if (lrm_state && lrm_state->remote_ra_data && + ((remote_ra_data_t *) lrm_state->remote_ra_data)->active) { + cnt_remote++; + remote_ra_maintenance(lrm_state, + crm_atoi(crm_element_value(node, + XML_NODE_IS_MAINTENANCE), "0")); + + } + } + crm_trace("Action holds %d nodes (%d remotes found) " + "adjusting maintenance-mode", cnt, cnt_remote); + } + freeXpathObject(search); +} + +gboolean +remote_ra_is_in_maintenance(lrm_state_t * lrm_state) +{ + remote_ra_data_t *ra_data = lrm_state->remote_ra_data; + + return ra_data->is_maintenance; +} diff --git a/crmd/te_actions.c b/crmd/te_actions.c index 5508234..c41d44d 100644 --- a/crmd/te_actions.c +++ b/crmd/te_actions.c @@ -53,8 +53,32 @@ te_start_action_timer(crm_graph_t * graph, crm_action_t * action) static gboolean te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo) { - /* Check action for Pacemaker Remote node side effects */ - remote_ra_process_pseudo(pseudo->xml); + const char *task = crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK); + + /* send to peers as well? */ + if (safe_str_eq(task, CRM_OP_MAINTENANCE_NODES)) { + GHashTableIter iter; + crm_node_t *node = NULL; + + g_hash_table_iter_init(&iter, crm_peer_cache); + while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) { + xmlNode *cmd = NULL; + + if (safe_str_eq(fsa_our_uname, node->uname)) { + continue; + } + + cmd = create_request(task, pseudo->xml, node->uname, + CRM_SYSTEM_CRMD, CRM_SYSTEM_TENGINE, NULL); + send_cluster_message(node, crm_msg_crmd, cmd, FALSE); + free_xml(cmd); + } + + remote_ra_process_maintenance_nodes(pseudo->xml); + } else { + /* Check action for Pacemaker Remote node side effects */ + remote_ra_process_pseudo(pseudo->xml); + } crm_debug("Pseudo-action %d (%s) fired and confirmed", pseudo->id, crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK_KEY)); diff --git a/include/crm/crm.h b/include/crm/crm.h index 3f83a91..6afc771 100644 --- a/include/crm/crm.h +++ b/include/crm/crm.h @@ -142,6 +142,7 @@ extern char *crm_system_name; # define CRM_OP_RELAXED_SET "one-or-more" # define CRM_OP_RELAXED_CLONE "clone-one-or-more" # define CRM_OP_RM_NODE_CACHE "rm_node_cache" +# define CRM_OP_MAINTENANCE_NODES "maintenance_nodes" # define CRMD_JOINSTATE_DOWN "down" # define CRMD_JOINSTATE_PENDING "pending" diff --git a/include/crm/lrmd.h b/include/crm/lrmd.h index 446b39c..e4dc61c 100644 --- a/include/crm/lrmd.h +++ b/include/crm/lrmd.h @@ -99,6 +99,7 @@ typedef struct lrmd_key_value_s { #define LRMD_IPC_OP_RESPONSE "response" #define LRMD_IPC_OP_SHUTDOWN_REQ "shutdown_req" #define LRMD_IPC_OP_SHUTDOWN_ACK "shutdown_ack" +#define LRMD_IPC_OP_SHUTDOWN_NACK "shutdown_nack" #define F_LRMD_IPC_OP "lrmd_ipc_op" #define F_LRMD_IPC_IPC_SERVER "lrmd_ipc_server" diff --git a/include/crm/msg_xml.h b/include/crm/msg_xml.h index 4a3cd1e..3a0c0e8 100644 --- a/include/crm/msg_xml.h +++ b/include/crm/msg_xml.h @@ -255,6 +255,7 @@ # define XML_NODE_IS_PEER "crmd" # define XML_NODE_IS_REMOTE "remote_node" # define XML_NODE_IS_FENCED "node_fenced" +# define XML_NODE_IS_MAINTENANCE "node_in_maintenance" # define XML_CIB_ATTR_SHUTDOWN "shutdown" # define XML_CIB_ATTR_STONITH "stonith" @@ -297,6 +298,7 @@ # define XML_GRAPH_TAG_PSEUDO_EVENT "pseudo_event" # define XML_GRAPH_TAG_CRM_EVENT "crm_event" # define XML_GRAPH_TAG_DOWNED "downed" +# define XML_GRAPH_TAG_MAINTENANCE "maintenance" # define XML_TAG_RULE "rule" # define XML_RULE_ATTR_SCORE "score" diff --git a/include/crm/pengine/status.h b/include/crm/pengine/status.h index 79e4572..eb401be 100644 --- a/include/crm/pengine/status.h +++ b/include/crm/pengine/status.h @@ -160,6 +160,7 @@ struct node_shared_s { gboolean rsc_discovery_enabled; gboolean remote_requires_reset; gboolean remote_was_fenced; + gboolean remote_maintenance; /* what the remote-rsc is thinking */ }; struct node_s { diff --git a/include/crm_internal.h b/include/crm_internal.h index a498bcb..297e6b3 100644 --- a/include/crm_internal.h +++ b/include/crm_internal.h @@ -381,6 +381,7 @@ typedef struct remote_proxy_s { int remote_proxy_check(lrmd_t *lrmd, GHashTable *hash); void remote_proxy_cb(lrmd_t *lrmd, const char *node_name, xmlNode *msg); void remote_proxy_ack_shutdown(lrmd_t *lrmd); +void remote_proxy_nack_shutdown(lrmd_t *lrmd); int remote_proxy_dispatch(const char *buffer, ssize_t length, gpointer userdata); void remote_proxy_disconnected(gpointer data); diff --git a/lib/lrmd/proxy_common.c b/lib/lrmd/proxy_common.c index eb17e4e..69cfa8c 100644 --- a/lib/lrmd/proxy_common.c +++ b/lib/lrmd/proxy_common.c @@ -59,6 +59,21 @@ remote_proxy_ack_shutdown(lrmd_t *lrmd) free_xml(msg); } +/*! + * \brief We're not gonna shutdown as response to + * a remote proxy shutdown request. + * + * \param[in] lrmd Connection to proxy + */ +void +remote_proxy_nack_shutdown(lrmd_t *lrmd) +{ + xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY); + crm_xml_add(msg, F_LRMD_IPC_OP, LRMD_IPC_OP_SHUTDOWN_NACK); + lrmd_internal_proxy_send(lrmd, msg); + free_xml(msg); +} + void remote_proxy_relay_event(remote_proxy_t *proxy, xmlNode *msg) { diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index e6a8f58..ed6ee7f 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -89,16 +89,22 @@ pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason) set_bit(node->details->remote_rsc->flags, pe_rsc_failed); } else if (is_baremetal_remote_node(node)) { - if(pe_can_fence(data_set, node)) { - crm_warn("Node %s will be fenced %s", node->details->uname, reason); + resource_t *rsc = node->details->remote_rsc; + + if (rsc && (!is_set(rsc->flags, pe_rsc_managed))) { + crm_notice("Not fencing node %s because connection is unmanaged, " + "otherwise would %s", node->details->uname, reason); } else { - crm_warn("Node %s is unclean %s", node->details->uname, reason); + if (pe_can_fence(data_set, node)) { + crm_warn("Node %s will be fenced %s", node->details->uname, reason); + } else { + crm_warn("Node %s is unclean %s", node->details->uname, reason); + } + node->details->remote_requires_reset = TRUE; } node->details->unclean = TRUE; - node->details->remote_requires_reset = TRUE; - } else if (node->details->unclean == FALSE) { - if(pe_can_fence(data_set, node)) { + if (pe_can_fence(data_set, node)) { crm_warn("Node %s will be fenced %s", node->details->uname, reason); } else { crm_warn("Node %s is unclean %s", node->details->uname, reason); @@ -1163,6 +1169,7 @@ unpack_remote_status(xmlNode * status, pe_working_set_t * data_set) const char *id = NULL; const char *uname = NULL; const char *shutdown = NULL; + resource_t *rsc = NULL; GListPtr gIter = NULL; @@ -1202,6 +1209,10 @@ unpack_remote_status(xmlNode * status, pe_working_set_t * data_set) } crm_trace("Processing remote node id=%s, uname=%s", id, uname); + this_node->details->remote_maintenance = + crm_atoi(crm_element_value(state, XML_NODE_IS_MAINTENANCE), "0"); + + rsc = this_node->details->remote_rsc; if (this_node->details->remote_requires_reset == FALSE) { this_node->details->unclean = FALSE; this_node->details->unseen = FALSE; @@ -1211,11 +1222,11 @@ unpack_remote_status(xmlNode * status, pe_working_set_t * data_set) shutdown = g_hash_table_lookup(this_node->details->attrs, XML_CIB_ATTR_SHUTDOWN); if (shutdown != NULL && safe_str_neq("0", shutdown)) { - resource_t *rsc = this_node->details->remote_rsc; - crm_info("Node %s is shutting down", this_node->details->uname); this_node->details->shutdown = TRUE; - rsc->next_role = RSC_ROLE_STOPPED; + if (rsc) { + rsc->next_role = RSC_ROLE_STOPPED; + } } if (crm_is_true(g_hash_table_lookup(this_node->details->attrs, "standby"))) { @@ -1223,7 +1234,8 @@ unpack_remote_status(xmlNode * status, pe_working_set_t * data_set) this_node->details->standby = TRUE; } - if (crm_is_true(g_hash_table_lookup(this_node->details->attrs, "maintenance"))) { + if (crm_is_true(g_hash_table_lookup(this_node->details->attrs, "maintenance")) || + (rsc && !is_set(rsc->flags, pe_rsc_managed))) { crm_info("Node %s is in maintenance-mode", this_node->details->uname); this_node->details->maintenance = TRUE; } @@ -2825,7 +2837,7 @@ determine_op_status( result = PCMK_LRM_OP_NOTSUPPORTED; break; - } else if(pe_can_fence(data_set, node) == FALSE + } else if (pe_can_fence(data_set, node) == FALSE && safe_str_eq(task, CRMD_ACTION_STOP)) { /* If a stop fails and we can't fence, there's nothing else we can do */ pe_proc_err("No further recovery can be attempted for %s: %s action failed with '%s' (%d)", diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c index 2b53999..0ce5c53 100644 --- a/lib/pengine/utils.c +++ b/lib/pengine/utils.c @@ -824,20 +824,28 @@ unpack_operation(action_t * action, xmlNode * xml_obj, resource_t * container, * 2. start - a start failure indicates that an active connection does not already * exist. The user can set op on-fail=fence if they really want to fence start * failures. */ - } else if (value == NULL && - is_rsc_baremetal_remote_node(action->rsc, data_set) && + } else if (((value == NULL) || !is_set(action->rsc->flags, pe_rsc_managed)) && + (is_rsc_baremetal_remote_node(action->rsc, data_set) && !(safe_str_eq(action->task, CRMD_ACTION_STATUS) && interval == 0) && - (safe_str_neq(action->task, CRMD_ACTION_START))) { + (safe_str_neq(action->task, CRMD_ACTION_START)))) { - if (is_set(data_set->flags, pe_flag_stonith_enabled)) { - value = "fence baremetal remote node (default)"; - } else { - value = "recover baremetal remote node connection (default)"; - } - if (action->rsc->remote_reconnect_interval) { + if (!is_set(action->rsc->flags, pe_rsc_managed)) { + action->on_fail = action_fail_stop; action->fail_role = RSC_ROLE_STOPPED; + value = "stop unmanaged baremetal remote node (enforcing default)"; + + } else { + if (is_set(data_set->flags, pe_flag_stonith_enabled)) { + value = "fence baremetal remote node (default)"; + } else { + value = "recover baremetal remote node connection (default)"; + } + + if (action->rsc->remote_reconnect_interval) { + action->fail_role = RSC_ROLE_STOPPED; + } + action->on_fail = action_fail_reset_remote; } - action->on_fail = action_fail_reset_remote; } else if (value == NULL && safe_str_eq(action->task, CRMD_ACTION_STOP)) { if (is_set(data_set->flags, pe_flag_stonith_enabled)) { diff --git a/lrmd/ipc_proxy.c b/lrmd/ipc_proxy.c index 07c13ab..5d6ab34 100644 --- a/lrmd/ipc_proxy.c +++ b/lrmd/ipc_proxy.c @@ -164,6 +164,11 @@ ipc_proxy_forward_client(crm_client_t *ipc_proxy, xmlNode *xml) return; } + if (safe_str_eq(msg_type, LRMD_IPC_OP_SHUTDOWN_NACK)) { + handle_shutdown_nack(); + return; + } + ipc_client = crm_client_get_by_id(session); if (ipc_client == NULL) { xmlNode *msg = create_xml_node(NULL, T_LRMD_IPC_PROXY); diff --git a/lrmd/lrmd_private.h b/lrmd/lrmd_private.h index 62e9c84..5579b92 100644 --- a/lrmd/lrmd_private.h +++ b/lrmd/lrmd_private.h @@ -85,6 +85,8 @@ void free_rsc(gpointer data); void handle_shutdown_ack(void); +void handle_shutdown_nack(void); + void lrmd_client_destroy(crm_client_t *client); void client_disconnect_cleanup(const char *client_id); diff --git a/lrmd/main.c b/lrmd/main.c index e3d3aaa..a3aa08f 100644 --- a/lrmd/main.c +++ b/lrmd/main.c @@ -364,6 +364,7 @@ void handle_shutdown_ack() crm_info("Received shutdown ack"); if (shutdown_ack_timer > 0) { g_source_remove(shutdown_ack_timer); + shutdown_ack_timer = 0; } return; } @@ -371,6 +372,29 @@ void handle_shutdown_ack() crm_debug("Ignoring unexpected shutdown ack"); } +/*! + * \internal + * \brief Make short exit timer fire immediately + */ +void handle_shutdown_nack() +{ +#ifdef ENABLE_PCMK_REMOTE + if (shutting_down) { + crm_info("Received shutdown nack"); + if (shutdown_ack_timer > 0) { + GSource *timer = + g_main_context_find_source_by_id(NULL, shutdown_ack_timer); + + if (timer != NULL) { + g_source_set_ready_time(timer, 0); + } + } + return; + } +#endif + crm_debug("Ignoring unexpected shutdown nack"); +} + /* *INDENT-OFF* */ static struct crm_option long_options[] = { /* Top-level Options */ diff --git a/pengine/allocate.c b/pengine/allocate.c index 9a87816..7562253 100644 --- a/pengine/allocate.c +++ b/pengine/allocate.c @@ -2150,6 +2150,9 @@ stage8(pe_working_set_t * data_set) crm_log_xml_trace(data_set->graph, "created resource-driven action list"); + /* pseudo action to distribute list of nodes with maintenance state update */ + add_maintenance_update(data_set); + /* catch any non-resource specific actions */ crm_trace("processing non-resource actions"); diff --git a/pengine/graph.c b/pengine/graph.c index 81d8355..5ba60f7 100644 --- a/pengine/graph.c +++ b/pengine/graph.c @@ -788,13 +788,15 @@ get_router_node(action_t *action) * \param[in] id Node UUID to add * \param[in,out] xml Parent XML tag to add to */ -static void +static xmlNode* add_node_to_xml_by_id(const char *id, xmlNode *xml) { xmlNode *node_xml; node_xml = create_xml_node(xml, XML_CIB_TAG_NODE); crm_xml_add(node_xml, XML_ATTR_UUID, id); + + return node_xml; } /*! @@ -812,6 +814,62 @@ add_node_to_xml(const node_t *node, void *xml) /*! * \internal + * \brief Add XML with nodes that need an update of their maintenance state + * + * \param[in,out] xml Parent XML tag to add to + * \param[in] data_set Working set for cluster + */ +static int +add_maintenance_nodes(xmlNode *xml, const pe_working_set_t *data_set) +{ + GListPtr gIter = NULL; + xmlNode *maintenance = + xml?create_xml_node(xml, XML_GRAPH_TAG_MAINTENANCE):NULL; + int count = 0; + + for (gIter = data_set->nodes; gIter != NULL; + gIter = gIter->next) { + node_t *node = (node_t *) gIter->data; + struct node_shared_s *details = node->details; + + if (!(is_remote_node(node))) { + continue; /* just remote nodes need to know atm */ + } + + if (details->maintenance != details->remote_maintenance) { + if (maintenance) { + crm_xml_add( + add_node_to_xml_by_id(node->details->id, maintenance), + XML_NODE_IS_MAINTENANCE, details->maintenance?"1":"0"); + } + count++; + } + } + crm_trace("%s %d nodes to adjust maintenance-mode " + "to transition", maintenance?"Added":"Counted", count); + return count; +} + +/*! + * \internal + * \brief Add pseudo action with nodes needing maintenance state update + * + * \param[in,out] data_set Working set for cluster + */ +void +add_maintenance_update(pe_working_set_t *data_set) +{ + action_t *action = NULL; + + if (add_maintenance_nodes(NULL, data_set)) { + crm_trace("adding maintenance state update pseudo action"); + action = get_pseudo_op(CRM_OP_MAINTENANCE_NODES, data_set); + set_bit(action->flags, pe_action_print_always); + } +} + +/*! + * \internal * \brief Add XML with nodes that an action is expected to bring down * * If a specified action is expected to bring any nodes down, add an XML block @@ -874,6 +932,7 @@ static xmlNode * action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set) { gboolean needs_node_info = TRUE; + gboolean needs_maintenance_info = FALSE; xmlNode *action_xml = NULL; xmlNode *args_xml = NULL; @@ -901,6 +960,9 @@ action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set) /* action_xml = create_xml_node(NULL, XML_GRAPH_TAG_CRM_EVENT); */ } else if (is_set(action->flags, pe_action_pseudo)) { + if (safe_str_eq(action->task, CRM_OP_MAINTENANCE_NODES)) { + needs_maintenance_info = TRUE; + } action_xml = create_xml_node(NULL, XML_GRAPH_TAG_PSEUDO_EVENT); needs_node_info = FALSE; @@ -1082,6 +1144,10 @@ action2xml(action_t * action, gboolean as_input, pe_working_set_t *data_set) add_downed_nodes(action_xml, action, data_set); } + if (needs_maintenance_info) { + add_maintenance_nodes(action_xml, data_set); + } + crm_log_xml_trace(action_xml, "dumped action"); return action_xml; } diff --git a/pengine/pengine.h b/pengine/pengine.h index 5500819..e3f4874 100644 --- a/pengine/pengine.h +++ b/pengine/pengine.h @@ -145,6 +145,7 @@ extern int new_rsc_order(resource_t * lh_rsc, const char *lh_task, new_rsc_order(rsc1, CRMD_ACTION_STOP, rsc2, CRMD_ACTION_STOP, type, data_set) extern void graph_element_from_action(action_t * action, pe_working_set_t * data_set); +extern void add_maintenance_update(pe_working_set_t *data_set); extern gboolean show_scores; extern int scores_log_level; -- 1.8.3.1